How to change bar size plotly time series - python

I'm working on some analysis of a player's games over time for League of Legends. I'm trying to create a histogram using plotly, with the date range on the x axis and the number of games on the y axis. This works, but I can't get an individual bar for each day, only one per month. I've tried using the x-axis bin 'size' setting, but this doesn't change anything, I guess because the x axis is in date form.
So question, in Plotly how do I change the size of the bars on the histogram from a monthly bin size to daily bin size?
Here's an example of the code:
from datetime import date, timedelta
import random
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
from plotly import tools
from plotly.offline import *#plotly.offline.iplot()
init_notebook_mode(connected=True)
############## create date ranges #################
d1 = date(2014, 3, 22)  # start date
d2 = date(2014, 6, 22)  # end date
delta = d2 - d1  # timedelta
# One entry per calendar day from d1 to d2, both endpoints included.
dates = [d1 + timedelta(days=offset) for offset in range(delta.days + 1)]
#################################################
def games_p_day():
    """Show a histogram of games over the module-level ``dates`` range.

    Three histogram traces (total, won and lost games) are built from random
    counts, but only the "total games" trace is put into the figure, which is
    then displayed with ``iplot``. Takes no arguments and returns nothing.
    """
    # NOTE(review): on a date axis plotly interprets ``xbins.size`` in
    # milliseconds (or as "M<n>" for months), so 1 does not mean "one day";
    # a daily bin would be 86400000 -- confirm against the plotly reference.
    sizeo = 1
    trace_total = go.Histogram(
        y=[random.randint(1, 10) for _ in range(1, 100)],
        x=dates,
        name='total games',
        xbins=dict(size=sizeo),
    )
    # The two traces below are created but not (yet) added to the figure.
    trace_wins = go.Histogram(
        y=[random.randint(1, 10) for _ in range(1, 100)],
        x=dates,
        name='won games',
        xbins=dict(size=sizeo),
    )
    trace_losses = go.Histogram(
        y=[random.randint(1, 10) for _ in range(1, 100)],
        x=dates,
        name='lost games',
        xbins=dict(size=sizeo),
    )
    layout = dict(
        title="Wins and losses over time",
        xaxis=dict(
            # quick-zoom buttons: last month, last six months, everything
            rangeselector=dict(
                buttons=[
                    dict(count=1, label='1m', step='month', stepmode='backward'),
                    dict(count=6, label='6m', step='month', stepmode='backward'),
                    dict(step='all'),
                ]
            ),
            rangeslider=dict(),
            type='date',
        ),
        bargap=0.2,
        bargroupgap=0.1,
    )
    data = [trace_total]
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename="Wins and losses over time")


games_p_day()
Any help massively appreciated.
Oh and if you see anything else that could help me (ie. bad code structure) please let me know!

A histogram is the representation of the distribution of numerical data. It seems to me that what you're aiming to do here is an aggregation of data from daily to weekly data. That is, as long as you'd like to have a time dimension and not count, average or any other aggregation function on your x-axis. If this is the case then the key to your challenge lies not in plotly itself but in aggregation and time functions such as resample('W-Mon', on='index').sum(). Here are some examples:
Plot for sampled raw data:
Code for raw data:
import pandas as pd
import numpy as np
import datetime
# data: 100 days of random game counts, seeded so the example is reproducible
np.random.seed(12)
numdays = 100
dates = pd.date_range('1/1/2020', periods=numdays)
games = np.random.randint(low=100, high=200, size=numdays).tolist()
losses = np.random.randint(low=0, high=100, size=numdays).tolist()
# wins are the games that were not lost (the original subtracted the
# not-yet-defined `wins` from `games`, which raises NameError)
wins = (np.array(games) - np.array(losses)).tolist()
df = pd.DataFrame({'games': games,
                   'wins': wins,
                   'losses': losses}, index=dates)
# resample daily data to weekly sums; 'W-MON' is the weekly frequency
# anchored on Monday. reset_index() exposes the unnamed date index as the
# column 'index' that resample(on=...) refers to.
df2 = df.reset_index().resample('W-MON', on='index').sum()
# every derived column is built from df2 itself (the original referenced an
# undefined `df3`)
df2['formatted_date'] = pd.to_datetime(df2.index)
df2['year'] = df2.formatted_date.apply(lambda x: x.year)
df2['week_of_year'] = df2.formatted_date.apply(lambda x: x.weekofyear)
df2['year_week'] = df2['year'].map(str) + '_' + df2['week_of_year'].map(str)
# build and show plotly plot for daily games
import plotly.graph_objects as go  # `go` is used below but was never imported in this snippet

fig = go.Figure(data=[go.Bar(name='games', x=df.index, y=df['games'])])
fig.show()
Plot for weekly aggregated data. Date as index:
Code for weekly aggregated data. Date as index:
# Weekly games as bars, with the weekly dates (df2's index) on the x axis.
weekly_games_by_date = go.Bar(name='games', x=df2.index, y=df2['games'])
fig = go.Figure(data=[weekly_games_by_date])
fig.show()
Plot for weekly aggregated data. Year and week number as index:
Code for weekly aggregated data. Year and week number as index:
# Weekly games as bars, with the "<year>_<week>" labels on the x axis.
weekly_games_by_label = go.Bar(name='games', x=df2['year_week'], y=df2['games'])
fig = go.Figure(data=[weekly_games_by_label])
fig.show()
Plot for weekly aggregated data, split on wins and losses:
Code for weekly aggregated data, split on wins and losses:
import pandas as pd
import numpy as np
import datetime
# data: 100 days of random game counts, seeded so the example is reproducible
np.random.seed(12)
numdays = 100
dates = pd.date_range('1/1/2020', periods=numdays)
games = np.random.randint(low=100, high=200, size=numdays).tolist()
losses = np.random.randint(low=0, high=100, size=numdays).tolist()
# wins = games - losses (the original subtracted the undefined `wins`)
wins = (np.array(games) - np.array(losses)).tolist()
df = pd.DataFrame({'games': games,
                   'wins': wins,
                   'losses': losses}, index=dates)
# resample daily data to weekly sums ('W-MON': weekly, anchored on Monday)
df2 = df.reset_index().resample('W-MON', on='index').sum()
# derived columns come from df2 (the original referenced an undefined `df3`)
df2['formatted_date'] = pd.to_datetime(df2.index)
df2['year'] = df2.formatted_date.apply(lambda x: x.year)
df2['week_of_year'] = df2.formatted_date.apply(lambda x: x.weekofyear)
df2['year_week'] = df2['year'].map(str) + '_' + df2['week_of_year'].map(str)
# build and show grouped weekly bars, one bar per week for wins and for losses
import plotly.graph_objects as go  # `go` is used below but was never imported in this snippet

fig = go.Figure(data=[
    go.Bar(name='victory', x=df2['year_week'], y=df2['wins']),
    go.Bar(name='defeat', x=df2['year_week'], y=df2['losses']),
])
# place the two series side by side within each week
fig.update_layout(barmode='group')
fig.show()

Related

Plot datetime data in 24 hour window on x axis

I have a dataframe with datetime data:
Start_time: eg(2013-09-21 00:14:00) - the timestamp a task has started
End_time: eg(2013-09-22 11:04:00) - the timestamp a task has ended
Time_diff:eg(0 days 06:07:00) - the time the task took.
I want to plot a histogram of the time events start and end, without considering the date (so only the 24 clock).
I have tried to use:
df['Start_time'].dt.time
to just get the time and plot.
However I am then unable afterwards to BIN the timestamps (now objects) in 20 bins.
This is my result so far:
This is what I am trying to get, a plot with 24hours on the x axis, and the binned distribution of start time & end_time for the y
Here is the code
from random import randrange
import datetime
import pandas as pd
import plotly.express as px
# make the EXAMPLE dataset
startDate = datetime.datetime(2013, 9, 20, 13, 0)
start_lst = []
end_lst = []
for _ in range(200):
    # each task starts 0-22 hours and 0-59 minutes after the current base time
    start_time = startDate + datetime.timedelta(hours=randrange(23), minutes=randrange(60))
    # and lasts between 2h00 and 6h59
    end_time = start_time + datetime.timedelta(hours=randrange(2, 7), minutes=randrange(60))
    # advance the base time by 0-3 days for the next task
    startDate = startDate + datetime.timedelta(days=randrange(4))
    start_lst.append(start_time)
    end_lst.append(end_time)
df = pd.DataFrame({'Start_time': start_lst,
                   'End_time': end_lst})
df['Time_diff'] = df['End_time'] - df['Start_time']
# start of code
# First attempt: a plain histogram. Since the date changes from row to row,
# it won't plot over a single 24-hour window.
fig = px.histogram(df, x=['Start_time', 'End_time'], nbins=20)
fig.show()
# Second attempt: remove the date part and keep only the time of day;
# however, now it won't bin properly.
for source_col in ('Start_time', 'End_time'):
    df[f'{source_col}_nodate'] = df[source_col].dt.time
fig = px.histogram(df, x=['Start_time_nodate', 'End_time_nodate'], nbins=20)
fig.show()
If I understand correctly, with your example dataframe, here is one way to do it with Matplotlib:
from matplotlib import pyplot as plt
# Setup: reduce both timestamps to their hour of the day
df["Start_time_nodate"] = df["Start_time"].dt.hour
df["End_time_nodate"] = df["End_time"].dt.hour
fig, ax = plt.subplots(figsize=(8, 4))
# Plot frequencies: number of rows per hour, ordered by hour
for hour_col in ("Start_time_nodate", "End_time_nodate"):
    ax.plot(df[hour_col].value_counts(sort=False).sort_index())
# Style plot
ax.legend(["Start time", "End time"])
hour_ticks = list(range(0, 25))
ax.set_xticks(ticks=hour_ticks)
ax.set_xticklabels(hour_ticks)
plt.xlabel("24 hours")
plt.ylabel("Frequency")
ax.margins(x=0)
In a Jupyter notebook, this code outputs the following image:

Combining multiple figures in one window

I'm playing around with kaggle dataframe to practice using matplotlib.
I was creating bar graphs one by one, but the figures keep accumulating.
When I called plt.show(), about 10 figure windows suddenly showed up.
Is it possible to combine 4 of those figures into 1 window?
These parts all belong to the same segment, "Time Analysis", so I want to combine these 4 figures in 1 window.
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd  # used throughout below but missing from this snippet's imports

dataset = 'accidents_data.csv'
df = pd.read_csv(dataset)
# Time Analysis:
# analyze the time that accidents happen for various patterns and trends
df.Start_Time = pd.to_datetime(df.Start_Time)  # convert the start time column to date time format
df['Hour_of_Accident'] = df.Start_Time.dt.hour  # extract the hour from the time data
hour_accident = df['Hour_of_Accident'].value_counts()
hour_accident_df = hour_accident.to_frame()  # convert the series to a dataframe in order to sort by the index
hour_accident_df.index.names = ['Hours']  # naming the index column
hour_accident_df.sort_index(ascending=True, inplace=True)
print(hour_accident_df)
# Plotting the hour of accidents data in a bar graph
hour_accident_df.plot(kind='bar', figsize=(8, 4), color='blue', title='Hour of Accident')
# plt.show() is deliberately not called here; it is called once at the very end
"""Analyzing the accident frequency per day of the week"""
df['Day_of_the_week'] = df.Start_Time.dt.day_of_week  # 0 = Monday ... 6 = Sunday
day_of_accident = df['Day_of_the_week'].value_counts()
day_of_accident_df = day_of_accident.to_frame()  # convert the series to a dataframe so that we can sort by the index
day_of_accident_df.index.names = ['Day']  # renaming the index column
day_of_accident_df.sort_index(ascending=True, inplace=True)
print(day_of_accident_df)
f, ax = plt.subplots(figsize=(8, 5))
x = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
l = day_of_accident_df.index.values
# Take the single counts column by position: its name is 'Day_of_the_week'
# on older pandas but 'count' from pandas 2.0 on.
y = day_of_accident_df.iloc[:, 0]
plt.bar(l, y, color='green')
plt.title('Day of the week vs total number of accidents')
plt.ylabel("No. of accidents recorded")
ax.set_xticks(l)
# Label each tick with its own weekday, so a day missing from the data
# cannot shift the names or cause a tick/label count mismatch.
ax.set_xticklabels([x[day] for day in l])
# plt.show() is deliberately not called here; it is called once at the very end
"""Analysis for the months"""
# Count accidents per calendar month and order the counts by month number.
df['Month'] = df.Start_Time.dt.month
accident_month = df['Month'].value_counts()
accident_month_df = accident_month.to_frame()  # a frame, so the index can be named and sorted
accident_month_df.index.names = ['Month']
accident_month_df = accident_month_df.sort_index(ascending=True)
print(accident_month_df)
# Bar graph of the monthly counts
accident_month_df.plot(kind='bar', figsize=(8, 5), color='purple', title='Month of Accident')
"""Yearly Analysis"""
# Count accidents per year to check the yearly trend.
df['Year_of_accident'] = df.Start_Time.dt.year
yearly_count = df['Year_of_accident'].value_counts()
yearly_count_df = pd.DataFrame(
    {'Year': yearly_count.index, 'Accidents': yearly_count.values}
).sort_values(by='Year', ascending=True)
print(yearly_count_df)
# Line plot of the yearly counts, then display every figure created so far
yearly_count_df.plot.line(x='Year', color='red', title='Yearly Accident Trend ')
plt.show()

How to plot multiple graphs with Plotly, where each plot is for a different (next) day?

I want to plot machine observation data by days separately,
so changes between Current, Temperature etc. can be seen by hour.
Basically I want one plot for each day. Thing is when I make too many of these Jupyter Notebook can't display each one of them and plotly gives error.
f_day --> first day
n_day --> next day
I think of using sub_plots with a shared y-axis but then I don't know how I can put different dates in x-axis
How can I make these with graph objects and sub_plots ? So therefore using only 1 figure object so plots doesn't crash.
Data looks like this
,ID,IOT_ID,DATE,Voltage,Current,Temperature,Noise,Humidity,Vibration,Open,Close
0,9466,5d36edfe125b874a36c6a210,2020-08-06 09:02:00,228.893,4.17,39.9817,73.1167,33.3133,2.05,T,F
1,9467,5d36edfe125b874a36c6a210,2020-08-06 09:03:00,228.168,4.13167,40.0317,69.65,33.265,2.03333,T,F
2,9468,5d36edfe125b874a36c6a210,2020-08-06 09:04:00,228.535,4.13,40.11,71.7,33.1717,2.08333,T,F
3,9469,5d36edfe125b874a36c6a210,2020-08-06 09:05:00,228.597,4.14,40.1683,71.95,33.0417,2.0666700000000002,T,F
4,9470,5d36edfe125b874a36c6a210,2020-08-06 09:06:00,228.405,4.13333,40.2317,71.2167,32.9933,2.0,T,F
Code with display error is this
f_day = pd.Timestamp('2020-08-06 00:00:00')
# one figure per day, starting at f_day
for _ in range(days_between.days):
    n_day = f_day + pd.Timedelta('1 days')
    # Half-open window [f_day, n_day): a reading stamped exactly at midnight
    # belongs to the following day only, instead of appearing in two plots.
    in_day = (df["DATE"] >= f_day) & (df["DATE"] < n_day) & (df["IOT_ID"] == iot_id)
    fig_df = df[in_day]
    fig_cn = px.scatter(
        fig_df, x="DATE", y="Current", color="Noise", color_continuous_scale="Sunset",
        title=("IoT " + iot_id + " " + str(f_day.date())),
        # same colour range on every figure so days are comparable
        range_color=(min_noise, max_noise)
    )
    f_day = n_day
    fig_cn.show()
updated
The question was with respect to plotly not matplotlib. Same approach works. Clearly axis and titles need some beautification
import pandas as pd
import plotly.subplots
import plotly.express as px
import datetime as dt
import random
# Simulated readings: one row every 15 minutes over four days.
timestamps = pd.date_range(dt.datetime(2020, 9, 1), dt.datetime(2020, 9, 4, 23, 59), freq="15min")
rows = []
for stamp in timestamps:
    rows.append({"DATE": stamp, "IOT_ID": random.randint(1, 5),
                 "Noise": random.uniform(0, 1), "Current": random.uniform(15, 25)})
df = pd.DataFrame(rows)
# get days to plot
days = df["DATE"].dt.floor("D").unique()
# create one subplot row for each day
fig = plotly.subplots.make_subplots(len(days))
iot_id = 3
for i, d in enumerate(days):
    # filter data to this day and this device, then plot it
    mask = (df["DATE"].dt.floor("D") == d) & (df["IOT_ID"] == iot_id)
    splt = px.scatter(df.loc[mask], x="DATE", y="Current", color="Noise", color_continuous_scale="Sunset",
                      title=f"IoT ({iot_id}) Date:{pd.to_datetime(d).strftime('%d %b')}")
    # select_traces() returns a generator; take its first trace and put it
    # on this day's row (subplot rows are 1-based)
    fig.add_trace(next(splt.select_traces()), row=i + 1, col=1)
# show the single combined figure once every day has been added
fig.show()
It's simple - create the axis that you want to plot on first. Then plot. I've simulated your data as you didn't provide in your question.
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import random
# Simulated readings: one row every 15 minutes over four days.
sample_times = pd.date_range(dt.datetime(2020, 9, 1), dt.datetime(2020, 9, 4, 23, 59), freq="15min")
records = []
for moment in sample_times:
    records.append({"DATE": moment, "IOT_ID": random.randint(1, 5),
                    "Noise": random.uniform(0, 1), "Current": random.uniform(15, 25)})
df = pd.DataFrame(records)
# get days to plot
days = df["DATE"].dt.floor("D").unique()
# create one axis for each day; y is shared so days are directly comparable,
# x is not shared because every axis shows a different date
fig, ax = plt.subplots(len(days), figsize=[20, 10],
                       sharey=True, sharex=False, gridspec_kw={"hspace": 0.4})
iot_id = 3
for i, d in enumerate(days):
    # filter data to this day and this device, then plot it on the day's axis
    day_mask = (df["DATE"].dt.floor("D") == d) & (df["IOT_ID"] == iot_id)
    df.loc[day_mask].plot(kind="scatter", ax=ax[i], x="DATE", y="Current", c="Noise",
                          colormap="turbo", title=f"IoT ({iot_id}) Date:{pd.to_datetime(d).strftime('%d %b')}")
    ax[i].set_xlabel("")  # it's in the titles...
output

how to analyze time-series data as a function of the time of day in pandas

Suppose I have a random sample of data collected every 1 minute for a month. Then suppose I want to use pandas to analyze this data as a function of the time of day, and see the differences between a weekend and weekday. I can do this in pandas if my index is a DateTimeIndex by calculating the time of day as a 0-1 decimal value, manually binning the results in intervals of 10 minutes (or whatever) and then plotting the results using the bins column to actually calculate averages over the time intervals of the day, and then manually setting my tick positions and labels into something understandable.
However, this feels a little bit hacky and I am wondering if there are built-in pandas functions to achieve this same kind of analysis. I haven't been able to find them so far.
# One random value per minute for a month, indexed by timestamp.
dates = pd.date_range(start='2018-10-01', end='2018-11-01', freq='min')
vals = np.random.rand(len(dates))
df = pd.DataFrame(data={'dates': dates, 'vals': vals}).set_index('dates')
# time of day as a fraction of 24 hours, from the index's hour and minute
hours_into_day = df.index.hour + df.index.minute / 60
df['day_fraction'] = hours_into_day / 24
# round the fraction down to the start of its 10-minute interval
bin_width = 1 / 24 / 6
df['day_bins'] = df['day_fraction'] - df['day_fraction'] % bin_width
# Raw points first, then the per-bin means on the same axes.
ax = df.plot('day_fraction', 'vals', marker='o', color='pink', alpha=0.05, label='')
is_weekday = df.index.weekday < 5
df.groupby('day_bins')['vals'].mean().plot(ax=ax, label='average')
df[is_weekday].groupby('day_bins')['vals'].mean().plot(ax=ax, label='weekday average')
df[~is_weekday].groupby('day_bins')['vals'].mean().plot(ax=ax, label='weekend average')
# a tick every two hours, labelled on a 12-hour clock (0 is shown as 12)
even_hours = range(0, 25, 2)
xlabels = [(hour % 12) or 12 for hour in even_hours]
xticks = [hour / 24 for hour in even_hours]
ax.set_xticks(xticks)
ax.set_xticklabels(xlabels)
ax.set_xlabel('time of day')
ax.legend()
I think you just need to use groupby with a lot of the built in .dt accessors. Group based on weekday or weekend and then form bins every 10 minutes (with .floor) and calculate the mean.
Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# One random value per minute for a month, indexed by timestamp.
dates = pd.date_range(start='2018-10-01', end='2018-11-01', freq='min')
vals = np.random.rand(len(dates))
df = pd.DataFrame(data={'dates': dates, 'vals': vals}).set_index('dates')
Plot
# Group by weekday/weekend and by the 10-minute slot of the day, then average.
day_type = np.where(df.index.weekday < 5, 'weekday', 'weekend')
ten_min_slot = df.index.floor('10min').time
df1 = df.groupby([day_type, ten_min_slot]).mean().rename(columns={'vals': 'average'})
fig, ax = plt.subplots(figsize=(12, 7))
# one line per day type
df1.unstack(0).plot(ax=ax)
# Plot Full Average
overall_average = df.groupby(ten_min_slot).mean().rename(columns={'vals': 'average'})
overall_average.plot(ax=ax)
plt.show()

How can I reduce the frequency of x-axis ticks in Python when plotting multiple groups of values on one axis?

This produces a graph of all these stock prices plotted against the date. If you zoom in, all the tiny ticks have labels for the dates. I wanted to reduce the frequency of ticks so that it only displayed tick marks at the month and year. I have tried using locators and formatters, but whenever I add them all of the ticks and tick labels completely disappear. All that's left at the x-axis is the x-axis label.
Does any of the issue lie within the fact that I extract the date and use that for the x-axis plot for every new batch of stock prices I want to plot? Any advice would be appreciated. I am a beginner programmer.
from iexfinance import get_historical_data
import pandas as pd
import matplotlib.pyplot as plt
def tester():
    """Fetch, plot and return open/close prices for a fixed set of symbols.

    For each hard-coded symbol the historical data is requested through
    ``get_historical_data``, reduced to a ``date``/``open``/``close`` frame
    and handed to ``plot_data``; the figure is shown once all symbols have
    been drawn.

    Returns:
        The frame built for the last symbol in the list.
    """
    start_date = '20170828'
    end_date = '20180828'
    symbols = ['GOOG', 'IBM', 'CRON']
    for symbol in symbols:
        f_temp = get_historical_data(symbol, start_date, end_date, output_format='pandas')
        df_close = pd.DataFrame(f_temp['close'])
        df_open = pd.DataFrame(f_temp['open'])
        # dates as "YYYYMMDD" strings, used as the merge key
        # (this assignment was split across two lines, a syntax error)
        df_date_string = pd.to_datetime(f_temp.index).strftime("%Y%m%d").astype(str)
        df = pd.merge(df_open, df_close, on=df_date_string)
        df.columns = ['date', 'open', 'close']
        plt.legend(symbols)
        plot_data(df)
    plt.show()
    return df
def normalize_data(df):
    """Scale *df* so that its first row (first element of a Series) becomes 1.

    Args:
        df: a pandas Series or DataFrame with at least one row.

    Returns:
        An object of the same shape, each value divided by the corresponding
        value in the first row.
    """
    # ``.ix`` was removed from pandas; positional ``.iloc[0]`` selects the
    # first row and works for both a Series and a DataFrame.
    return df / df.iloc[0]
def plot_data(df):
    """Plot the normalized ``close`` column of *df* against its ``date`` column.

    Args:
        df: a frame with ``date`` and ``close`` columns.

    The line is added to the current matplotlib figure; nothing is returned
    and the figure is not shown here.
    """
    normalized = normalize_data(df['close'])
    plt.plot(df['date'], normalized)
    plt.title("Normalized close stock prices")
    plt.xlabel("Dates")
    plt.ylabel("Close prices")
    plt.tight_layout()
# Run the example only when the file is executed as a script.
if __name__ == "__main__":
    df = tester()

Categories