I have a dataframe with datetime data:
Start_time: eg(2013-09-21 00:14:00) - the timestamp a task has started
End_time: eg(2013-09-22 11:04:00) - the timestamp a task has ended
Time_diff:eg(0 days 06:07:00) - the time the task took.
I want to plot a histogram of the time events start and end, without considering the date (so only the 24 clock).
I have tried to use:
df['Start_time'].dt.time
to just get the time and plot.
However I am then unable afterwards to BIN the timestamps (now objects) in 20 bins.
This is my result so far:
This is what I am trying to get, a plot with 24hours on the x axis, and the binned distribution of start time & end_time for the y
Here is the code
from random import randrange
import datetime
import pandas as pd
import plotly.express as px
# make the EXAMPLE dataset
startDate = datetime.datetime(2013, 9, 20,13,00)
start_lst = []
end_lst = []
for i in range(200):
start_time= startDate + datetime.timedelta(hours=randrange(23), minutes= randrange(60))
end_time = start_time + datetime.timedelta(hours=randrange(2,7), minutes= randrange(60))
startDate = startDate + datetime.timedelta(days=randrange(4))
start_lst.append(start_time)
end_lst.append(end_time)
df = pd.DataFrame({'Start_time': start_lst,
'End_time': end_lst
})
df['Time_diff'] = df['End_time']-df['Start_time']
#start of code
#tried just using histogram, but sicne the date changes, it wont plot over 24hours
fig = px.histogram(df, x=['Start_time', 'End_time'], nbins=20)
fig.show()
#so tried removing the date part, and just leaving time, however now it wont properly bin
df['Start_time_nodate'] = df['Start_time'].dt.time
df['End_time_nodate'] = df['End_time'].dt.time
fig = px.histogram(df, x=['Start_time_nodate', 'End_time_nodate'], nbins=20)
fig.show()
If I understand correctly, with your example dataframe, here is one way to do it with Matplotlib:
from matplotlib import pyplot as plt
# Setup
df["Start_time_nodate"] = df["Start_time"].dt.hour
df["End_time_nodate"] = df["End_time"].dt.hour
fig, ax = plt.subplots(figsize=(8, 4))
# Plot frequencies
ax.plot(df["Start_time_nodate"].value_counts(sort=False).sort_index())
ax.plot(df["End_time_nodate"].value_counts(sort=False).sort_index())
# Style plot
ax.legend(["Start time", "End time"])
ax.set_xticks(ticks=[i for i in range(0, 25)])
ax.set_xticklabels([i for i in range(0, 25)])
plt.xlabel("24 hours")
plt.ylabel("Frequency")
ax.margins(x=0)
In a Jupyter notebook, this code outputs the following image:
Related
I would like to plot the graph for 30 minute interval I have data of 1 min interval so plotting it directly results in contraction of x-axis and I have also reviwed a similar question for datetime nut there the date interval was constant in my case the date changes everytime as it is real time stock market graph
Following is the small part of my code
import yfinance as yf
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.animation
class real_time_graph:
def __init__(self):
self.stock_ticker = "RELIANCE.BO"
today=dt.datetime.utcnow() + dt.timedelta(hours=5, minutes=30)
self.todays_data=yf.download(self.stock_ticker,start=today.strftime("%Y-%m-%d"),interval="1m")
self.todays_data=pd.DataFrame(self.todays_data)
del self.todays_data['Open']
del self.todays_data['High']
del self.todays_data['Low']
del self.todays_data['Adj Close']
del self.todays_data['Volume']
self.todays_data.reset_index(inplace=True)
self.todays_data['Datetime'] = self.todays_data['Datetime'].apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S")).astype(str)
self.todays_data['Datetime'] = pd.DataFrame(map(lambda x: dt.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"),self.todays_data['Datetime']))
self.todays_data['Datetime']=pd.to_datetime(self.todays_data['Datetime'],format="%Y-%m-%d %H:%M:%S")
self.todays_data=self.todays_data.set_index('Datetime')
def current_price(self):
current_info=yf.Ticker(self.stock_ticker).info
today=dt.datetime.utcnow() + dt.timedelta(hours=5, minutes=30)
cp_dict={'Datetime':[],'Close': []}
cp_dict['Datetime'].append(dt.datetime.strptime(today.strftime("%Y-%m-%d %H:%M:%S"),"%Y-%m-%d %H:%M:%S"))
cp_dict['Close'].append(round(current_info.get('currentPrice'),2))
return cp_dict
def animate_real_tm_g(self,i):
CP_dict=self.current_price()
current_df=pd.DataFrame.from_dict(CP_dict)
current_df['Datetime']=pd.to_datetime(current_df['Datetime'],format="%Y-%m-%d %H:%M:%S")
current_df=current_df.set_index('Datetime')
self.todays_data = pd.concat([self.todays_data,current_df])
plt.clf()
plt.plot(self.todays_data['Close'], label = 'RELIANCE.BO', linewidth = 2, alpha = 0.3)
def plot_graph(self):
fig1 = plt.figure()
g_anim=matplotlib.animation.FuncAnimation(fig1, self.animate_real_tm_g,interval = 1000)
plt.show()
plt.close(fig1)
if __name__=='__main__':
obj1 = real_time_graph()
today = dt.datetime.utcnow() + dt.timedelta(hours=5, minutes=30)
obj1.plot_graph()
obj1.todays_data.to_csv(f'{today.strftime("%d_%m_%Y")}Data{"RELIANCE.BO"}.csv',index=True)
Above code I have written for Indian stock of Reliance companny so it will run only on stock market working days based on Indian Time
For a reference I have shared the csv file too
01_06_2022DataRELIANCE.BO.csv
Thank you in advance for your solutions
I want to plot date vs time graph using matplot lib. The issue I am facing is that due to access of data many lines are showing on the xaxis and I can't find a way to plot my time on xaxis cleanly with one hour gap. Say i have data in my list as string as ['6:01','6:30','7:20','7:25']. I want to divide my xaxis from 6:00 to 7:00 and the time points between them should be plotted based on time.
Note: time list is just and example I want to do this for whole 24 hour.
I tried to use ticks and many other options to complete my task but unfortunatly I am stuck at this problem. My data is in csv file.
Below is my code:
def arrivalGraph():
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from matplotlib import dates as mpl_dates
with open("Timetable2021.csv","r") as f:
fileData = f.readlines()
del fileData[0]
date = []
train1 = []
for data in fileData:
ind = data.split(",")
date.append(datetime.strptime(ind[0],"%d/%m/%Y").date())
train1Time = datetime.strptime(ind[1],"%H:%M").time()
train1.append(train1Time.strftime("%H:%M"))
plt.style.use("seaborn")
plt.figure(figsize = (10,10))
plt.plot_date(train1,date)
plt.gcf().autofmt_xdate()#gcf is get current figure - autofmt is auto format
dateformater = mpl_dates.DateFormatter("%b ,%d %Y")
plt.gca().xaxis.set_major_formatter(dateformater) # to format the xaxis
plt.xlabel("Date")
plt.ylabel("Time")
plt.title("Train Time vs Date Schedule")
plt.tight_layout()
plt.show()
When i run the code i get the following output:
output of above code
Assuming that every single minute that every single minute is present in train1 (i.e. train1 = ["00:00", "00:01", "00:02", "00:03", ... , "23:59"]), you can use plt.xticks() by generating an array representing xticks with empty string on every minute which is not 0.
unique_times = sorted(set(train1))
xticks = ['' if time[-2:]!='00' else time for time in unique_times]
plt.style.use("seaborn")
plt.figure(figsize = (10,10))
plt.plot_date(train1,date)
plt.gcf().autofmt_xdate()#gcf is get current figure - autofmt is auto format
dateformater = mpl_dates.DateFormatter("%b ,%d %Y")
# I think you wanted to format the yaxis instead of xaxis
plt.gca().yaxis.set_major_formatter(dateformater) # to format the yaxis
plt.ylabel("Date")
plt.xlabel("Time")
plt.title("Train Time vs Date Schedule")
plt.xticks(range(len(xticks)), xticks)
plt.tight_layout()
plt.show()
If every single minute is not in the train1 array, you have to keep train1 data as an object and generate arrays representing xticks location and values to be used as plt.xticks() parameters.
date = []
train1 = []
for data in fileData:
ind = data.split(",")
date.append(datetime.strptime(ind[0],"%d/%m/%Y").date())
train1Time = datetime.strptime(ind[1],"%H:%M")
train1.append(train1Time)
plt.style.use("seaborn")
plt.figure(figsize = (10,10))
plt.plot_date(train1,date)
plt.gcf().autofmt_xdate()#gcf is get current figure - autofmt is auto format
dateformater = mpl_dates.DateFormatter("%b ,%d %Y")
# I think you wanted to format the y axis instead of xaxis
plt.gca().yaxis.set_major_formatter(dateformater) # to format the yaxis
plt.ylabel("Date")
plt.xlabel("Time")
plt.title("Train Time vs Date Schedule")
ax = plt.gca()
xticks_val = []
xticks_loc = []
distance = (ax.get_xticks()[-1] - ax.get_xticks()[0]) / 24
def to_hour_str(x):
x = str(x)
if len(x) < 2:
x = '0' + x
return x + ':00'
for h in range(25):
xticks_val.append(to_hour_str(h))
xticks_loc.append(ax.get_xticks()[0] + h * distance)
plt.xticks(xticks_loc, xticks_val, rotation=90, ha='left')
plt.tight_layout()
plt.show()
Here's the code output using dummy data I generated myself.
I want to plot machine observation data by days separately,
so changes between Current, Temperature etc. can be seen by hour.
Basically I want one plot for each day. Thing is when I make too many of these Jupyter Notebook can't display each one of them and plotly gives error.
f_day --> first day
n_day --> next day
I think of using sub_plots with a shared y-axis but then I don't know how I can put different dates in x-axis
How can I make these with graph objects and sub_plots ? So therefore using only 1 figure object so plots doesn't crash.
Data looks like this
,ID,IOT_ID,DATE,Voltage,Current,Temperature,Noise,Humidity,Vibration,Open,Close
0,9466,5d36edfe125b874a36c6a210,2020-08-06 09:02:00,228.893,4.17,39.9817,73.1167,33.3133,2.05,T,F
1,9467,5d36edfe125b874a36c6a210,2020-08-06 09:03:00,228.168,4.13167,40.0317,69.65,33.265,2.03333,T,F
2,9468,5d36edfe125b874a36c6a210,2020-08-06 09:04:00,228.535,4.13,40.11,71.7,33.1717,2.08333,T,F
3,9469,5d36edfe125b874a36c6a210,2020-08-06 09:05:00,228.597,4.14,40.1683,71.95,33.0417,2.0666700000000002,T,F
4,9470,5d36edfe125b874a36c6a210,2020-08-06 09:06:00,228.405,4.13333,40.2317,71.2167,32.9933,2.0,T,F
Code with display error is this
f_day = pd.Timestamp('2020-08-06 00:00:00')
for day in range(days_between.days):
n_day = f_day + pd.Timedelta('1 days')
fig_df = df[(df["DATE"] >= f_day) & (df["DATE"] <= n_day) & (df["IOT_ID"] == iot_id)]
fig_cn = px.scatter(
fig_df, x="DATE", y="Current", color="Noise", color_continuous_scale= "Sunset",
title= ("IoT " + iot_id + " " + str(f_day.date())),
range_color= (min_noise,max_noise)
)
f_day = n_day
fig_cn.show()
updated
The question was with respect to plotly not matplotlib. Same approach works. Clearly axis and titles need some beautification
import pandas as pd
import plotly.subplots
import plotly.express as px
import datetime as dt
import random
df = pd.DataFrame([{"DATE":d, "IOT_ID":random.randint(1,5), "Noise":random.uniform(0,1), "Current":random.uniform(15,25)}
for d in pd.date_range(dt.datetime(2020,9,1), dt.datetime(2020,9,4,23,59), freq="15min")])
# get days to plot
days = df["DATE"].dt.floor("D").unique()
# create axis for each day
fig = plotly.subplots.make_subplots(len(days))
iot_id=3
for i,d in enumerate(days):
# filter data and plot ....
mask = (df["DATE"].dt.floor("D")==d)&(df["IOT_ID"]==iot_id)
splt = px.scatter(df.loc[mask], x="DATE", y="Current", color="Noise", color_continuous_scale= "Sunset",
title= f"IoT ({iot_id}) Date:{pd.to_datetime(d).strftime('%d %b')}")
# select_traces() returns a generator so turn it into a list and take first one
fig.add_trace(list(splt.select_traces())[0], row=i+1, col=1)
fig.show()
It's simple - create the axis that you want to plot on first. Then plot. I've simulated your data as you didn't provide in your question.
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import random
df = pd.DataFrame([{"DATE":d, "IOT_ID":random.randint(1,5), "Noise":random.uniform(0,1), "Current":random.uniform(15,25)}
for d in pd.date_range(dt.datetime(2020,9,1), dt.datetime(2020,9,4,23,59), freq="15min")])
# get days to plot
days = df["DATE"].dt.floor("D").unique()
# create axis for each day
fig, ax = plt.subplots(len(days), figsize=[20,10],
sharey=True, sharex=False, gridspec_kw={"hspace":0.4})
iot_id=3
for i,d in enumerate(days):
# filter data and plot ....
df.loc[(df["DATE"].dt.floor("D")==d)&(df["IOT_ID"]==iot_id),].plot(kind="scatter", ax=ax[i], x="DATE", y="Current", c="Noise",
colormap= "turbo", title=f"IoT ({iot_id}) Date:{pd.to_datetime(d).strftime('%d %b')}")
ax[i].set_xlabel("") # it's in the titles...
output
Im using the following code:
import matplotlib.pyplot as pyplot
import pandas as pandas
from datetime import datetime
dataset = pandas.read_csv("HugLog_17.01.11.csv", sep=",", header=0)
print('filter data for SrcAddr')
dataset_filtered = dataset[dataset['SrcAddr']=='0x1FD3']
print('get Values')
varY = dataset_filtered.Battery_Millivolt.values
varX = dataset_filtered.Timestamp.values
print('Convert the date-strings in date-objects.')
dates_list = [datetime.strptime(date, '%y-%m-%d %H:%M:%S') for date in varX]
fig = pyplot.figure()
ax1 = fig.add_subplot(1,1,1)
ax1.set_xlabel('Time')
ax1.set_ylabel('Millivolt')
ax1.bar(dates_list, varY)
pyplot.locator_params(axis='x',nbins=10)
pyplot.show()
The problem i have is, its a large datacollection with 180k datapoints.
And pyplot displays all points an the graph which makes it slow and the bars overlap. Is there a way to set a maximum-limit on how much datapoints a displayed at a "view".
What i mean by that is, that as soon as the graph is render ther are only 50 datapoints and when i zoomm in i only get a maximum of 50 datapoints again.
Resampling can be done with the resample function from pandas.
Note that the resample syntax has changed between version 0.17 and 0.19 of pandas. The example below uses the old style. See e.g. this tutorial for the new style.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# generate some data for every second over a whole day
times = pd.date_range(start='2017-01-11',periods=86400, freq='1S')
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
np.random.rand(len(df.index))*100
# resample the data, taking the mean over 1 hours ("H")
t = "H" # for hours, try "T" for minutes as well
width=1./24 #matplotlib default uses a width of 1 day per bar
# try width=1./(24*60) for minutes
df_resampled = pd.DataFrame()
df_resampled['data'] = df.data.resample(t, how="mean")
fig, ax = plt.subplots()
#ax.bar(df.index, df['data'], width=1./(24*60*60)) # original data, takes too long to plot
ax.bar(df_resampled.index, df_resampled['data'], width=width)
ax.xaxis_date()
plt.show()
Automatic adaption of the resampling when zooming would indeed require some manual work. There is a resampling example on the matplotlib event handling page, which does not work out of the box but could be adapted accordingly.
This is how it would look like:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates
class Sampler():
def __init__(self,df):
self.df = df
def resample(self, limits):
print limits
dt = limits[1] - limits[0]
if (type(dt) != pd.tslib.Timedelta) and (type(dt) != datetime.timedelta):
dt = datetime.timedelta(days=dt)
print dt
#see #http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
if dt > datetime.timedelta(hours=5):
t = "H"; width=1./24
elif dt > datetime.timedelta(minutes=60):
t = "15T"; width=15./(24.*60)
elif dt > datetime.timedelta(minutes=5):
t = "T"; width=1./(24.*60)
elif dt > datetime.timedelta(seconds=60):
t = "15S"; width=15./(24.*60*60)
else:
#dt < datetime.timedelta(seconds=60):
t = "S"; width=1./(24.*60*60)
self.resampled = pd.DataFrame()
self.resampled['data'] = self.df.data.resample(t, how="mean")
print t, len(self.resampled['data'])
print "indextype", type(self.resampled.index[0])
print "limitstype", type(limits[1])
if type(limits[1]) == float or type(limits[1]) == np.float64 :
dlowlimit = matplotlib.dates.num2date(limits[0])
duplimit = matplotlib.dates.num2date(limits[1])
print type(duplimit), duplimit
self.resampled = self.resampled.loc[self.resampled.index <= duplimit]
self.resampled = self.resampled.loc[self.resampled.index >= dlowlimit]
else:
self.resampled = self.resampled.loc[self.resampled.index <= limits[1]]
self.resampled = self.resampled.loc[self.resampled.index >= limits[0]]
return self.resampled.index,self.resampled['data'],width
def update(self, ax):
print "update"
lims = ax.viewLim
start, stop = lims.intervalx
ax.clear()
x,y,width = self.resample([start, stop])
ax.bar(x,y, width=width)
ax.set_xlim([start, stop])
ax.callbacks.connect('xlim_changed', self.update)
ax.figure.canvas.draw()
times = pd.date_range(start='2017-01-11',periods=86400, freq='1S')
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
np.random.rand(len(df.index))*500
sampler = Sampler(df)
x,y,width = sampler.resample( [df.index[0],df.index[-1] ] )
fig, ax = plt.subplots()
ax.bar(x,y, width=width)
ax.xaxis_date()
# connect to limits changes
ax.callbacks.connect('xlim_changed', sampler.update)
plt.show()
One thing you can do is plot a random subset of the data by using the sample method on your pandas DataFrame. Use the frac argument to determine the fraction of points you want to use. It ranges from 0 to 1.
After you get your dataset_filtered DataFrame, take a sample of it like this
dataset_filtered_sample = dataset_filtered.sample(frac=.001)
How can I use a date from a Sqlite database on the x-axis to make a bar graph with matplotlib?
If I convert the date to unix timestamp the graph works, but I would like to get something like this: http://i.stack.imgur.com/ouKBy.png
lowestNumber = self.c.execute('SELECT number,date, time FROM testDB ORDER BY number ASC LIMIT 1')
for rows in lowestNumber:
datesLow = rows[1]#returns 2016-02-23
splitDate = datesLow.split('-' )
spaces = ""
# tabs = '/'
# tabsDatesLow = tabs.join( splitDate )
joinDatesLow = spaces.join( splitDate )
x = int(joinDatesLow)
plt.bar(x,low, label="Minimum number of players", color="red")
plt.show()
You need to have an integer time format for plotting dates in matplotlib, and then a date formatting object is passed to format the axes. Matplotlib's date2num function can do this for you. Another good example is Matplotlib's documentation with an example here: http://matplotlib.org/examples/pylab_examples/date_demo1.html. Here is a solution yo may find useful:
import datetime
import matplotlib.pyplot as plt
from matplotlib.dates import AutoDateLocator, AutoDateFormatter, date2num
#make my own data:
date = '2016-02-23'
low = 10
#how to format dates:
date_datetime = datetime.datetime.strptime(date, '%Y-%m-%d')
int_date = date2num( date_datetime)
#create plots:
fig, ax = plt.subplots()
#plot data:
ax.bar(int_date,low, label="Minimum number of players", color="red")
#format date strings on xaxis:
locator = AutoDateLocator()
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter( AutoDateFormatter(locator) )
#adjust x limits and apply autoformatter fordisplay of dates
min_date = date2num( datetime.datetime.strptime('2016-02-16', '%Y-%m-%d') )
max_date = date2num( datetime.datetime.strptime('2016-02-28', '%Y-%m-%d') )
ax.set_xlim([min_date, max_date])
fig.autofmt_xdate()
#show plot:
plt.show()