I need to plot data that starts and ends at a certain time; in addition, I need to exclude a weekend period that falls within that time range.
How can I create a time_mask of my data that has two rules?
I already created a code for the "Start" and "End" period, but I am not able to add the rule for excluding the "Weekend period".
#create a time_mask
# Window boundaries, given as strings.
start_date = '2022-06-30 15:26:00'
end_date = '2022-07-11 15:30:00'
# NOTE(review): the names look swapped -- `weekend_end` holds the earlier
# timestamp and `weekend_start` the later one; confirm which is intended.
weekend_end = '2022-07-08 14:30:00'
weekend_start = '2022-07-11 09:50:00'
# Keep rows with start_date < Time <= end_date.
time_mask = (df['Time'] > start_date) & (df['Time'] <= end_date)
# use only this part of the dataframe as training data
# NOTE(review): the mask is built from `df` but applied to `df1` -- verify
# that both frames share the same index.
df1_train = df1.loc[time_mask]
I tried to exclude the "Weekend period" with the code below, but this is not working...
# NOTE: Python's `or` is not element-wise, so it cannot combine two boolean
# Series; the element-wise operator is `|`.
time_mask = ((df['Time'] > start_date) & (df['Time'] <= end_date) & ((df['Time'] < weekend_start) or (df['Time'] > weekend_end)))
I already solved the problem for one part. But now in my plot the period is not excluded:
Plot
Plot in operating hours
UPDATE 22-08-22
#%% Plot data
# Plain date axis: matplotlib keeps the time axis continuous, so any period
# without rows still takes up horizontal space.
fig, ax = plt.subplots()
ax.plot(df['Time'], df1[Temp])
date_fmt = matplotlib.dates.DateFormatter('%Y-%m-%d %H:%M:%S')
ax.xaxis.set_major_formatter(date_fmt)
fig.autofmt_xdate()
plt.show()

#%% Plot the data without empty values
# Plot against the row number instead of the timestamp and translate the
# row number back into a timestamp only when a tick label is drawn.
N = len(df['Time'])
ind = np.arange(N)


def format_date(x, pos=None):
    """Return the timestamp label of the row nearest to tick position x."""
    row = np.clip(int(x + 0.5), 0, N - 1)
    stamp = df['Time'][row]
    return stamp.strftime('%Y-%m-%d %H:%M:%S')


fig, ax = plt.subplots()
ax.plot(ind, df[Temp])
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
ax.set_title("Without empty values")
fig.autofmt_xdate()
plt.show()
Update 22-08-22
use '|' instead of or.
Also, I think you confused weekend_end with weekend_start: the value assigned to weekend_start is the later date, while the value assigned to weekend_end is the earlier one.
After filtering by condition:
(df['Time'] > start_date) & (df['Time'] <= end_date)
the data is again filtered by time greater than weekend_start:
(df['Time'] > weekend_start)
or time less than weekend_end:
(df['Time'] < weekend_end)
that is, the period from 2022-07-08 14:30:00 to 2022-07-11 09:50:00 is excluded.
Now about drawing. The fact is that the axis with dates and times is continuous. Even if there is no data in a certain period. On the left is a picture that does not remove this gap, on the right, the 'format_date' function is used to exclude this gap.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates
import matplotlib.ticker as ticker
df = pd.read_csv('Data.csv', sep=',', header=0)

start_date = '2022-06-30 15:26:00'
end_date = '2022-07-11 15:30:00'
# The names are kept from the question: `weekend_end` is the EARLIER bound of
# the excluded period and `weekend_start` the LATER one.
weekend_end = '2022-07-08 14:30:00'
weekend_start = '2022-07-11 09:50:00'

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
stamps = df['Timestamp']
# Rule 1: start_date < t <= end_date.
in_window = (stamps > start_date) & (stamps <= end_date)
# Rule 2: t lies after the later bound or before the earlier bound.
outside_gap = (stamps > weekend_start) | (stamps < weekend_end)
time_mask = in_window & outside_gap

df1 = df[time_mask].copy().set_index('Timestamp')

# Row positions used as x-values for the gap-free panel.
N = len(df1['Data'])
ind = np.arange(N)


def format_date(x, pos=None):
    """Return the timestamp label of the df1 row nearest to tick position x."""
    row = np.clip(int(x + 0.5), 0, N - 1)
    return df1.index[row].strftime('%Y-%m-%d %H:%M:%S')


fig, axes = plt.subplots(ncols=2)

# Left panel: real datetime axis, so the excluded period shows as a gap.
left = axes[0]
left.plot(df1.index, df1['Data'])
left.xaxis.set_major_formatter(matplotlib.dates.DateFormatter('%Y-%m-%d %H:%M:%S'))
left.set_title("Default")
fig.autofmt_xdate()

# Right panel: row-number axis labelled through format_date, so no gap.
ax = axes[1]
ax.plot(ind, df1['Data'])
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
ax.set_title("Without empty values")
fig.autofmt_xdate()
plt.show()
Note that the 'Timestamp' column is converted to an index.
df1 = df1.set_index('Timestamp')
Below is the drawing code with a simple moving average. It's hard for me to calculate ema. You can use a library like TA-Lib.
# 33-row simple moving average; the first 32 rows are NaN because the window
# is not yet full.
df1['sma'] = df1['Data'].rolling(window=33).mean()

N = len(df1.index)
ind = np.arange(N)


def format_date(x, pos=None):
    """Return the timestamp label of the df1 row nearest to tick position x."""
    row = np.clip(int(x + 0.5), 0, N - 1)
    return df1.index[row].strftime('%Y-%m-%d %H:%M:%S')


fig, ax = plt.subplots()
# Both series are drawn against the row number so missing periods leave no gap.
for column in ('Data', 'sma'):
    ax.plot(ind, df1[column])
ax.xaxis.set_major_formatter(ticker.FuncFormatter(format_date))
fig.autofmt_xdate()
plt.show()
It also seems correct to me to convert the boundary strings to datetime objects, written in the same format as in the file:
# Boundaries parsed into timestamps carrying a +02:00 offset.
# errors='coerce' turns an unparsable string into NaT instead of raising.
# NOTE(review): `weekend_end` is the earlier and `weekend_start` the later
# timestamp, as in the snippets above.
start_date = pd.to_datetime('2022-06-30T15:26:00+02:00', errors='coerce')
end_date = pd.to_datetime('2022-07-11T15:30:00+02:00', errors='coerce')
weekend_end = pd.to_datetime('2022-07-08T14:30:00+02:00', errors='coerce')
weekend_start = pd.to_datetime('2022-07-11T09:50:00+02:00', errors='coerce')
Update 12/09/2022.
Made it more convenient to draw without gaps: created a column from the index by converting the timestamps to strings. It is the same principle as in the previous version, but here everything is done at once without a formatter function. Also applied MaxNLocator, which sets how many tick divisions to display.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
df = pd.read_csv('Data.csv', sep=',', header=0)

start_date = '2022-06-30 15:26:00'
end_date = '2022-07-11 15:30:00'
# The names are kept from the question: `weekend_end` is the EARLIER bound of
# the excluded period and `weekend_start` the LATER one.
weekend_end = '2022-07-08 14:30:00'
weekend_start = '2022-07-11 09:50:00'

df['Timestamp'] = pd.to_datetime(df['Timestamp'])
stamps = df['Timestamp']
in_window = (stamps > start_date) & (stamps <= end_date)
outside_gap = (stamps > weekend_start) | (stamps < weekend_end)
time_mask = in_window & outside_gap

df1 = df[time_mask].copy().set_index('Timestamp')
# String copies of the timestamps: matplotlib treats strings as categories,
# placing rows at equal spacing, so periods without data leave no gap.
df1['string'] = df1.index.astype(str)
df1['sma'] = df1['Data'].rolling(window=33).mean()

fig, ax = plt.subplots()
for column in ('Data', 'sma'):
    ax.plot(df1['string'], df1[column])
# Limit the number of tick divisions on the category axis.
ax.xaxis.set_major_locator(MaxNLocator(nbins=5))
fig.autofmt_xdate()
plt.show()
Related
I've got the following line diagram with two time series values and two vertical lines and would like to colour the area where "FromGen" is larger than "ToCons" and on the outside of the two vertical lines.
# Question snippet: plot two series for one day and shade an area outside
# the two vertical lines.
start_date="2019-06-18"
end_date="2019-06-19"
x0='2019-06-18 9:00:00'
x1='2019-06-18 17:00:00'
# NOTE(review): the format string has no time part although x1 contains one.
x1= pd.to_datetime(x1, format="%Y-%m-%d", utc=True)
# NOTE(review): `x2` is never assigned in this snippet, and `x0` stays a string.
x2= pd.to_datetime(x2, format="%Y-%m-%d", utc=True)
# Boolean mask for the day, then the matching rows.
zeit = (df['DateTime'] > start_date) & (df['DateTime'] <= end_date)
zeit = df.loc[zeit]
zeit.plot(figsize=(15,10),x="DateTime", y=["FromGen", "ToCons"])
plt.xlabel("Zeit")
plt.ylabel("Wh")
legend = plt.legend(title="comp",
loc=1, fontsize='large', fancybox=True, labels=['FromGen', 'ToCons'])
plt.axvline(x=x0, color='red')
plt.axvline(x=x1, color='red')
# NOTE(review): `ax` is not assigned anywhere in this snippet.
kde_x, kde_y = ax.lines[0].get_data()
# Comparing the x-data array with the string x0 is what raises
# "'<' not supported between instances of 'numpy.ndarray' and 'str'".
ax.fill_between(kde_x, kde_y, where=(kde_x<x0) | (kde_x>x1) ,
interpolate=True, color='#8FF536')
plt.show()
I already found that the code on the last few lines might be helpful - but at the moment I'm struggeling with this error:
'<' not supported between instances of 'numpy.ndarray' and 'str'
Many thanks in advance!
EDIT:
This is how my code looks at the moment- if I don't convert the df into the correct tz everything works perfect. But when I do the green area gets moved. I guess that I'm missing a Timezone convert somewhere but can't find out where....
from datetime import datetime
from pytz import timezone
df = pd.read_csv("filename.csv", error_bad_lines=False, sep=";")
df['DateTime'] = pd.to_datetime(df['DateTime'], format="%Y-%m-%d", utc=True)
#df['DateTime'] = df['DateTime'].dt.tz_convert('Europe/Berlin')
#Works great without this line! But when I use it the coloured area seems to be in the wrong timezone

start_date = "2019-06-18"
end_date = "2019-06-19"
x0 = '2019-06-18 9:00:00'
x1 = '2019-06-18 16:00:00'

# Rows of the selected day.
zeit = df.loc[(df['DateTime'] > start_date) & (df['DateTime'] <= end_date)]

ax = zeit.plot(figsize=(15, 10), x="DateTime", y=["FromGen", "ToCons"])
ax.set_xlabel("Zeit")
ax.set_ylabel("Wh")
legend = ax.legend(title="comp",
                   loc='upper right', fontsize='large', fancybox=True, labels=['FromGen', 'ToCons'])
ax.axvline(x=x0, color='red')
ax.axvline(x=x1, color='red')

# NOTE(review): `datetime_obj` is not defined in this snippet, and x1 is
# still a plain string when it is compared below.
x0 = datetime_obj.replace(tzinfo=timezone('UTC'))

# Shade where FromGen exceeds ToCons outside the two vertical lines.
ax.fill_between(zeit['DateTime'].values, zeit['FromGen'], zeit['ToCons'],
                where=((zeit['FromGen'] > zeit['ToCons']) & ((zeit['DateTime'] <= x0) | (zeit['DateTime'] >=x1))),
                interpolate=False, color='#8FF536')
plt.show()
Working with dates has evolved a lot in the last pandas and matplotlib versions. Therefore, referring to old posts can be misleading. The following code has been tested with matplotlib 3.4.1 and pandas 1.2.4.
Something strange in the code of the question, is that first x0 and x1 are used, and thereafter x1 and x2 without giving a value to x2.
plt.fill_between() can work directly with the numerical columns. To work with the datetime column, it now accepts ...['DateTime'].values. For the where clause, x0 and x1 need to be converted to timestamps (e.g. with pd.to_datetime(...)) so that they can be compared with the datetime column. As the datetime values in my example are not timezone-aware, the comparison doesn't work when using x0 = pd.to_datetime(..., utc=True).
Also note that pandas plotting (zeit.plot(...)) returns a matplotlib ax.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
start_date = "2019-06-18"
end_date = "2019-06-19"
x0 = '2019-06-18 9:00:00'
x1 = '2019-06-18 17:00:00'

# Synthetic demo data: 200 samples at 10-minute spacing.
N = 200
df = pd.DataFrame({'DateTime': pd.date_range(start_date, freq='10min', periods=N),
                   'FromGen': 8 * np.exp(-(np.arange(N) - 80) ** 2 / 1000) + np.random.uniform(0, 0.4, N),
                   'ToCons': np.random.uniform(1, 1.5, N)})

# Convert the vertical-line positions to timestamps so they can be compared
# with the datetime column.  No `format=` is given: the strings contain a
# time of day, which a "%Y-%m-%d" format does not describe (pandas >= 2
# rejects such a mismatch with "unconverted data remains").
x0 = pd.Timestamp(x0)
x1 = pd.Timestamp(x1)

# Rows of the selected day.
zeit = df.loc[(df['DateTime'] > start_date) & (df['DateTime'] <= end_date)]

# DataFrame.plot returns the matplotlib Axes used for everything below.
ax = zeit.plot(figsize=(15, 10), x="DateTime", y=["FromGen", "ToCons"])
ax.set_xlabel("Zeit")
ax.set_ylabel("Wh")
legend = ax.legend(title="comp",
                   loc='upper right', fontsize='large', fancybox=True, labels=['FromGen', 'ToCons'])
ax.axvline(x=x0, color='red')
ax.axvline(x=x1, color='red')

# Shade between the two curves where FromGen exceeds ToCons and the sample
# lies on or outside the two vertical lines.
ax.fill_between(zeit['DateTime'].values, zeit['FromGen'], zeit['ToCons'],
                where=(zeit['FromGen'] > zeit['ToCons']) & ((zeit['DateTime'] <= x0) | (zeit['DateTime'] >= x1)),
                interpolate=False, color='#8FF536')
plt.show()
I am dealing with 20 excels. For each excel, I do the same subsetting and plotting. I can get 20 separate figures now using for loop. But how can I put these figures in one subplot (5*4)? Check some posts but cannot get the answer.
# One separate figure per workbook: filter the rows, drop slope outliers and
# draw a 2-D histogram of slope against AI.
for files in allfiles:
    #if re.search(r".*.xlsx", files):
    # Raw string: the Windows path contains backslashes (\B, \T, \A, \{) that
    # are invalid escape sequences in a normal string literal.
    df = pd.read_excel(r"D:\Brown research\Task2 site selection\All_excel\{0}".format(files))
    newdf = df[(df.slope != 0) & (df.AI >= 0.8) & (df.reach_len > 5000)]
    # Outlier fence: Q3 + 3 * IQR of the unfiltered slope column.
    Q1 = df['slope'].quantile(0.25)
    Q3 = df['slope'].quantile(0.75)
    IQR = Q3 - Q1
    Upper_Whisker = Q3 + 3 * IQR
    newdf = newdf[newdf['slope'] < Upper_Whisker]
    x = newdf['slope']
    y = newdf['AI']
    nbins = 20
    plt.figure()
    # cmin=1 leaves bins with no samples blank.
    plt.hist2d(x, y, nbins, cmap=plt.cm.coolwarm, cmin=1)
    plt.colorbar()
    # Title is the part of the file name before the first underscore.
    plt.title(files.split('_')[0], x=0.5, y=0.9)
Create your fig and axes ahead of time and use matplotlib's object-oriented interface:
from pathlib import Path
from matplotlib import pyplot
import pandas
basedir = Path(r"D:\Brown research\Task2 site selection\All_excel")

# Create the 5 x 4 grid up front and pair every workbook with one Axes.
# zip() stops at the shorter input, so extra files or extra axes are ignored.
fig, axes = pyplot.subplots(5, 4, figsize=(10, 10))
for xlfile, ax in zip(basedir.glob("*.xlsx"), axes.flat):
    df = pandas.read_excel(xlfile)
    # Outlier fence: Q3 + 3 * IQR of the unfiltered slope column.
    Q1 = df['slope'].quantile(0.25)
    Q3 = df['slope'].quantile(0.75)
    IQR = Q3 - Q1
    Upper_Whisker = Q3 + 3 * IQR
    newdf = df.loc[lambda df:
        (df["slope"] != 0) &
        (df["slope"] < Upper_Whisker) &
        (df["AI"] >= 0.8) &
        (df["reach_len"] > 5000)
    ]
    x = newdf['slope']
    y = newdf['AI']
    nbins = 20
    # Only `pyplot` is imported in this snippet, so the colormap comes from
    # pyplot.cm (the name `plt` does not exist here).
    hist = ax.hist2d(x, y, nbins, cmap=pyplot.cm.coolwarm, cmin=1)
    # hist2d returns (counts, xedges, yedges, image); the colorbar needs the
    # image, and ax=ax attaches it to this subplot.
    fig.colorbar(hist[3], ax=ax)
    ax.set_title(xlfile.stem.split('_')[0], x=0.5, y=0.9)
This assumes that you have exactly 20 files in your directory.
Better yet, I would concat everything into a single dataframe and use seaborn to build up the plot dynamically:
from pathlib import Path
from matplotlib import pyplot
import pandas
import seaborn
def get_upper_whisker(df, column, whisker=3):
    """Return the upper outlier fence ``Q3 + whisker * IQR`` of a column.

    Parameters
    ----------
    df : object indexable by ``column`` whose items provide ``quantile``
        (for example a pandas DataFrame).
    column : label of the column to evaluate.
    whisker : multiple of the inter-quartile range added to the third
        quartile.  Defaults to 3, the value that was previously hard-coded.

    Returns
    -------
    The value above which entries of ``column`` count as outliers.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    return Q3 + (whisker * IQR)
basedir = Path(r"D:\Brown research\Task2 site selection\All_excel")

# Read every workbook and tag its rows with the file stem so the rows can be
# grouped and faceted per file afterwards.
data = pandas.concat([
    pandas.read_excel(xlfile).assign(file=xlfile.stem)
    for xlfile in basedir.glob("*.xlsx")  # was `basefir`, an undefined name
])

fig = (
    data.groupby("file")
        # Per file: drop slopes at or above that file's own upper whisker.
        .apply(lambda g:
            g.loc[g["slope"] < get_upper_whisker(g, "slope")]
        )
        # Then apply the filters that do not depend on the file.
        .loc[lambda df:
            (df["slope"] != 0) &
            (df["AI"] >= 0.8) &
            (df["reach_len"] > 5000)
        ]
        # One facet per file, wrapped after four columns.
        .pipe(seaborn.FacetGrid, col="file", col_wrap=4)
)
fig.map(pyplot.hist2d, "slope", "AI", bins=20, cmap=pyplot.cm.coolwarm, cmin=1)
Thank you for the two solutions from #Paul H. For solution 1, just need to revise a little bit, e.g., "fig.colorbar(hist[3],ax=ax)". hist[3] is the "image".
I am currently working on an intra-day stock chart using the Alpha Vantage API. The data frame contains values from 4:00 to 20:00. In my matplotlib.pyplot chart however, the x-Axis also includes values from 20:00 to 4:00 over night. I dont want this as it messes up the aesthetics and also the Volume subplot.
Q: Is there any way to skip x-Axis values which dont exist in the actual Data Frame (the values from 20:00 to 04:00)?
As you can see, the Data Frame clearly jumps from 20:00 to 04:00
However in the Matplotlib chart, the x-Axis contains the values from 20:00 to 4:00, messing with the chart
Code so far. I believe so far everything is right:
import pandas as pd
import matplotlib.pyplot as plt
from alpha_vantage.timeseries import TimeSeries
import time
import datetime as dt
from datetime import timedelta as td
from dateutil.relativedelta import relativedelta
#Accessing and Preparing API
# NOTE(review): `api_key` is not defined in this snippet.
ts = TimeSeries(key=api_key, output_format='pandas')
ticker_input = "TSLA"
interval_input = "15min"
df, meta_data = ts.get_intraday(symbol = ticker_input, interval = interval_input, outputsize = 'full')
# Keep the first 16*4*5 = 320 rows, then reverse the row order.
# NOTE(review): presumably 16 hours x 4 quarter-hours x 5 days -- confirm.
slice_date = 16*4*5
df = df[0:slice_date]
df = df.iloc[::-1]
# NOTE(review): the column is called "100ma" but the rolling window is 50 rows.
df["100ma"] = df["4. close"].rolling(window = 50, min_periods = 0).mean()
df["Close"] = df["4. close"]
df["Date"] = df.index
#Plotting all as 2 different subplots
# Plotting against the datetime column keeps the x-axis continuous, so the
# overnight hours without rows still take up space.
ax1 = plt.subplot2grid((7,1), (0,0), rowspan = 5, colspan = 1)
ax1.plot(df["Date"], df['Close'])
ax1.plot(df["Date"], df["100ma"], linewidth = 0.5)
plt.xticks(rotation=45)
# NOTE(review): this call uses a (6,1) grid while ax1 uses (7,1), and
# rowspan/colspan reach past the 6x1 grid -- confirm the intended layout.
ax2 = plt.subplot2grid((6,1), (5,0), rowspan = 2, colspan = 2, sharex = ax1)
ax2.bar(df["Date"], df["5. volume"])
ax2.axes.xaxis.set_visible(False)
plt.tight_layout()
plt.show()
It would be great if anybody could help. Im still a complete beginner and only started Python 2 weeks ago.
We got the data from the same place, although the data acquisition method is different. After extracting it at 15-minute intervals, I created a graph that excludes the data after 8 pm and before 4 am. I wrote the code on the understanding that you want the overnight pause to be skipped: the points you want skipped are left out of the line once they are set to NaN.
import datetime
import pandas as pd
import numpy as np
import pandas_datareader.data as web
import mplfinance as mpf
# import matplotlib.pyplot as plt
with open('./alpha_vantage_api_key.txt') as f:
    # strip(): a key file usually ends with a newline, which must not become
    # part of the key.
    api_key = f.read().strip()

now_ = datetime.datetime.today()
start = datetime.datetime(2019, 1, 1)
# Midnight of yesterday.  Subtracting a timedelta (rather than building the
# date from `now_.day - 1`) also works on the first day of a month.
end = datetime.datetime(now_.year, now_.month, now_.day) - datetime.timedelta(days=1)

symbol = 'TSLA'
df = web.DataReader(symbol, 'av-intraday', start, end, api_key=api_key)

df.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
df.index = pd.to_datetime(df.index)
# NOTE(review): the column is called "100ma" but the rolling window is 50 rows.
df["100ma"] = df["Close"].rolling(window = 50, min_periods = 0).mean()
df["Date"] = df.index
# Re-index onto a regular 15-minute grid (missing slots become NaN rows),
# then keep only rows whose hour lies between 4 and 20 inclusive.
df_15 = df.asfreq('15min')
df_15 = df_15[(df_15.index.hour >= 4)&(df_15.index.hour <= 20) ]

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(8,4.5),dpi=144)

#Plotting all as 2 different subplots
ax1 = plt.subplot2grid((7,1), (0,0), rowspan = 5, colspan = 1)
ax1.plot(df_15["Date"], df_15['Close'])
ax1.plot(df_15["Date"], df_15["100ma"], linewidth = 0.5)
plt.xticks(rotation=20)

# NOTE(review): this call uses a (6,1) grid while ax1 uses (7,1) -- confirm
# the intended layout.
ax2 = plt.subplot2grid((6,1), (5,0), rowspan = 2, colspan = 2, sharex = ax1)
ax2.bar(df_15["Date"], df_15["Volume"])
ax2.axes.xaxis.set_visible(False)
# plt.tight_layout()
plt.show()
I fixed it using matplotlib.ticker.Formatter.
I first created a subclass of it:
class MyFormatter(Formatter):
    """Tick formatter that labels integer tick positions with dates.

    ``dates`` is an indexable sequence of objects providing ``strftime``;
    a tick at position ``x`` is labelled with ``dates[round(x)]`` rendered
    through ``fmt``.
    """

    def __init__(self, dates, fmt='%Y-%m-%d %H:%M'):
        self.fmt = fmt
        self.dates = dates

    def __call__(self, x, pos=0):
        """Return the label for time x at position pos"""
        ind = int(np.round(x))
        # Positions outside the data get an empty label.
        if 0 <= ind < len(self.dates):
            return self.dates[ind].strftime(self.fmt)
        return ''
# Label integer tick positions with the matching timestamps of df.index.
formatter = MyFormatter(df.index)

ax1 = plt.subplot2grid((7,1), (0,0), rowspan = 5, colspan = 1)
ax1.xaxis.set_major_formatter(formatter)
# Plot against the row number so hours without data take up no space.
ax1.plot(np.arange(len(df)), df["Close"])
ax1.plot(np.arange(len(df)), df["100ma"], linewidth = 0.5)
# Axes has no `xticks` method (that is a pyplot function); rotate the tick
# labels through tick_params instead.
ax1.tick_params(axis='x', labelrotation=45)
# NOTE(review): xmin, xmax, ymin and ymax are not defined in this snippet;
# they must be set beforehand.
ax1.axis([xmin,xmax,ymin,ymax])

# NOTE(review): this call uses a (6,1) grid while ax1 uses (7,1) -- confirm
# the intended layout.
ax2 = plt.subplot2grid((6,1), (5,0), rowspan = 2, colspan = 2, sharex = ax1)
ax2.bar(np.arange(len(df)), df["5. volume"])
plt.show()
This gave me a smoother graph than the one before and also that recommended by r-beginner.
The only issue that I have is that when I zoom in, the x-axis labels don't really change: they always show the year, month, day, hour, and minute. Obviously I only want the hour and minute when I'm zoomed in further. I have yet to figure out how to do that.
I have very simple code:
from matplotlib import dates
import matplotlib.ticker as ticker
my_plot=df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90)
I've got:
but I would like to have fewer labels on X axis. To do this I've add:
my_plot.xaxis.set_major_locator(ticker.MaxNLocator(12))
It generates fewer labels but values of labels have wrong values (=first of few labels from whole list)
What am I doing wrong?
I have add additional information:
I've forgoten to show what is inside DataFrame.
I have three columns:
reg_Date - datetime64 (index)
temperature - float64
Day - date converted from reg_Date to string, it looks like '2017-10' (YYYY-MM)
Box plot group date by 'Day' and I would like to show values 'Day" as a label but not all values
, for example every third one.
You were almost there. Just set ticker.MultipleLocator.
The pandas.DataFrame.boxplot also returns axes, which is an object of class matplotlib.axes.Axes. So you can use this code snippet to customize your labels:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Random demo data: the four (10, 20) arrays are stacked into (40, 20), so
# boxplot draws 20 boxes, placed at x positions 1..20.
center = np.random.randint(50,size=(10, 20))
spread = np.random.rand(10, 20) * 30
flier_high = np.random.rand(10, 20) * 30 + 30
flier_low = np.random.rand(10, 20) * -30
y = np.concatenate((spread, center, flier_high, flier_low))

fig, ax = plt.subplots(figsize=(10, 5))
ax.boxplot(y)

x = ['Label '+str(i) for i in range(20)]
ax.set_xlabel('Day')


def _label_for_position(value, pos=None):
    """Return the label of the box drawn at x position ``value`` ('' if none)."""
    index = int(round(value)) - 1  # boxes are 1-based, the label list is 0-based
    return x[index] if 0 <= index < len(x) else ''


# Set a tick on each integer multiple of a base within the view interval.
ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
# Look the label up from the tick position.  A fixed label list
# (set_xticklabels) is assigned to ticks in order, so after thinning the
# ticks the first few labels would be shown at the wrong boxes.
ax.xaxis.set_major_formatter(ticker.FuncFormatter(_label_for_position))
plt.xticks(rotation=90)
I think there is a compatibility issue with Pandas plots and Matplotlib formatters.
With the following code:
df = pd.read_csv('lt_stream-1001-full.csv', header=0, encoding='utf8')
df['reg_date'] = pd.to_datetime(df['reg_date'] , format='%Y-%m-%d %H:%M:%S')
df.set_index('reg_date', inplace=True)

# Hourly means, grouped for the box plot by a 'YYYY-MM' string.
df_h = df.resample(rule='H').mean()
df_h['Day']=df_h.index.strftime('%Y-%m')
print(df_h)

f, ax = plt.subplots()
my_plot = df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90, ax=ax)

# Keep every third tick label and blank out the rest; the ticks themselves
# stay where they are.
locs, labels = plt.xticks()
new_labels = [
    label if position % 3 == 0 else ''
    for position, label in enumerate(labels)
]
ax.set_xticklabels(new_labels)
plt.show()
You get this chart:
But I notice that this is grouped by month instead of by day. It may not be what you wanted.
Adding the day component to the string 'Day' messes up the chart as there seems to be too many boxes.
df = pd.read_csv('lt_stream-1001-full.csv', header=0, encoding='utf8')
df['reg_date'] = pd.to_datetime(df['reg_date'] , format='%Y-%m-%d %H:%M:%S')
df.set_index('reg_date', inplace=True)

# Hourly means, grouped for the box plot by a 'YYYY-MM-DD' string.
df_h = df.resample(rule='H').mean()
df_h['Day']=df_h.index.strftime('%Y-%m-%d')
print(df_h)

f, ax = plt.subplots()
my_plot = df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90, ax=ax)

# Keep every 15th tick label and blank out the rest; the ticks themselves
# stay where they are.
locs, labels = plt.xticks()
new_labels = [
    label if position % 15 == 0 else ''
    for position, label in enumerate(labels)
]
ax.set_xticklabels(new_labels)
plt.show()
The for loop creates the tick labels every as many periods as desired. In the first chart they were set every 3 months. In the second one, every 15 days.
If you would like to see less grid lines:
df = pd.read_csv('lt_stream-1001-full.csv', header=0, encoding='utf8')
df['reg_date'] = pd.to_datetime(df['reg_date'] , format='%Y-%m-%d %H:%M:%S')
df.set_index('reg_date', inplace=True)

# Hourly means, grouped for the box plot by a 'YYYY-MM-DD' string.
df_h = df.resample(rule='H').mean()
df_h['Day']=df_h.index.strftime('%Y-%m-%d')
print(df_h)

f, ax = plt.subplots()
my_plot = df_h.boxplot(by='Day',figsize=(12,5), showfliers=False, rot=90, ax=ax)

# Keep only every third tick together with its label, so the dropped ticks
# disappear entirely instead of being left unlabelled.
locs, labels = plt.xticks()
new_locs = list(locs[::3])
new_labels = list(labels[::3])
ax.set_xticks(new_locs)
ax.set_xticklabels(new_labels)
# Grid lines along the y direction only.
ax.grid(axis='y')
plt.show()
I've read about x_compat in Pandas plot in order to apply Matplotlib formatters, but I get an error when trying to apply it. I'll give it another shot later.
Old unsuccesful answer
The tick labels seem to be dates. If they are set as datetime in your dataframe, you can:
# Ticks on the first day of January, April, July and October.  The months
# must be passed as one `bymonth` sequence: MonthLocator's positional
# parameters are (bymonth, bymonthday, interval, tz), so MonthLocator(1,4,7,10)
# would mean "January, 4th day, every 7th occurrence, tz=10".
months = mdates.MonthLocator(bymonth=(1, 4, 7, 10)) #Choose the months you like the most
ax.xaxis.set_major_locator(months)
Otherwise, you can let Matplotlib know they are dates by:
ax.xaxis_date()
Your comment:
I have add additional information:
I've forgoten to show what is inside DataFrame.
I have three columns:
reg_Date - datetime64 (index)
temperature - float64
Day - date converted from reg_Date to string, it looks like '2017-10' *(YYYY-MM) *
Box plot group date by 'Day' and I would like to show values 'Day" as a label but not all values
, for example every third one.
Based on your comment in italic above, I would use reg_Date as the input and the following lines:
days = mdates.DayLocator(interval=3)
daysFmt = mdates.DateFormatter('%Y-%m') #to format display
ax.xaxis.set_major_locator(days)
ax.xaxis.set_major_formatter(daysFmt)
I forgot to mention that you will need to:
import matplotlib.dates as mdates
Does this work?
Im using the following code:
import matplotlib.pyplot as pyplot
import pandas as pandas
from datetime import datetime
# Load the log and keep only the rows of one source address.
dataset = pandas.read_csv("HugLog_17.01.11.csv", sep=",", header=0)
print('filter data for SrcAddr')
dataset_filtered = dataset[dataset['SrcAddr']=='0x1FD3']
print('get Values')
varY = dataset_filtered.Battery_Millivolt.values
varX = dataset_filtered.Timestamp.values
print('Convert the date-strings in date-objects.')
# NOTE(review): '%y' expects a two-digit year -- confirm against the file.
dates_list = [datetime.strptime(date, '%y-%m-%d %H:%M:%S') for date in varX]
fig = pyplot.figure()
ax1 = fig.add_subplot(1,1,1)
ax1.set_xlabel('Time')
ax1.set_ylabel('Millivolt')
# One bar per data point.
ax1.bar(dates_list, varY)
pyplot.locator_params(axis='x',nbins=10)
pyplot.show()
The problem i have is, its a large datacollection with 180k datapoints.
Pyplot displays all points on the graph, which makes it slow and makes the bars overlap. Is there a way to set a maximum limit on how many data points are displayed in a "view"?
What I mean by that is that as soon as the graph is rendered there are only 50 data points, and when I zoom in I again get a maximum of 50 data points.
Resampling can be done with the resample function from pandas.
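Note that the resample syntax changed between pandas 0.17 and 0.19: the old style is `resample(t, how="mean")`, the new style is `resample(t).mean()`. The `how` keyword was later removed, so current pandas only accepts the new style, and code written in the old style must be translated accordingly. See e.g. this tutorial for the new style.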
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# generate some data for every second over a whole day
# generate some data for every second over a whole day
# (frequencies are given as Timedelta objects, which every pandas version
# accepts, instead of the "S"/"H"/"T" string aliases)
times = pd.date_range(start='2017-01-11', periods=86400, freq=pd.Timedelta(seconds=1))
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
             np.random.rand(len(df.index))*100

# resample the data, taking the mean over 1 hour
t = pd.Timedelta(hours=1)  # try pd.Timedelta(minutes=1) for minutes as well
# matplotlib's date axis is measured in days, so the bar width is the bin
# size expressed as a fraction of one day (1/24 for hourly bins)
width = t / pd.Timedelta(days=1)

df_resampled = pd.DataFrame()
# resample(...).mean() replaces the removed `resample(t, how="mean")` form
df_resampled['data'] = df.data.resample(t).mean()

fig, ax = plt.subplots()
#ax.bar(df.index, df['data'], width=1./(24*60*60)) # original data, takes too long to plot
ax.bar(df_resampled.index, df_resampled['data'], width=width)
ax.xaxis_date()
plt.show()
Automatic adaption of the resampling when zooming would indeed require some manual work. There is a resampling example on the matplotlib event handling page, which does not work out of the box but could be adapted accordingly.
This is how it would look like:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates
class Sampler():
    """Re-bin the ``data`` column of a time-indexed frame to suit a view range.

    ``df`` must have a datetime index and a ``data`` column.  ``resample``
    picks a bin size from the width of the requested range, and ``update``
    is meant to be connected to an Axes' ``xlim_changed`` callback so the
    bars are rebuilt whenever the view changes.
    """

    def __init__(self,df):
        self.df = df

    def resample(self, limits):
        """Return ``(index, values, bar_width)`` for the range ``limits``.

        ``limits`` is a ``[low, high]`` pair, either of timestamps or of
        floats as delivered by the axis limits (converted with ``num2date``).
        The bin size shrinks with the width of the range; ``bar_width`` is
        that bin size expressed in days.  The result is also stored in
        ``self.resampled``.
        """
        print(limits)
        dt = limits[1] - limits[0]
        # pd.Timedelta subclasses datetime.timedelta, so one isinstance check
        # covers both; anything else is treated as a number of days.
        if not isinstance(dt, datetime.timedelta):
            dt = datetime.timedelta(days=dt)
        print(dt)
        # Bin sizes are Timedelta objects rather than "H"/"15T"/"S" strings:
        # they are accepted by every pandas version and the bar width can be
        # derived from them instead of being repeated by hand.
        if dt > datetime.timedelta(hours=5):
            t = pd.Timedelta(hours=1)
        elif dt > datetime.timedelta(minutes=60):
            t = pd.Timedelta(minutes=15)
        elif dt > datetime.timedelta(minutes=5):
            t = pd.Timedelta(minutes=1)
        elif dt > datetime.timedelta(seconds=60):
            t = pd.Timedelta(seconds=15)
        else:
            #dt <= datetime.timedelta(seconds=60):
            t = pd.Timedelta(seconds=1)
        width = t / pd.Timedelta(days=1)

        self.resampled = pd.DataFrame()
        # resample(...).mean() replaces the removed `resample(t, how="mean")`.
        self.resampled['data'] = self.df.data.resample(t).mean()
        print(t, len(self.resampled['data']))
        print("indextype", type(self.resampled.index[0]))
        print("limitstype", type(limits[1]))

        lowlimit, uplimit = limits[0], limits[1]
        if isinstance(uplimit, (float, np.floating)):
            lowlimit = matplotlib.dates.num2date(lowlimit)
            uplimit = matplotlib.dates.num2date(uplimit)
            if self.resampled.index.tz is None:
                # num2date returns timezone-aware datetimes, which cannot be
                # compared with a timezone-naive index.
                lowlimit = lowlimit.replace(tzinfo=None)
                uplimit = uplimit.replace(tzinfo=None)
            print(type(uplimit), uplimit)
        index = self.resampled.index
        self.resampled = self.resampled.loc[(index >= lowlimit) & (index <= uplimit)]
        return self.resampled.index,self.resampled['data'],width

    def update(self, ax):
        """Redraw ``ax`` with bars re-binned for its current x-limits."""
        print("update")
        lims = ax.viewLim
        start, stop = lims.intervalx
        ax.clear()
        x,y,width = self.resample([start, stop])
        ax.bar(x,y, width=width)
        # Restore the view first and only then re-register the callback.
        ax.set_xlim([start, stop])
        ax.callbacks.connect('xlim_changed', self.update)
        ax.figure.canvas.draw()
# Demo data: one value per second for a whole day (86400 rows).
times = pd.date_range(start='2017-01-11',periods=86400, freq='1S')
df = pd.DataFrame(index = times)
df['data'] = np.sort(np.random.randint(low=1300, high=1600, size=len(df.index)) )[::-1] + \
np.random.rand(len(df.index))*500
sampler = Sampler(df)
# Initial bars for the full range of the data.
x,y,width = sampler.resample( [df.index[0],df.index[-1] ] )
fig, ax = plt.subplots()
ax.bar(x,y, width=width)
ax.xaxis_date()
# connect to limits changes
ax.callbacks.connect('xlim_changed', sampler.update)
plt.show()
One thing you can do is plot a random subset of the data by using the sample method on your pandas DataFrame. Use the frac argument to determine the fraction of points you want to use. It ranges from 0 to 1.
After you get your dataset_filtered DataFrame, take a sample of it like this
dataset_filtered_sample = dataset_filtered.sample(frac=.001)