I have a dataframe with a time stamp as the index and a column of labels
df = DataFrame({'time': [datetime(2015,11,2,4,41,10), datetime(2015,11,2,4,41,39), datetime(2015,11,2,4,41,47),
                         datetime(2015,11,2,4,41,59), datetime(2015,11,2,4,42,4),  datetime(2015,11,2,4,42,11),
                         datetime(2015,11,2,4,42,15), datetime(2015,11,2,4,42,30), datetime(2015,11,2,4,42,39),
                         datetime(2015,11,2,4,42,41), datetime(2015,11,2,5,2,9),   datetime(2015,11,2,5,2,10),
                         datetime(2015,11,2,5,2,16),  datetime(2015,11,2,5,2,29),  datetime(2015,11,2,5,2,51),
                         datetime(2015,11,2,5,9,1),   datetime(2015,11,2,5,9,21),  datetime(2015,11,2,5,9,31),
                         datetime(2015,11,2,5,9,40),  datetime(2015,11,2,5,9,55)],
                'Label': [2,0,0,0,1,0,0,1,1,1,1,3,0,0,3,0,1,0,1,1]}).set_index(['time'])
I want to get the average number of times that a label appears in a distinct minute in a distinct hour.
For example, Label 0 appears 3 times in hour 4 in minute 41, 2 times in hour 4
in minute 42,
2 times in hour 5 in minute 2, and 2 times in hour 5 in minute 9, so its average count per
minute in hour 4 is
(2+3)/2=2.5
and its count per minute in hour 5 is
(2+2)/2=2
The output I am looking for is
Hour 1
Label avg
0 2.5
1 2
2 .5
3 0
Hour 2
Label avg
0 2
1 1.5
2 0
3 1
What I have so far is
df['hour']=df.index.hour
hour_grp=df.groupby(['hour'], as_index=False)
then I can do something like
res = []
for key, value in hour_grp:
    res.append(value)
then group by minute
res[0].groupby(pd.TimeGrouper('1Min'))['Label'].value_counts()
but this is where I'm stuck, not to mention it is not very efficient
Start by squeezing your DataFrame into a Series (after all, it only has one column):
s = df.squeeze()
Compute how many times each label occurs by minute:
counts_by_min = (s.resample('min')
.apply(lambda x: x.value_counts())
.unstack()
.fillna(0))
# 0 1 2 3
# time
# 2015-11-02 04:41:00 3.0 0.0 1.0 0.0
# 2015-11-02 04:42:00 2.0 4.0 0.0 0.0
# 2015-11-02 05:02:00 2.0 1.0 0.0 2.0
# 2015-11-02 05:09:00 2.0 3.0 0.0 0.0
Resample counts_by_min by hour to obtain the number of times each label occurs by hour:
counts_by_hour = counts_by_min.resample('H').sum()
# 0 1 2 3
# time
# 2015-11-02 04:00:00 5.0 4.0 1.0 0.0
# 2015-11-02 05:00:00 4.0 4.0 0.0 2.0
Count the number of minutes each label occurs by hour:
minutes_by_hour = counts_by_min.astype(bool).resample('H').sum()
# 0 1 2 3
# time
# 2015-11-02 04:00:00 2.0 1.0 1.0 0.0
# 2015-11-02 05:00:00 2.0 2.0 0.0 1.0
Divide the last two to get the result you want:
avg_per_hour = counts_by_hour.div(minutes_by_hour).fillna(0)
# 0 1 2 3
# time
# 2015-11-02 04:00:00 2.5 4.0 1.0 0.0
# 2015-11-02 05:00:00 2.0 2.0 0.0 2.0
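If you also want the per-hour "Label / avg" layout from the question, a purely cosmetic sketch (assuming avg_per_hour from above) is:
# print one small "avg" table per hour; ts is the hourly timestamp
for ts, row in avg_per_hour.iterrows():
    print(f"Hour {ts.hour}")
    print(row.rename_axis('Label').to_frame('avg'))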
Accessing the minute of the DatetimeIndex:
mn = df.index.minute
Accessing the hour of the DatetimeIndex:
hr = df.index.hour
Perform a groupby using the minute and hour variables obtained above as keys. Compute value_counts of the contents of Label and unstack, filling missing values with 0. Finally, take the mean at index level 1 (the hour level), i.e. average over the minutes within each hour.
df.groupby([mn,hr])['Label'].value_counts().unstack(fill_value=0).mean(level=1)
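A hedged side note: in recent pandas versions the level argument of mean() has been deprecated and removed, so the last step may need an explicit groupby instead. An equivalent form should be:
df.groupby([mn, hr])['Label'].value_counts().unstack(fill_value=0).groupby(level=1).mean()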
I have data that looks like this: (assume start and end are date times)
id  start  end
1   01-01  01-02
1   01-03  01-05
1   01-04  01-07
1   01-06  NaT
1   01-07  NaT
I want to get a data frame that includes all dates, with a 'cumulative sum' that only counts the ranges each date falls in.
dates  count
01-01  1
01-02  0
01-03  1
01-04  2
01-05  1
01-06  2
01-07  3
One idea I thought of was simply using cumcount on the start dates, and doing a 'reverse cumcount' decreasing the counts using the end dates, but I am having trouble wrapping my head around doing this in pandas and I'm wondering whether there's a more elegant solution.
Here are two options. First, consider this data with only one id; note that your start and end columns must be datetime.
d = {'id': [1, 1, 1, 1, 1],
'start': [pd.Timestamp('2021-01-01'), pd.Timestamp('2021-01-03'),
pd.Timestamp('2021-01-04'), pd.Timestamp('2021-01-06'),
pd.Timestamp('2021-01-07')],
'end': [pd.Timestamp('2021-01-02'), pd.Timestamp('2021-01-05'),
pd.Timestamp('2021-01-07'), pd.NaT, pd.NaT]}
df = pd.DataFrame(d)
To get your result, you can sub the get_dummies of end from the get_dummies of start, then sum in case several ranges start and/or end on the same date, cumsum along the dates, and reindex to get all the dates between the min and max dates available. Wrap this in a function:
def dates_cc(df_):
    return (
        pd.get_dummies(df_['start'])
        .sub(pd.get_dummies(df_['end'], dtype=int), fill_value=0)
        .sum()
        .cumsum()
        .to_frame(name='count')
        .reindex(pd.date_range(df_['start'].min(), df_['end'].max()), method='ffill')
        .rename_axis('dates')
    )
Now you can apply this function to your dataframe
res = dates_cc(df).reset_index()
print(res)
# dates count
# 0 2021-01-01 1.0
# 1 2021-01-02 0.0
# 2 2021-01-03 1.0
# 3 2021-01-04 2.0
# 4 2021-01-05 1.0
# 5 2021-01-06 2.0
# 6 2021-01-07 2.0
Now if you have several ids, like
df1 = df.assign(id=[1,1,2,2,2])
print(df1)
# id start end
# 0 1 2021-01-01 2021-01-02
# 1 1 2021-01-03 2021-01-05
# 2 2 2021-01-04 2021-01-07
# 3 2 2021-01-06 NaT
# 4 2 2021-01-07 NaT
then you can use the above function like
res1 = df1.groupby('id').apply(dates_cc).reset_index()
print(res1)
# id dates count
# 0 1 2021-01-01 1.0
# 1 1 2021-01-02 0.0
# 2 1 2021-01-03 1.0
# 3 1 2021-01-04 1.0
# 4 1 2021-01-05 0.0
# 5 2 2021-01-04 1.0
# 6 2 2021-01-05 1.0
# 7 2 2021-01-06 2.0
# 8 2 2021-01-07 2.0
That said, a more straightforward possibility is crosstab, which creates a row per id; the rest is about the same manipulations.
res2 = (
pd.crosstab(index=df1['id'], columns=df1['start'])
.sub(pd.crosstab(index=df1['id'], columns=df1['end']), fill_value=0)
.reindex(columns=pd.date_range(df1['start'].min(), df1['end'].max()), fill_value=0)
.rename_axis(columns='dates')
.cumsum(axis=1)
.stack()
.reset_index(name='count')
)
print(res2)
# id dates count
# 0 1 2021-01-01 1.0
# 1 1 2021-01-02 0.0
# 2 1 2021-01-03 1.0
# 3 1 2021-01-04 1.0
# 4 1 2021-01-05 0.0
# 5 1 2021-01-06 0.0
# 6 1 2021-01-07 0.0
# 7 2 2021-01-01 0.0
# 8 2 2021-01-02 0.0
# 9 2 2021-01-03 0.0
# 10 2 2021-01-04 1.0
# 11 2 2021-01-05 1.0
# 12 2 2021-01-06 2.0
# 13 2 2021-01-07 2.0
The main difference between the two options is that this one creates extra dates for each id: for example, 2021-01-01 belongs to id=1 but not id=2, and with this version you get that date for id=2 as well, while in the groupby version it is not taken into account.
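If those padded dates are unwanted, one possible clean-up (my own sketch, not part of the answer above) is to trim res2 back to each id's own date span so it matches res1:
# keep only the dates between each id's earliest start and latest end
spans = df1.groupby('id').agg(lo=('start', 'min'), hi=('end', 'max'))
res2_trimmed = res2.join(spans, on='id')
res2_trimmed = (res2_trimmed[res2_trimmed['dates'].between(res2_trimmed['lo'], res2_trimmed['hi'])]
                .drop(columns=['lo', 'hi'])
                .reset_index(drop=True))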
I've got a dataframe called new_dh of web requests that looks like this (there are more columns):
s-sitename sc-win32-status
date_time
2006-11-01 00:00:00 W3SVC1 0.0
2006-11-01 00:00:00 W3SVC1 0.0
2006-11-01 01:00:00 W3SVC1 0.0
2006-11-01 01:00:00 W3SVC1 0.0
2006-11-01 02:00:00 W3SVC1 0.0
2007-02-28 02:00:00 W3SVC1 0.0
2007-02-28 10:00:00 W3SVC1 0.0
2007-02-28 23:00:00 W3SVC1 0.0
2007-02-28 23:00:00 W3SVC1 0.0
2007-02-28 23:00:00 W3SVC1 0.0
What I would like to do is group by the hours of the DatetimeIndex (the actual date of the request does not matter, just the hour, and all the times have already been rounded down so there are no minutes) and instead return
count
hour
0 2
01 2
02 2
10 1
23 3
Any help would be much appreciated.
I have tried
new_dh.groupby([new_dh.index.hour]).count()
but find myself printing many columns of the same value whereas I only want the above version
If you need a DatetimeIndex in the output, use DataFrame.resample:
new_dh.resample('H')['s-sitename'].count()
Or DatetimeIndex.floor:
new_dh.groupby(new_dh.index.floor('H'))['s-sitename'].count()
The problem with your solution is that GroupBy.count counts the values of every column per hour, excluding missing values, so if there are no missing values you get multiple columns with the same values. A possible solution is to specify the column after the groupby:
new_dh.groupby([new_dh.index.hour])['s-sitename'].count()
The data was changed here so you can see how count excludes missing values:
print (new_dh)
s-sitename sc-win32-status
date_time
2006-11-01 00:00:00 W3SVC1 0.0
2006-11-01 00:00:00 W3SVC1 0.0
2006-11-01 01:00:00 W3SVC1 0.0
2006-11-01 01:00:00 W3SVC1 0.0
2006-11-01 02:00:00 NaN 0.0
2007-02-28 02:00:00 W3SVC1 0.0
2007-02-28 10:00:00 W3SVC1 0.0
2007-02-28 23:00:00 NaN 0.0
2007-02-28 23:00:00 NaN 0.0
2007-02-28 23:00:00 W3SVC1 0.0
df = new_dh.groupby([new_dh.index.hour]).count()
print (df)
s-sitename sc-win32-status
date_time
0 2 2
1 2 2
2 1 2
10 1 1
23 1 3
So if the column is specified:
s = new_dh.groupby([new_dh.index.hour])['s-sitename'].count()
print (s)
date_time
0 2
1 2
2 1
10 1
23 1
Name: s-sitename, dtype: int64
df = new_dh.groupby([new_dh.index.hour])['s-sitename'].count().to_frame()
print (df)
s-sitename
date_time
0 2
1 2
2 1
10 1
23 1
If you need to count missing values as well, use GroupBy.size:
s = new_dh.groupby([new_dh.index.hour])['s-sitename'].size()
print (s)
date_time
0 2
1 2
2 2
10 1
23 3
Name: s-sitename, dtype: int64
df = new_dh.groupby([new_dh.index.hour])['s-sitename'].size().to_frame()
print (df)
s-sitename
date_time
0 2
1 2
2 2
10 1
23 3
new_dh['hour'] = new_dh.index.map(lambda x: x.hour)
new_dh.groupby('hour')['hour'].count()
Result
hour
0 2
1 2
2 2
10 1
23 3
Name: hour, dtype: int64
If you need a DataFrame as result:
new_dh.groupby('hour')['hour'].count().rename('count').to_frame()
In this case, the result will be:
count
hour
0 2
1 2
2 2
10 1
23 3
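A small aside (not part of the answer above): the hour is also available directly on the DatetimeIndex, so the map/lambda can be avoided:
new_dh['hour'] = new_dh.index.hour  # vectorized equivalent of the map/lambda above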
You can also do this by using the groupby() and assign() methods:
If the 'date_time' column is not your index:
result=df.assign(hour=df['date_time'].dt.hour).groupby('hour').agg(count=('s-sitename','count'))
If it's your index, then use:
result=df.groupby(df.index.hour)['s-sitename'].count().to_frame('count')
result.index.name='hour'
Now if you print result then you will get your desired output:
count
hour
0 1
1 2
2 2
10 1
23 3
I have a dataframe, below:
ID Date Volume Sales
1 2020-02 10 4
1 2020-03 8 6
2 2020-02 6 8
2 2020-03 4 10
Is there an easy way to convert this to weekly data using resampling? And dividing the volume and sales column by the number of weeks in the month?
I have started my process with code which looks like:
import pandas as pd
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')
grouped = df.groupby('ID').resample('W').ffill().reset_index()
print(grouped)
After this step, I get an error message: cannot insert ID, already exists
Also, is there code for finding the number of weeks in a month, so I can divide the Volume and Sales columns by the number of weeks in the month?
The Expected output is :
ID Volume Sales Weeks
0 1 2.5 1.0 2020-02-02
0 1 2.5 1.0 2020-02-09
0 1 2.5 1.0 2020-02-16
0 1 2.5 1.0 2020-02-23
1 1 1.6 1.2 2020-03-01
1 1 1.6 1.2 2020-03-08
1 1 1.6 1.2 2020-03-15
1 1 1.6 1.2 2020-03-22
1 1 1.6 1.2 2020-03-29
2 2 1.5 2 2020-02-02
2 2 1.5 2 2020-02-09
2 2 1.5 2 2020-02-16
2 2 1.5 2 2020-02-23
3 2 0.8 2 2020-03-01
3 2 0.8 2 2020-03-08
3 2 0.8 2 2020-03-15
3 2 0.8 2 2020-03-22
3 2 0.8 2 2020-03-29
After review, a much simpler solution can be used. Please refer to the subsection labeled New Solution in Part 1 below.
This task requires multiple steps. Let's break it down as follows:
Part 1: Transform Date & Resample
New Solution
Given that the required weekly frequency is Sunday based (i.e. freq='W-SUN') and that each month is independent of, and unaffected by, any adjacent months, we can use the year-month values in column Date to generate weekly date ranges directly in one step, rather than first generating daily date ranges from the year-month values and then resampling those daily ranges to weekly.
The new logic just needs pd.date_range() with freq='W', with the help of pd.offsets.MonthEnd(), to generate the weekly dates for a month. Altogether, it does not need to call .resample() or .asfreq() like the other solutions; effectively, pd.date_range() with freq='W' does the resampling for us.
Here is the code:
df['Weeks'] = df['Date'].map(lambda x:
pd.date_range(
start=pd.to_datetime(x),
end=(pd.to_datetime(x) + pd.offsets.MonthEnd()),
freq='W'))
df = df.explode('Weeks')
Result:
print(df)
ID Date Volume Sales Weeks
0 1 2020-02 10 4 2020-02-02
0 1 2020-02 10 4 2020-02-09
0 1 2020-02 10 4 2020-02-16
0 1 2020-02 10 4 2020-02-23
1 1 2020-03 8 6 2020-03-01
1 1 2020-03 8 6 2020-03-08
1 1 2020-03 8 6 2020-03-15
1 1 2020-03 8 6 2020-03-22
1 1 2020-03 8 6 2020-03-29
2 2 2020-02 6 8 2020-02-02
2 2 2020-02 6 8 2020-02-09
2 2 2020-02 6 8 2020-02-16
2 2 2020-02 6 8 2020-02-23
3 2 2020-03 4 10 2020-03-01
3 2 2020-03 4 10 2020-03-08
3 2 2020-03 4 10 2020-03-15
3 2 2020-03 4 10 2020-03-22
3 2 2020-03 4 10 2020-03-29
With the two lines of code above, we already get the required result for Part 1. We don't need to go through the more complicated .groupby() and .resample() code of the old solution.
We can continue to Part 2. As we have not created the grouped object, we can either replace grouped with df in the Part 2 code or add a new line grouped = df to continue, as sketched below.
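For example, the bridge can be as simple as (assuming the exploded df from the new solution above):
grouped = df  # reuse the exploded frame under the name expected by the Part 2 snippets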
Old Solution
We use pd.date_range() with freq='D', with the help of pd.offsets.MonthEnd(), to produce daily entries for the full month, then turn these full-month ranges into the index before resampling to weekly frequency. We resample with closed='left' to exclude the unwanted week of 2020-04-05 produced under the default resample() parameters.
df['Weeks'] = df['Date'].map(lambda x:
pd.date_range(
start=pd.to_datetime(x),
end=(pd.to_datetime(x) + pd.offsets.MonthEnd()),
freq='D'))
df = df.explode('Weeks').set_index('Weeks')
grouped = (df.groupby(['ID', 'Date'], as_index=False)
.resample('W', closed='left')
.ffill().dropna().reset_index(-1))
Result:
print(grouped)
Weeks ID Date Volume Sales
0 2020-02-02 1.0 2020-02 10.0 4.0
0 2020-02-09 1.0 2020-02 10.0 4.0
0 2020-02-16 1.0 2020-02 10.0 4.0
0 2020-02-23 1.0 2020-02 10.0 4.0
1 2020-03-01 1.0 2020-03 8.0 6.0
1 2020-03-08 1.0 2020-03 8.0 6.0
1 2020-03-15 1.0 2020-03 8.0 6.0
1 2020-03-22 1.0 2020-03 8.0 6.0
1 2020-03-29 1.0 2020-03 8.0 6.0
2 2020-02-02 2.0 2020-02 6.0 8.0
2 2020-02-09 2.0 2020-02 6.0 8.0
2 2020-02-16 2.0 2020-02 6.0 8.0
2 2020-02-23 2.0 2020-02 6.0 8.0
3 2020-03-01 2.0 2020-03 4.0 10.0
3 2020-03-08 2.0 2020-03 4.0 10.0
3 2020-03-15 2.0 2020-03 4.0 10.0
3 2020-03-22 2.0 2020-03 4.0 10.0
3 2020-03-29 2.0 2020-03 4.0 10.0
Here, we retain the column Date for some use later.
Part 2: Divide Volume and Sales by number of weeks in month
Here, the number of weeks in the month used to divide the Volume and Sales figures should actually be the number of resampled weeks within the month, as shown in the interim result above.
If we used the actual number of weeks, then Feb 2020, being in a leap year, has 29 days and thus actually spans 5 weeks instead of the 4 resampled weeks in the interim result above. That would give inconsistent results, because there are only 4 week entries above while each Volume and Sales figure would be divided by 5.
Let's go to the code then:
We group by columns ID and Date and then divide each value in columns Volume and Sales by group size (i.e. number of resampled weeks).
grouped[['Volume', 'Sales']] = (grouped.groupby(['ID', 'Date'])[['Volume', 'Sales']]
.transform(lambda x: x / x.count()))
or simplified form using /= as follows:
grouped[['Volume', 'Sales']] /= (grouped.groupby(['ID', 'Date'])[['Volume', 'Sales']]
.transform('count'))
Result:
print(grouped)
Weeks ID Date Volume Sales
0 2020-02-02 1.0 2020-02 2.5 1.0
0 2020-02-09 1.0 2020-02 2.5 1.0
0 2020-02-16 1.0 2020-02 2.5 1.0
0 2020-02-23 1.0 2020-02 2.5 1.0
1 2020-03-01 1.0 2020-03 1.6 1.2
1 2020-03-08 1.0 2020-03 1.6 1.2
1 2020-03-15 1.0 2020-03 1.6 1.2
1 2020-03-22 1.0 2020-03 1.6 1.2
1 2020-03-29 1.0 2020-03 1.6 1.2
2 2020-02-02 2.0 2020-02 1.5 2.0
2 2020-02-09 2.0 2020-02 1.5 2.0
2 2020-02-16 2.0 2020-02 1.5 2.0
2 2020-02-23 2.0 2020-02 1.5 2.0
3 2020-03-01 2.0 2020-03 0.8 2.0
3 2020-03-08 2.0 2020-03 0.8 2.0
3 2020-03-15 2.0 2020-03 0.8 2.0
3 2020-03-22 2.0 2020-03 0.8 2.0
3 2020-03-29 2.0 2020-03 0.8 2.0
Optionally, you can do some cosmetic work to drop the column Date and move the column Weeks to your desired position, for example as sketched below.
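For example, a small sketch of that clean-up (assuming the grouped frame above) could be:
# drop the helper Date column and put Weeks last, matching the expected output layout
grouped = (grouped.drop(columns='Date')
                  .loc[:, ['ID', 'Volume', 'Sales', 'Weeks']])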
Edit: (similarities and differences from other questions about resampling from month to week)
In this review, I searched for other questions with similar titles and compared the questions and solutions.
There is another question with a similar requirement to split monthly values equally into weekly values according to the number of weeks in the resampled month. In that question, the months are represented as the first date of each month, in datetime format and used as the dataframe index, while in this question the months are represented as YYYY-MM, which can be of string type.
A big and critical difference is that in that question, the last month period index 2018-05-01 with value 22644 was actually not processed. That is, the month 2018-05 is not resampled into weeks in May 2018, and the value 22644 is never split into weekly proportions. The accepted solution using .asfreq() does not show any entry for 2018-05 at all, and the other solution using .resample() still keeps one (un-resampled) entry for 2018-05 whose value 22644 is not split into weekly proportions.
However, in our question here, the last month listed in each group still needs to be resampled into weeks and values split equally for the resampled weeks.
Looking at the solutions, my new solution makes no call to .resample() or .asfreq(). It just uses pd.date_range() with freq='W', with the help of pd.offsets.MonthEnd(), to generate the weekly dates for a month based on the 'YYYY-MM' values. This is what I could not imagine when I worked on the old solution using .resample().
I need to subtract dates based on the progression of fault count.
Below is the table that has the two input columns Date and Fault_Count. The output columns I need are Option1 and Option2. The last two columns show the date difference calculations. Basically when the Fault_Count changes I need to count the number of days from when the Fault_Count changed to the initial start of fault count. For example the Fault_Count changed to 2 on 1/4/2020, I need to get the number of days from when the Fault_Count started at 0 and changed to 2 (i.e. 1/4/2020 - 1/1/2020 = 3).
Date Fault_Count Option1 Option2 Option1calc Option2calc
1/1/2020 0 0 0
1/2/2020 0 0 0
1/3/2020 0 0 0
1/4/2020 2 3 3 1/4/2020-1/1/2020 1/4/2020-1/1/2020
1/5/2020 2 0 0
1/6/2020 2 0 0
1/7/2020 4 3 3 1/7/2020-1/4/2020 1/7/2020-1/4/2020
1/8/2020 4 0 0
1/9/2020 5 2 2 1/9/2020-1/7/2020 1/9/2020-1/7/2020
1/10/2020 5 0 0
1/11/2020 0 2 -2 1/11/2020-1/9/2020 (1/11/2020-1/9/2020)*-1 as the fault resets
1/12/2020 1 1 1 1/12/2020-1/11/2020 1/12/2020-1/11/2020
Below is the code.
import pandas as pd
d = {'Date': ['1/1/2020', '1/2/2020', '1/3/2020', '1/4/2020', '1/5/2020', '1/6/2020', '1/7/2020', '1/8/2020', '1/9/2020', '1/10/2020', '1/11/2020', '1/12/2020'], 'Fault_Count' : [0, 0, 0, 2, 2, 2, 4, 4, 5, 5, 0, 1]}
df = pd.DataFrame(d)
df['Date'] = pd.to_datetime(df['Date'])
df['Fault_count_diff'] = df.Fault_Count.diff().fillna(0)
df['Cumlative_Sum'] = df.Fault_count_diff.cumsum()
I thought I could use cumulative sum and group by to get the groups and get the differences of the first value of groups. That's as far as I could get, also I noticed that using cumulative sum was not giving me ordered groups as some of the Fault_Count get reset.
Date Fault_Count Fault_count_diff Cumlative_Sum
0 2020-01-01 0 0.0 0.0
1 2020-01-02 0 0.0 0.0
2 2020-01-03 0 0.0 0.0
3 2020-01-04 2 2.0 2.0
4 2020-01-05 2 0.0 2.0
5 2020-01-06 2 0.0 2.0
6 2020-01-07 4 2.0 4.0
7 2020-01-08 4 0.0 4.0
8 2020-01-09 5 1.0 5.0
9 2020-01-10 5 0.0 5.0
10 2020-01-11 0 -5.0 0.0
11 2020-01-12 1 1.0 1.0
Desired output:
Date Fault_Count Option1 Option2
0 2020-01-01 0 0.0 0.0
1 2020-01-02 0 0.0 0.0
2 2020-01-03 0 0.0 0.0
3 2020-01-04 2 3.0 3.0
4 2020-01-05 2 0.0 0.0
5 2020-01-06 2 0.0 0.0
6 2020-01-07 4 3.0 3.0
7 2020-01-08 4 0.0 0.0
8 2020-01-09 5 2.0 2.0
9 2020-01-10 5 0.0 0.0
10 2020-01-11 0 2.0 -2.0
11 2020-01-12 1 1.0 1.0
Thanks for the help.
Use:
m1 = df['Fault_Count'].ne(df['Fault_Count'].shift(fill_value=0))
m2 = df['Fault_Count'].eq(0) & df['Fault_Count'].shift(fill_value=0).ne(0)
s = df['Date'].groupby(m1.cumsum()).transform('first')
df['Option1'] = df['Date'].sub(s.shift()).dt.days.where(m1, 0)
df['Option2'] = df['Option1'].where(~m2, df['Option1'].mul(-1))
Details:
Use Series.ne + Series.shift to create boolean mask m1, which represents the boundary condition where Fault_Count changes; similarly, use Series.eq + Series.shift and Series.ne to create boolean mask m2, which represents the condition where Fault_Count resets:
m1 m2
0 False False
1 False False
2 False False
3 True False
4 False False
5 False False
6 True False
7 False False
8 True False
9 False False
10 True True # --> Fault count reset
11 True False
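(For reference, that side-by-side view of the two masks can be reproduced with something like the line below; the keys are only display labels.)
print(pd.concat([m1, m2], axis=1, keys=['m1', 'm2']))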
Use Series.groupby on consecutive fault counts obtained using m1.cumsum and transform the Date column using groupby.first:
print(s)
0 2020-01-01
1 2020-01-01
2 2020-01-01
3 2020-01-04
4 2020-01-04
5 2020-01-04
6 2020-01-07
7 2020-01-07
8 2020-01-09
9 2020-01-09
10 2020-01-11
11 2020-01-12
Name: Date, dtype: datetime64[ns]
Use Series.sub to subtract the shifted s (Series.shift) from Date, and use Series.where to keep the value only where m1 is True (0 elsewhere); assign this to Option1. Similarly, we obtain Option2 from Option1 based on mask m2:
print(df)
Date Fault_Count Option1 Option2
0 2020-01-01 0 0.0 0.0
1 2020-01-02 0 0.0 0.0
2 2020-01-03 0 0.0 0.0
3 2020-01-04 2 3.0 3.0
4 2020-01-05 2 0.0 0.0
5 2020-01-06 2 0.0 0.0
6 2020-01-07 4 3.0 3.0
7 2020-01-08 4 0.0 0.0
8 2020-01-09 5 2.0 2.0
9 2020-01-10 5 0.0 0.0
10 2020-01-11 0 2.0 -2.0
11 2020-01-12 1 1.0 1.0
Instead of df['Fault_count_diff'] = ... and the next line, do:
df['cycle'] = (df.Fault_Count.diff() < 0).cumsum()
Then, to get the number of days between each count change:
Option1. If all calendar dates are present in df:
ndays = df.groupby(['cycle', 'Fault_Count']).Date.size()
Option2. If there's the possibility of a date not showing up in df and you still want to get the calendar days between incidents:
ndays = df.groupby(['cycle', 'Fault_Count']).Date.min().diff().dropna()
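For reference, running Option 2 on the sample data above should give something along these lines (my own quick check, not output from the original answer):
print(ndays)
# cycle  Fault_Count
# 0      2              3 days
#        4              3 days
#        5              2 days
# 1      0              2 days
#        1              1 days
# Name: Date, dtype: timedelta64[ns]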
I need to resample timeseries data and interpolate missing values in 15 min intervals over the course of an hour. Each ID should have four rows of data per hour.
In:
ID Time Value
1 1/1/2019 12:17 3
1 1/1/2019 12:44 2
2 1/1/2019 12:02 5
2 1/1/2019 12:28 7
Out:
ID Time Value
1 2019-01-01 12:00:00 3.0
1 2019-01-01 12:15:00 3.0
1 2019-01-01 12:30:00 2.0
1 2019-01-01 12:45:00 2.0
2 2019-01-01 12:00:00 5.0
2 2019-01-01 12:15:00 7.0
2 2019-01-01 12:30:00 7.0
2 2019-01-01 12:45:00 7.0
I wrote a function to do this; however, efficiency goes down drastically when trying to process a larger dataset.
Is there a more efficient way to do this?
import datetime
import pandas as pd
data = pd.DataFrame({'ID': [1,1,2,2],
'Time': ['1/1/2019 12:17','1/1/2019 12:44','1/1/2019 12:02','1/1/2019 12:28'],
'Value': [3,2,5,7]})
def clean_dataset(data):
    ids = data.drop_duplicates(subset='ID')
    data['Time'] = pd.to_datetime(data['Time'])
    data['Time'] = data['Time'].apply(
        lambda dt: datetime.datetime(dt.year, dt.month, dt.day, dt.hour, 15*(dt.minute // 15)))
    data = data.drop_duplicates(subset=['Time','ID']).reset_index(drop=True)
    df = pd.DataFrame(columns=['Time','ID','Value'])
    for i in range(ids.shape[0]):
        times = pd.DataFrame(pd.date_range('1/1/2019 12:00','1/1/2019 13:00',freq='15min'),columns=['Time'])
        id_data = data[data['ID']==ids.iloc[i]['ID']]
        clean_data = times.join(id_data.set_index('Time'), on='Time')
        clean_data = clean_data.interpolate(method='linear', limit_direction='both')
        clean_data.drop(clean_data.tail(1).index,inplace=True)
        df = df.append(clean_data)
    return df
clean_dataset(data)
Linear interpolation does become slow with a large data set. Having a loop in your code is also responsible for a large part of the slowdown. Anything that can be removed from the loop and pre-computed will help increase efficiency. For example, if you pre-define the data frame that you use to initialize times, the code becomes 14% more efficient:
times_template = pd.DataFrame(pd.date_range('1/1/2019 12:00','1/1/2019 13:00',freq='15min'),columns=['Time'])
for i in range(ids.shape[0]):
    times = times_template.copy()
Profiling your code confirms that the interpolation takes the longest amount of time (22.7%), followed by the join (13.1%), the append (7.71%), and then the drop (7.67%) commands.
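For anyone who wants to reproduce that kind of breakdown, a minimal sketch using the standard-library profiler (the percentages above come from the answerer's own run) could be:
import cProfile
import pstats

# profile one call, dump the stats to a file, then show the slowest calls
cProfile.run('clean_dataset(data)', 'clean_dataset.prof')
pstats.Stats('clean_dataset.prof').sort_stats('cumulative').print_stats(15)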
You can use:
#round datetimes by 15 minutes
data['Time'] = pd.to_datetime(data['Time'])
minutes = pd.to_timedelta(15*(data['Time'].dt.minute // 15), unit='min')
data['Time'] = data['Time'].dt.floor('H') + minutes
#change date range for 4 values (to `12:45`)
rng = pd.date_range('1/1/2019 12:00','1/1/2019 12:45',freq='15min')
#create MultiIndex and reindex
mux = pd.MultiIndex.from_product([data['ID'].unique(), rng], names=['ID','Time'])
data = data.set_index(['ID','Time']).reindex(mux).reset_index()
#interpolate per groups
data['Value'] = (data.groupby('ID')['Value']
.apply(lambda x: x.interpolate(method='linear', limit_direction='both')))
print (data)
ID Time Value
0 1 2019-01-01 12:00:00 3.0
1 1 2019-01-01 12:15:00 3.0
2 1 2019-01-01 12:30:00 2.0
3 1 2019-01-01 12:45:00 2.0
4 2 2019-01-01 12:00:00 5.0
5 2 2019-01-01 12:15:00 7.0
6 2 2019-01-01 12:30:00 7.0
7 2 2019-01-01 12:45:00 7.0
If the range cannot be changed:
data['Time'] = pd.to_datetime(data['Time'])
minutes = pd.to_timedelta(15*(data['Time'].dt.minute // 15), unit='min')
data['Time'] = data['Time'].dt.floor('H') + minutes
#end in 13:00
rng = pd.date_range('1/1/2019 12:00','1/1/2019 13:00',freq='15min')
mux = pd.MultiIndex.from_product([data['ID'].unique(), rng], names=['ID','Time'])
data = data.set_index(['ID','Time']).reindex(mux).reset_index()
data['Value'] = (data.groupby('ID')['Value']
.apply(lambda x: x.interpolate(method='linear', limit_direction='both')))
#remove last row per groups
data = data[data['ID'].duplicated(keep='last')]
print (data)
ID Time Value
0 1 2019-01-01 12:00:00 3.0
1 1 2019-01-01 12:15:00 3.0
2 1 2019-01-01 12:30:00 2.0
3 1 2019-01-01 12:45:00 2.0
5 2 2019-01-01 12:00:00 5.0
6 2 2019-01-01 12:15:00 7.0
7 2 2019-01-01 12:30:00 7.0
8 2 2019-01-01 12:45:00 7.0
EDIT:
Another solution with merge and a left join instead of reindex:
from itertools import product
#round datetimes by 15 minutes
data['Time'] = pd.to_datetime(data['Time'])
minutes = pd.to_timedelta(15*(data['Time'].dt.minute // 15), unit='min')
data['Time'] = data['Time'].dt.floor('H') + minutes
#change date range for 4 values (to `12:45`)
rng = pd.date_range('1/1/2019 12:00','1/1/2019 12:45',freq='15min')
#create helper DataFrame and merge with left join
df = pd.DataFrame(list(product(data['ID'].unique(), rng)), columns=['ID','Time'])
print (df)
ID Time
0 1 2019-01-01 12:00:00
1 1 2019-01-01 12:15:00
2 1 2019-01-01 12:30:00
3 1 2019-01-01 12:45:00
4 2 2019-01-01 12:00:00
5 2 2019-01-01 12:15:00
6 2 2019-01-01 12:30:00
7 2 2019-01-01 12:45:00
data = df.merge(data, how='left')
##interpolate per groups
data['Value'] = (data.groupby('ID')['Value']
.apply(lambda x: x.interpolate(method='linear', limit_direction='both')))
print (data)
ID Time Value
0 1 2019-01-01 12:00:00 3.0
1 1 2019-01-01 12:15:00 3.0
2 1 2019-01-01 12:30:00 2.0
3 1 2019-01-01 12:45:00 2.0
4 2 2019-01-01 12:00:00 5.0
5 2 2019-01-01 12:15:00 7.0
6 2 2019-01-01 12:30:00 7.0
7 2 2019-01-01 12:45:00 7.0