Conditionally merging dataframes on time variable - python

I have two dataframes that I want to merge on the date, id and time variables in order to compute a duration.
from numpy import *
from pandas import *
df1 = DataFrame({
'id': ['a']*4,
'date': ['02-02-2015']*4,
'time_1': ['08:00:00', '09:00:00', '10:30:00', '12:45']})
df1
id date time
0 a 02-02-2015 08:00:00
1 a 02-02-2015 09:00:00
2 a 02-02-2015 10:30:00
3 a 02-02-2015 12:45:00
-------------------------------------------------------------------------------------------------
df2 = DataFrame({
'id': ['a']*7,
'date': ['02-02-2015']*7,
'time_2': ['08:00:00', '08:09:00', '08:04:01','08:52:36', '09:34:25', '10:30:00', '11:23:38']})
df2
id date time
0 a 02-02-2015 08:00:00
1 a 02-02-2015 08:09:00
2 a 02-02-2015 08:04:01
3 a 02-02-2015 08:52:36
4 a 02-02-2015 09:34:25
5 a 02-02-2015 10:30:00
6 a 02-02-2015 11:23:38
The rule that I want my merge to follow is that each row in df2 needs to go with the closest previous time in df1.
The intermediate result would be
intermediateResult = DataFrame({
'id': ['a']*8,
'date': ['02-02-2015']*8,
'time_1': ['08:00:00', '08:00:00', '08:00:00','08:00:00', '09:00:00', '10:30:00', '10:30:00', '12:45'],
'time_2': ['08:00:00', '08:09:00', '08:04:01','08:52:36', '09:34:25', '10:30:00', '11:23:38', nan] })
intermediateResult
id date time_1 time_2
0 a 02-02-2015 08:00:00 08:00:00
1 a 02-02-2015 08:00:00 08:09:00
2 a 02-02-2015 08:00:00 08:04:01
3 a 02-02-2015 08:00:00 08:52:36 # end
4 a 02-02-2015 09:00:00 09:34:25 # end
5 a 02-02-2015 10:30:00 10:30:00
6 a 02-02-2015 10:30:00 11:23:38 # end
7 a 02-02-2015 12:45 NaN
Finally, I want to get the time difference between the latest time_2 of each period (indicated with the comment # end) and its corresponding time_1.
The final result would look like this
finalResult = DataFrame({
'id': ['a']*4,
'date': ['02-02-2015']*4,
'Duration': ['00:52:36', '00:34:25', '00:53:38', nan]})
finalResult
id date Duration
0 a 02-02-2015 00:52:36
1 a 02-02-2015 00:34:25
2 a 02-02-2015 00:53:38
3 a 02-02-2015 NaN

Using different merge methods, I came to the same answer. Eventually I used merge_asof with direction='backward', as per your request. Unfortunately the result is not identical to yours, in the sense that I have no NaN. Happy to help further if you give more information on how you end up with NaN in one row.
# Worked answer: pair every df2 row with the closest earlier df1 time, then keep one row per hour.
# NOTE(review): this snippet uses the `pd` alias, so it needs `import pandas as pd`;
# `from pandas import *` alone does not bind `pd`. It also expects df1 and df2 to exist already.
# Join date to time and coerce to datetime
df1['datetime']=pd.to_datetime(df1.date.str.cat(df1.time_1,sep=' '))
df2['datetime']=pd.to_datetime(df2.date.str.cat(df2.time_2,sep=' '))
# Keep only the last five characters (MM:SS) of time_2; this string is what ends up in the
# final 'Duration(min)' column. NOTE(review): it is the raw minutes:seconds of time_2, not a computed difference.
df2['time_2'] = df2['time_2'].apply(lambda x: (x[-5:]))
# Sort both frames on the merge key, as merge_asof requires
df1=df1.sort_values('datetime')
df2=df2.sort_values('datetime')
# For each df2 row take the latest df1 row with the same id whose datetime is not later
# (exact matches allowed) and at most 2 hours earlier; dropna() then removes rows with any missing value.
# Both frames have a 'date' column, so the merge output names them date_x (from df2) and date_y (from df1).
df3=pd.merge_asof(df2, df1,on='datetime', by='id', tolerance=pd.Timedelta('2H'),allow_exact_matches=True,direction='backward').dropna()
# Alternative merge on the hour of day:
#df3=df2.merge(df1, left_on=df2.datetime.dt.hour, right_on=df1.datetime.dt.hour, how='left').drop(columns=['key_0', 'id_y', 'date_y']).fillna(0)
# Use datetime as the index so hour and minute can be read from it
df3.set_index('datetime', inplace=True)
# Minute of each row; the row with the highest minute in each hour is the one kept below
df3['minutes']=df3.index.minute
# Group by (hour, date_x), pick the index label of the max-minute row in each group,
# select those rows and drop the helper columns
finalResult=df3.loc[df3.groupby([df3.index.hour, df3.date_x])['minutes'].idxmax()].reset_index().drop(columns=['datetime','time_1','date_y','minutes'])
# The three remaining columns are renamed positionally
finalResult.columns=['id','date','Duration(min)']
finalResult

Using the solution suggested by #wwnde, I've found one that scales better to my real data set:
import numpy as np
import pandas as pd
# Sample inputs. The constructor is reached through the `pd` alias because this
# snippet imports pandas as `import pandas as pd`; a bare `DataFrame` name would be unbound.
df1 = pd.DataFrame({
    'id': ['a'] * 4,
    'date': ['02-02-2015'] * 4,
    'time_1': ['08:00:00', '09:00:00', '10:30:00', '12:45:00'],
})
df2 = pd.DataFrame({
    'id': ['a'] * 7,
    'date': ['02-02-2015',
             '02-02-2015',
             '03-02-2015',  # small change here relative to the df in my first post
             '02-02-2015',
             '02-02-2015',
             '02-02-2015',
             '02-02-2015'],
    'time_2': ['08:00:00', '08:09:00', '08:04:01', '08:52:36', '09:34:25', '10:30:00', '11:23:38'],
})
----------------------------------------------------
def preproDf(df1, df2, time_1, time_2, _id, date):
    '''
    Preprocess the dataframes, in place, for the following operations.

    The time columns are converted to datetime and both frames are sorted
    by (id, date, time). All changes are applied to the objects passed in,
    because the function returns nothing.

    df1: pd.DataFrame, left dataframe
    df2: pd.DataFrame, right dataframe
    time_1:str, name of the time column of the left dataframe
    time_2:str, name of the time column of the right dataframe
    _id:str, name of the id variable. Should be the same for both dataframes
    date:str, name of the date variable. Should be the same for both dataframes
    return: None
    '''
    df2[time_2] = df2[time_2].apply(pd.to_datetime)
    df1[time_1] = df1[time_1].apply(pd.to_datetime)
    # Sort to allow merge_asof. This must be done in place: assigning the result
    # of sort_values to the local names would leave the caller's frames unsorted.
    df1.sort_values([_id, date, time_1], inplace=True)
    df2.sort_values([_id, date, time_2], inplace=True)
def processDF(df1, df2, time_1, time_2, _id, date):
    '''
    For every row of df1, compute the longest delay to a matching row of df2.

    Each df2 row is paired with the latest df1 row that has the same id and
    date and whose time is not later than the df2 time (exact matches allowed)
    and at most two hours earlier. The difference time_2 - time_1 is then
    reduced to its maximum per (id, date, time_1).

    df1: pd.DataFrame, left dataframe; its time column must already be datetime
    df2: pd.DataFrame, right dataframe; its time column must already be datetime
    time_1:str, name of the time column of df1
    time_2:str, name of the time column of df2
    _id:str, name of the id variable. Should be the same for both dataframes
    date:str, name of the date variable. Should be the same for both dataframes
    return: pd.DataFrame with columns [_id, date, time_1, 'timeDifference'],
            one row per df1 row that received at least one match
    '''
    # merge_asof needs both inputs ordered on their time key. Sorting copies here
    # keeps the function independent of the caller's row order and leaves the
    # caller's frames untouched.
    left = df2.sort_values(time_2)
    right = df1.sort_values(time_1)
    # `by` already restricts matches to identical (id, date) pairs, so a single
    # call covers every group and the datetime dtypes are preserved.
    rslt = pd.merge_asof(
        left,
        right,
        left_on=time_2,
        right_on=time_1,
        by=[_id, date],
        tolerance=pd.Timedelta(hours=2),
        allow_exact_matches=True,
        direction='backward',
    )
    # Creating timeDifference variable
    rslt['timeDifference'] = rslt[time_2] - rslt[time_1]
    # Getting the actual result; unmatched df2 rows have a missing time_1 and
    # are dropped by groupby.
    rslt = rslt.groupby([_id, date, time_1])['timeDifference'].max()
    rslt = pd.DataFrame(rslt).reset_index()
    # The time column deliberately keeps the name passed as `time_1`.
    return rslt
The result:
preproDf(df1, df2, 'time_1', 'time_2', 'id', 'date')
processDF(df1, df2, 'time_1', 'time_2', 'id', 'date')
id date time_1 timeDifference
0 a 02-02-2015 2020-05-29 08:00:00 00:52:36
1 a 02-02-2015 2020-05-29 09:00:00 00:34:25
2 a 02-02-2015 2020-05-29 10:30:00 00:53:38

Related

Pandas change time values based on condition

I have a dataframe:
data = {'time':['08:45:00', '09:30:00', '18:00:00', '15:00:00']}
df = pd.DataFrame(data)
I would like to convert the time based on conditions: if the hour is less than 9, I want to set it to 9 and if the hour is more than 17, I need to set it to 17.
I tried this approach:
df['time'] = np.where(((df['time'].dt.hour < 9) & (df['time'].dt.hour != 0)), dt.time(9, 00))
I am getting an error: Can only use .dt accessor with datetimelike values.
Can anyone please help me with this? Thanks.
Here's a way to do what your question asks:
df.time = pd.to_datetime(df.time)
df.loc[df.time.dt.hour < 9, 'time'] = (df.time.astype('int64') + (9 - df.time.dt.hour)*3600*1000000000).astype('datetime64[ns]')
df.loc[df.time.dt.hour > 17, 'time'] = (df.time.astype('int64') + (17 - df.time.dt.hour)*3600*1000000000).astype('datetime64[ns]')
Input:
time
0 2022-06-06 08:45:00
1 2022-06-06 09:30:00
2 2022-06-06 18:00:00
3 2022-06-06 15:00:00
Output:
time
0 2022-06-06 09:45:00
1 2022-06-06 09:30:00
2 2022-06-06 17:00:00
3 2022-06-06 15:00:00
UPDATE:
Here's alternative code to try to address OP's error as described in the comments:
import pandas as pd
import datetime
data = {'time':['08:45:00', '09:30:00', '18:00:00', '15:00:00']}
df = pd.DataFrame(data)
print('', 'df loaded as strings:', df, sep='\n')
df.time = pd.to_datetime(df.time, format='%H:%M:%S')
print('', 'df converted to datetime by pd.to_datetime():', df, sep='\n')
df.loc[df.time.dt.hour < 9, 'time'] = (df.time.astype('int64') + (9 - df.time.dt.hour)*3600*1000000000).astype('datetime64[ns]')
df.loc[df.time.dt.hour > 17, 'time'] = (df.time.astype('int64') + (17 - df.time.dt.hour)*3600*1000000000).astype('datetime64[ns]')
df.time = [time.time() for time in pd.to_datetime(df.time)]
print('', 'df with time column adjusted to have hour between 9 and 17, converted to type "time":', df, sep='\n')
Output:
df loaded as strings:
time
0 08:45:00
1 09:30:00
2 18:00:00
3 15:00:00
df converted to datetime by pd.to_datetime():
time
0 1900-01-01 08:45:00
1 1900-01-01 09:30:00
2 1900-01-01 18:00:00
3 1900-01-01 15:00:00
df with time column adjusted to have hour between 9 and 17, converted to type "time":
time
0 09:45:00
1 09:30:00
2 17:00:00
3 15:00:00
UPDATE #2:
To not just change the hour for out-of-window times, but to simply apply 9:00 and 17:00 as min and max times, respectively (see OP's comment on this), you can do this:
df.loc[df['time'].dt.hour < 9, 'time'] = pd.to_datetime(pd.DataFrame({
'year':df['time'].dt.year, 'month':df['time'].dt.month, 'day':df['time'].dt.day,
'hour':[9]*len(df.index)}))
df.loc[df['time'].dt.hour > 17, 'time'] = pd.to_datetime(pd.DataFrame({
'year':df['time'].dt.year, 'month':df['time'].dt.month, 'day':df['time'].dt.day,
'hour':[17]*len(df.index)}))
df['time'] = [time.time() for time in pd.to_datetime(df['time'])]
Since your 'time' column contains strings, they can be kept as strings and assigned new string values where appropriate. To filter for your criteria it is convenient to: create a datetime Series from the 'time' column, create boolean Series by comparing the datetime Series with your criteria, and use the boolean Series to select the rows which need to be changed.
Your data:
import numpy as np
import pandas as pd
data = {'time':['08:45:00', '09:30:00', '18:00:00', '15:00:00']}
df = pd.DataFrame(data)
print(df.to_string())
>>>
time
0 08:45:00
1 09:30:00
2 18:00:00
3 15:00:00
Convert to datetime, make boolean Series with your criteria
dts = pd.to_datetime(df['time'])
lt_nine = dts.dt.hour < 9
gt_seventeen = (dts.dt.hour >= 17)
print(lt_nine)
print(gt_seventeen)
>>>
0 True
1 False
2 False
3 False
Name: time, dtype: bool
0 False
1 False
2 True
3 False
Name: time, dtype: bool
Use the boolean series to assign a new value:
df.loc[lt_nine,'time'] = '09:00:00'
df.loc[gt_seventeen,'time'] = '17:00:00'
print(df.to_string())
>>>
time
0 09:00:00
1 09:30:00
2 17:00:00
3 15:00:00
Or just stick with strings altogether and create the boolean Series using regex patterns and .str.match.
data = {'time':['08:45:00', '09:30:00', '18:00:00', '15:00:00','07:22:00','22:02:06']}
dg = pd.DataFrame(data)
print(dg.to_string())
>>>
time
0 08:45:00
1 09:30:00
2 18:00:00
3 15:00:00
4 07:22:00
5 22:02:06
# regex patterns
# The alternatives are wrapped in a non-capturing group so that '^' anchors
# every one of them. Without the group only the first alternative is anchored,
# which gives wrong hits as soon as the pattern is used with a search-style
# method (e.g. str.contains) instead of str.match.
pattern_lt_nine = r'^(?:00|01|02|03|04|05|06|07|08)'
pattern_gt_seventeen = r'^(?:17|18|19|20|21|22|23)'
Make boolean Series and assign new values
gt_seventeen = dg['time'].str.match(pattern_gt_seventeen)
lt_nine = dg['time'].str.match(pattern_lt_nine)
dg.loc[lt_nine,'time'] = '09:00:00'
dg.loc[gt_seventeen,'time'] = '17:00:00'
print(dg.to_string())
>>>
time
0 09:00:00
1 09:30:00
2 17:00:00
3 15:00:00
4 09:00:00
5 17:00:00
Time series / date functionality
Working with text data

Time Series Data Reformat

I am working on some code that will rearrange a time series. Currently I have a standard time series with three columns, whose headers are [Date, Time, Value]. I want to reformat the dataframe so that it is indexed by the date and uses the time as column headers (i.e. 0:00, 1:00, ... , 23:00). The dataframe will be filled in with the values.
Here is the DataFrame I currently have.
Essentially, I'd like to move the index to a single day and show the hours across the columns.
Thanks,
Use pivot:
df = df.pivot(index='Date', columns='Time', values='Total')
Output (first 10 columns and with random values for Total):
>>> df.pivot(index='Date', columns='Time', values='Total').iloc[0:10]
time 00:00:00 01:00:00 02:00:00 03:00:00 04:00:00 05:00:00 06:00:00 07:00:00 08:00:00 09:00:00
date
2019-01-01 0.732494 0.087657 0.930405 0.958965 0.531928 0.891228 0.664634 0.432684 0.009653 0.604878
2019-01-02 0.471386 0.575126 0.509707 0.715290 0.337983 0.618632 0.413530 0.849033 0.725556 0.186876
You could try this.
Split the time part to get only the hour. Add hr to it.
df = pd.DataFrame([['2019-01-01', '00:00:00',-127.57],['2019-01-01', '01:00:00',-137.57],['2019-01-02', '00:00:00',-147.57],], columns=['Date', 'Time', 'Totals'])
df['hours'] = df['Time'].apply(lambda x: 'hr'+ str(int(x.split(':')[0])))
print(pd.pivot_table(df, values ='Totals', index=['Date'], columns = 'hours'))
Output
hours hr0 hr1
Date
2019-01-01 -127.57 -137.57
2019-01-02 -147.57 NaN

Finding the last and first day of months between two specific date using pandas

I have a pandas dataframe with the columns "buying date" and "selling date".
Lets say that the first row in the dataframe can look like:
buying date
selling date
2000-01-04
2000-04-15
And I want to create a new dataframe that starts with the buying date, ends with the selling date and between them i want the first date and last date of every month between the buying date and the selling date. So it would give me something like:
Date
2000-01-04
2000-01-31
2000-02-01
2000-02-29
2000-03-01
2000-03-31
2000-04-01
2000-04-15
Does anyone have a smart solution to this?
Thank you!
Try:
ms = pd.date_range('2000-01-04', '2000-04-15', freq='MS')
me = pd.date_range('2000-01-04', '2000-04-15', freq='M')
df.T.set_index(0).index.union(ms).union(me).to_frame().sort_index()
Output:
0
2000-01-04 2000-01-04
2000-01-31 2000-01-31
2000-02-01 2000-02-01
2000-02-29 2000-02-29
2000-03-01 2000-03-01
2000-03-31 2000-03-31
2000-04-01 2000-04-01
2000-04-15 2000-04-15
from datetime import date
import pandas as pd
import calendar
def _month_boundaries(start_date, end_date):
    """Return start_date, the month first/last days strictly between the two dates, then end_date."""
    dates = [start_date]
    year, month = start_date.year, start_date.month
    # Walk month by month using (year, month) pairs so ranges that cross a
    # year boundary are handled as well.
    while (year, month) <= (end_date.year, end_date.month):
        first_day_of_month = date(year, month, 1)
        last_day_of_month = date(year, month, calendar.monthrange(year, month)[1])
        for boundary in (first_day_of_month, last_day_of_month):
            # Skip boundaries outside the range (e.g. the 1st of the starting
            # month) and those equal to an endpoint, which is listed already.
            if start_date < boundary < end_date:
                dates.append(boundary)
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1
    dates.append(end_date)
    return dates


def main():
    """Print a dataframe with the start date, the month boundaries in between, and the end date."""
    start_date = date(2021, 1, 4)
    end_date = date(2021, 4, 15)
    dates = _month_boundaries(start_date, end_date)
    df = pd.DataFrame({'dates': dates})
    print(df)


if __name__ == "__main__":
    main()
I don't know how smart this is but it should work.
starting with the beginning date create a date range using the MonthBegin DateOffset
starting with the beginning date create a date range using the MonthEnd DateOffset
interleave them
import pandas as pd
import numpy as np
df = pd.DataFrame({'buying date':pd.to_datetime(['2000-01-04']),
'selling date':pd.to_datetime(['2000-04-15'])})
start,end = df.iloc[0]
lasts = pd.date_range(start,end,freq='M')
firsts = pd.date_range(start,end,freq='MS')
middle = np.vstack((lasts,firsts)).ravel('F')
start = np.array(start,dtype='datetime64[ns]')
end = np.array(end,dtype='datetime64[ns]')
s = pd.Series(np.hstack([start,middle,end]))
>>> print(s)
0 2000-01-04
1 2000-01-31
2 2000-02-01
3 2000-02-29
4 2000-03-01
5 2000-03-31
6 2000-04-01
7 2000-04-15
dtype: datetime64[ns]
>>>

Create a list of years with pandas

I have a dataframe with a column of dates of the form
2004-01-01
2005-01-01
2006-01-01
2007-01-01
2008-01-01
2009-01-01
2010-01-01
2011-01-01
2012-01-01
2013-01-01
2014-01-01
2015-01-01
2016-01-01
2017-01-01
2018-01-01
2019-01-01
Given an integer number k, let's say k=5, I would like to generate an array of the next k years after the maximum date of the column. The output should look like:
2020-01-01
2021-01-01
2022-01-01
2023-01-01
2024-01-01
Let's use pd.to_datetime + max to compute the largest date in the column date then use pd.date_range to generate the dates based on the offset frequency one year and having the number of periods equals to k=5:
strt, offs = pd.to_datetime(df['date']).max(), pd.DateOffset(years=1)
dates = pd.date_range(strt + offs, freq=offs, periods=k).strftime('%Y-%m-%d').tolist()
print(dates)
['2020-01-01', '2021-01-01', '2022-01-01', '2023-01-01', '2024-01-01']
Here you go:
import pandas as pd

# this is your k
k = 5

# Creating a test DF
array = {'dt': ['2018-01-01', '2019-01-01']}
df = pd.DataFrame(array)

# Extracting column of year
df['year'] = pd.DatetimeIndex(df['dt']).year
year1 = df['year'].max()

# Creating a new DF holding the k years that follow the latest one.
# All rows are built in one go: DataFrame.append no longer exists in
# pandas 2.0+, and growing a frame row by row copies it on every step.
years_df = pd.DataFrame(
    {'dates': [str(year1 + i) + '-01-01' for i in range(1, k + 1)]}
)
years_df
The output:
dates
2020-01-01
2021-01-01
2022-01-01
2023-01-01
2024-01-01

Extend datetimeindex to previous times in pandas

MRE:
idx = pd.date_range('2015-07-03 08:00:00', periods=30,
freq='H')
data = np.random.randint(1, 100, size=len(idx))
df = pd.DataFrame({'index':idx, 'col':data})
df.set_index("index", inplace=True)
which looks like:
col
index
2015-07-03 08:00:00 96
2015-07-03 09:00:00 79
2015-07-03 10:00:00 15
2015-07-03 11:00:00 2
2015-07-03 12:00:00 84
2015-07-03 13:00:00 86
2015-07-03 14:00:00 5
.
.
.
Note that the dataframe contains multiple days. Since the frequency is hourly, it contains hourly data starting from 07/03 08:00:00.
I want to get all data from 05:00:00 including day 07/03 even if it will contain value 0 in "col" column.
I want to extend it backwards so it starts from 05:00:00.
No I just can't start from 05:00:00 since I already have dataframe that starts from 08:00:00. I am trying to keep everything same but add 3 rows in the beginning to include 05:00:00, 06:00:00, and 07:00:00
The reindex method is handy for changing the index values:
idx = pd.date_range('2015-07-03 08:00:00', periods=30, freq='h')
data = np.random.randint(1, 100, size=len(idx))
# use the index param to set index or you might lose the freq
df = pd.DataFrame({'col': data}, index=idx)
# reindex with a new index
# Shift the index itself back by three periods of its own frequency to find
# the new first timestamp (DataFrame.tshift is no longer available in pandas 2.0+).
start = df.index.shift(-3)[0]
end = df.index[-1]
new_index = pd.date_range(start, end, freq='h')
# Rows that did not exist before (05:00-07:00) are filled with NaN
new_df = df.reindex(new_index)
resample is also very useful for date indices
Just change the time from 08:00:00 to 05:00:00 in your code and create 3 more rows and update this dataframe to the existing one.
# Build the three missing hourly rows (05:00, 06:00, 07:00) ...
idx1 = pd.date_range('2015-07-03 05:00:00', periods=3, freq='h')
df1 = pd.DataFrame({'index': idx1, 'col': np.random.randint(1, 100, size=3)})
df1.set_index('index', inplace=True)
# ... and put them in front of the existing frame. pd.concat is used because
# DataFrame.append is no longer available in pandas 2.0+.
df = pd.concat([df1, df])
print(df)
Add this snippet to your code...

Categories