Pandas: Reduce overlapping date ranges to one line (per group) - python

For every customer_id I have several start dates and end dates.
When a customer has several overlapping date ranges, I would like to collapse them into one line with the minimum start date and the maximum end date of the overlapping ranges.
Here's my example data frame:
customer_id start_date end_date
1 2019-01-01 2019-03-01
1 2020-01-02 2020-03-01
1 2020-01-03 2020-05-04
1 2020-01-05 2020-06-01
1 2020-01-07 2020-02-02
1 2020-09-03 2020-09-05
1 2020-09-04 2020-09-04
1 2020-10-01 NaT
2 2020-05-01 2020-05-03
This is what the end result should look like:
customer_id start_date end_date
1 2019-01-01 2019-03-01
1 2020-01-02 2020-06-01
1 2020-09-03 2020-09-05
1 2020-10-01 NaT
2 2020-05-01 2020-05-03
I've tried the following already, but that didn't really work out:
Find date range overlap in python
Here's sample code that generated these examples:
import pandas as pd
df = pd.DataFrame(data=[
    [1, '2019-01-01', '2019-03-01'],
    [1, '2020-01-03', '2020-05-04'],
    [1, '2020-01-05', '2020-06-01'],
    [1, '2020-01-02', '2020-03-01'],
    [1, '2020-01-07', '2020-02-02'],
    [1, '2020-09-03', '2020-09-05'],
    [1, '2020-09-04', '2020-09-04'],
    [1, '2020-10-01', None],
    [2, '2020-05-01', '2020-05-03']],
    columns=['customer_id', 'start_date', 'end_date'],
)
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
df = df.sort_values(by=['customer_id', 'start_date', 'end_date'])
expected_df = pd.DataFrame(data=[
    [1, '2019-01-01', '2019-03-01'],
    [1, '2020-01-02', '2020-06-01'],
    [1, '2020-09-03', '2020-09-05'],
    [1, '2020-10-01', None],
    [2, '2020-05-01', '2020-05-03']],
    columns=['customer_id', 'start_date', 'end_date'],
)
expected_df['start_date'] = pd.to_datetime(expected_df['start_date'])
expected_df['end_date'] = pd.to_datetime(expected_df['end_date'])
expected_df = expected_df.sort_values(by=['customer_id', 'start_date', 'end_date'])

Henry Ecker pointed me in the right direction by suggesting I treat this problem as a graph:
Pandas combining rows based on dates
The code only needed a very small bit of rewriting to get the right answer:
from scipy.sparse.csgraph import connected_components

def reductionFunction(data):
    # create a 2D graph of connectivity between date ranges
    start = data.start_date.values
    end = data.end_date.values
    graph = (start <= end[:, None]) & (end >= start[:, None])

    # find connected components in this graph
    n_components, indices = connected_components(graph)

    # group the results by these connected components
    return data.groupby(indices).aggregate({'start_date': 'min',
                                            'end_date': 'max'})

df.groupby(['customer_id']).apply(reductionFunction).reset_index('customer_id')
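For reference, on the sample frame above this should reproduce the expected result; after reset_index the row labels are the connected-component numbers, so I drop them for readability:
result = (df.groupby(['customer_id'])
            .apply(reductionFunction)
            .reset_index('customer_id')
            .reset_index(drop=True))
print(result)
#    customer_id start_date   end_date
# 0            1 2019-01-01 2019-03-01
# 1            1 2020-01-02 2020-06-01
# 2            1 2020-09-03 2020-09-05
# 3            1 2020-10-01        NaT
# 4            2 2020-05-01 2020-05-03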

Related

Converting timestamps in large dataset to multiple timezones

I have a large dataset with ~9 million rows and 4 columns, one of which is a UTC timestamp. The data has been recorded from 507 sites across Australia, and there is a site ID column. I have another dataset that holds the timezone for each site ID in the format 'Australia/Brisbane'. I've written a function to create a new column in the main dataset that is the UTC timestamp converted to local time. However, the wrong new time is being matched up with the UTC timestamp, for example 2019-01-05 12:10:00+00:00 and 2019-01-13 18:55:00+11:00 (wrong timezone). I believe that sites are not mixed up in the data, but I've tried to sort the data in case that was the problem. Below is my code and images of the first row of each dataset, any help is much appreciated!
import pytz
from dateutil import tz

def update_timezone(df):
    newtimes = []
    df = df.sort_values('site_id')
    sites = df['site_id'].unique().tolist()
    for site in sites:
        timezone = solarbom.loc[solarbom['site_id'] == site].iloc[0, 1]
        dfsub = df[df['site_id'] == site].copy()
        dfsub['utc_timestamp'] = dfsub['utc_timestamp'].dt.tz_convert(timezone)
        newtimes.extend(dfsub['utc_timestamp'].tolist())
    df['newtimes'] = newtimes
Main large dataset
Site info dataset
IIUC, you're looking to group your data by ID, then convert the timestamp specific to each ID. You could achieve this by using groupby, then applying a converter function to each group. Ex:
import pandas as pd

# dummy data:
df = pd.DataFrame({'utc_timestamp': [pd.Timestamp("2022-01-01 00:00 Z"),
                                     pd.Timestamp("2022-01-01 01:00 Z"),
                                     pd.Timestamp("2022-01-05 00:00 Z"),
                                     pd.Timestamp("2022-01-03 00:00 Z"),
                                     pd.Timestamp("2022-01-03 01:00 Z"),
                                     pd.Timestamp("2022-01-03 02:00 Z")],
                   'site_id': [1, 1, 5, 3, 3, 3],
                   'values': [11, 11, 55, 33, 33, 33]})

# time zone info for each ID:
timezdf = pd.DataFrame({'site_id': [1, 3, 5],
                        'timezone_id_x': ["Australia/Adelaide", "Australia/Perth", "Australia/Darwin"]})

### what we want:
# for row, data in timezdf.iterrows():
#     print(f"ID: {data['site_id']}, tz: {data['timezone_id_x']}")
#     print(pd.Timestamp("2022-01-01 00:00 Z"), "to", pd.Timestamp("2022-01-01 00:00 Z").tz_convert(data['timezone_id_x']))
# ID: 1, tz: Australia/Adelaide
# 2022-01-01 00:00:00+00:00 to 2022-01-01 10:30:00+10:30
# ID: 3, tz: Australia/Perth
# 2022-01-01 00:00:00+00:00 to 2022-01-01 08:00:00+08:00
# ID: 5, tz: Australia/Darwin
# 2022-01-01 00:00:00+00:00 to 2022-01-01 09:30:00+09:30
###

def converter(group, timezdf):
    # get the time zone by looking for the current group ID in timezdf
    z = timezdf.loc[timezdf["site_id"] == group["site_id"].iloc[0], 'timezone_id_x'].iloc[0]
    group["localtime"] = group["localtime"].dt.tz_convert(z)
    return group

df["localtime"] = df["utc_timestamp"]
df = df.groupby("site_id").apply(lambda g: converter(g, timezdf))
Now df looks like:
df
Out[71]:
utc_timestamp site_id values localtime
0 2022-01-01 00:00:00+00:00 1 11 2022-01-01 10:30:00+10:30
1 2022-01-01 01:00:00+00:00 1 11 2022-01-01 11:30:00+10:30
2 2022-01-05 00:00:00+00:00 5 55 2022-01-05 09:30:00+09:30
3 2022-01-03 00:00:00+00:00 3 33 2022-01-03 08:00:00+08:00
4 2022-01-03 01:00:00+00:00 3 33 2022-01-03 09:00:00+08:00
5 2022-01-03 02:00:00+00:00 3 33 2022-01-03 10:00:00+08:00
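If the lookup inside converter feels heavy for ~507 sites, the same idea also works with a plain site_id → timezone dictionary and an explicit loop over the groups. A minimal sketch, assuming the dummy df and timezdf frames defined above (before the apply):
# build the site_id -> timezone lookup once
tz_map = timezdf.set_index('site_id')['timezone_id_x'].to_dict()

parts = []
for site, grp in df.groupby('site_id'):
    grp = grp.copy()
    # convert this site's UTC timestamps to its local zone
    grp['localtime'] = grp['utc_timestamp'].dt.tz_convert(tz_map[site])
    parts.append(grp)

df_local = pd.concat(parts).sort_index()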

Forward rolling 365 day groupby sum - irregular intervals

I have a pandas dataframe consisting of transactional data that looks like the below:
Customer_ID  Day         Sales
1            2018-08-01  80.11
2            2019-01-07  10.15
2            2021-02-21  74.15
1            2019-06-18  10.00
3            2020-03-17  15.15
2            2020-04-29  80.98
4            2016-06-01  133.54
3            2022-01-14  17.15
2            2021-02-28  25.12
1            2021-01-02  1.22
I need to calculate the forward rolling 365 day sum of sales grouped by the customer, exclusive of the current day. I would like to insert the result as a new column.
e.g. for customer_id == 1 for the first row, the value to be inserted in the new column will be the sum of sales between 2018-08-02 and 2019-08-01 for customer_id == 1.
I'm sure there's a way to do this using pandas, but I can't figure it out.
Code to produce the dataframe:
df = pd.DataFrame({
    'Customer_ID': [1, 2, 2, 1, 3, 2, 4, 3, 2, 1],
    'Day': ['2018-01-01', '2019-01-07', '2021-02-21', '2019-06-17', '2020-03-17', '2020-04-29', '2016-06-01', '2022-01-14', '2021-02-28', '2021-01-02'],
    'Sales': [80.11, 10.15, 74.15, 10.00, 15.15, 80.98, 133.54, 17.15, 25.12, 1.22]
})
df.Day = pd.to_datetime(df.Day)
You first need to group by the Customer_ID column, then perform a rolling sum on each group after setting the Day column as each group's index:
(df.groupby('Customer_ID')
   .apply(lambda gr: gr.set_index('Day').sort_index()['Sales'].rolling('365D').sum())
   .reset_index())
There is probably a better way to do this, but this is the simplest one for me.
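Note that rolling('365D') above looks backward and includes the current row, while the question asks for a forward window that excludes the current day. A minimal, non-vectorised sketch of that forward version (the helper name is hypothetical, and it assumes per-customer groups are small enough for a row-wise scan):
import pandas as pd

def forward_365_sum(group):
    # for each row: sum of Sales strictly after that day, within the next 365 days
    sums = []
    for day in group['Day']:
        mask = (group['Day'] > day) & (group['Day'] <= day + pd.Timedelta(days=365))
        sums.append(group.loc[mask, 'Sales'].sum())
    return pd.Series(sums, index=group.index)

df['fwd_365_sales'] = (df.groupby('Customer_ID', group_keys=False)
                         .apply(forward_365_sum))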

Create a list of years with pandas

I have a dataframe with a column of dates of the form
2004-01-01
2005-01-01
2006-01-01
2007-01-01
2008-01-01
2009-01-01
2010-01-01
2011-01-01
2012-01-01
2013-01-01
2014-01-01
2015-01-01
2016-01-01
2017-01-01
2018-01-01
2019-01-01
Given an integer number k, let's say k=5, I would like to generate an array of the next k years after the maximum date of the column. The output should look like:
2020-01-01
2021-01-01
2022-01-01
2023-01-01
2024-01-01
Use pd.to_datetime + max to compute the largest date in the date column, then use pd.date_range to generate the dates with a one-year offset frequency and the number of periods equal to k=5:
strt, offs = pd.to_datetime(df['date']).max(), pd.DateOffset(years=1)
dates = pd.date_range(strt + offs, freq=offs, periods=k).strftime('%Y-%m-%d').tolist()
print(dates)
['2020-01-01', '2021-01-01', '2022-01-01', '2023-01-01', '2024-01-01']
Here you go:
import pandas as pd

# this is your k
k = 5

# Creating a test DF
array = {'dt': ['2018-01-01', '2019-01-01']}
df = pd.DataFrame(array)

# Extracting column of year
df['year'] = pd.DatetimeIndex(df['dt']).year
year1 = df['year'].max()

# creating a new DF and populating it with k years
# (DataFrame.append was removed in pandas 2.0, so pd.concat is used instead)
years_df = pd.DataFrame()
for i in range(1, k + 1):
    row = {'dates': [str(year1 + i) + '-01-01']}
    years_df = pd.concat([years_df, pd.DataFrame(row)], ignore_index=True)
years_df
The output:
dates
2020-01-01
2021-01-01
2022-01-01
2023-01-01
2024-01-01

Pandas/Python: Find missing values in time series, insert a new time stamp and a nan value for missing values [duplicate]

This question already has an answer here:
Pandas asfreq with weekly frequency
(1 answer)
Closed 2 years ago.
I create the following DataFrame:
import pandas as pd
d = {'T': [1, 2, 4, 15], 'H': [3, 4, 6, 8]}
df = pd.DataFrame(data=d, index=['10.09.2018 13:15:00','10.09.2018 13:30:00', '10.09.2018 14:00:00', '10.09.2018 22:00:00'])
df.index = pd.to_datetime(df.index)
And get the following result.
Out[30]:
T H
2018-10-09 13:15:00 1 3
2018-10-09 13:30:00 2 4
2018-10-09 14:00:00 4 6
2018-10-09 22:00:00 15 8
As you can see, there is one value missing at 13:45:00 and a lot of values missing between 14:00 and 22:00.
Is there a way to automatically find the missing timestamps and insert a row with that timestamp and NaN values for each one?
I want to achieve this:
Out[30]:
T H
2018-10-09 13:15:00 1 3
2018-10-09 13:30:00 2 4
2018-10-09 13:45:00 nan nan
2018-10-09 14:00:00 4 6
2018-10-09 14:15:00 nan nan
...
2018-10-09 21:45:00 nan nan
2018-10-09 22:00:00 15 8
You can create a second dataframe with the correct time step as index and join it with the original data. The following code worked in my case:
# your code
import pandas as pd
d = {'T': [1, 2, 4, 15], 'H': [3, 4, 6, 8]}
df = pd.DataFrame(data=d, index=['10.09.2018 13:15:00','10.09.2018 13:30:00', '10.09.2018 14:00:00', '10.09.2018 22:00:00'])
df.index = pd.to_datetime(df.index)
# generate second dataframe with needed index
timerange = pd.date_range('10.09.2018 13:15:00', periods=40, freq='15min')
df2 = pd.DataFrame(index=timerange)
# join the original dataframe with the new one
newdf = df.join(df2, how='outer')
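Since the index is already a DatetimeIndex and the target grid is a regular 15-minute frequency, the asfreq mentioned in the duplicate link does the same in one line; a minimal sketch with the df defined above:
# reindex to a regular 15-minute grid between the first and last timestamp;
# timestamps that were missing get NaN in every column
newdf = df.asfreq('15min')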

Conditionally merging dataframes on time variable

I have two dataframes that I want to merge on the date, id and time variables in order to compute a duration.
from numpy import *
from pandas import *
df1 = DataFrame({
    'id': ['a']*4,
    'date': ['02-02-2015']*4,
    'time_1': ['08:00:00', '09:00:00', '10:30:00', '12:45']})
df1
  id        date    time_1
0  a  02-02-2015  08:00:00
1  a  02-02-2015  09:00:00
2  a  02-02-2015  10:30:00
3  a  02-02-2015     12:45
-------------------------------------------------------------------------------------------------
df2 = DataFrame({
    'id': ['a']*7,
    'date': ['02-02-2015']*7,
    'time_2': ['08:00:00', '08:09:00', '08:04:01', '08:52:36', '09:34:25', '10:30:00', '11:23:38']})
df2
  id        date    time_2
0  a  02-02-2015  08:00:00
1  a  02-02-2015  08:09:00
2  a  02-02-2015  08:04:01
3  a  02-02-2015  08:52:36
4  a  02-02-2015  09:34:25
5  a  02-02-2015  10:30:00
6  a  02-02-2015  11:23:38
The rule that I want my merge to follow is that each row in df2 needs to go with the closest previous time in df1.
The intermediate result would be
intermediateResult = DataFrame({
    'id': ['a']*8,
    'date': ['02-02-2015']*8,
    'time_1': ['08:00:00', '08:00:00', '08:00:00', '08:00:00', '09:00:00', '10:30:00', '10:30:00', '12:45'],
    'time_2': ['08:00:00', '08:09:00', '08:04:01', '08:52:36', '09:34:25', '10:30:00', '11:23:38', nan]})
intermediateResult
id date time_1 time_2
0 a 02-02-2015 08:00:00 08:00:00
1 a 02-02-2015 08:00:00 08:09:00
2 a 02-02-2015 08:00:00 08:04:01
3 a 02-02-2015 08:00:00 08:52:36 # end
4 a 02-02-2015 09:00:00 09:34:25 # end
5 a 02-02-2015 10:30:00 10:30:00
6 a 02-02-2015 10:30:00 11:23:38 # end
7 a 02-02-2015 12:45 NaN
Finally, I want to get the time difference between the latest time_2 of each period (indicated with the comment # end) and the corresponding time_1.
The final result would look like this
finalResult = DataFrame({
    'id': ['a']*4,
    'date': ['02-02-2015']*4,
    'Duration': ['00:52:36', '00:34:25', '00:53:38', nan]})
finalResult
id date Duration
0 a 02-02-2015 00:52:36
1 a 02-02-2015 00:34:25
2 a 02-02-2015 00:53:38
3 a 02-02-2015 NaN
Using different merge methods, I came to the same answer. Eventually I used merge_asof with direction='backward' as per your request. Unfortunately my result differs from yours in that I have no NaN row. Happy to help further if you give information on how you end up with a NaN in one row.
# join date to time and coerce to datetime
df1['datetime'] = pd.to_datetime(df1.date.str.cat(df1.time_1, sep=' '))
df2['datetime'] = pd.to_datetime(df2.date.str.cat(df2.time_2, sep=' '))
df2['time_2'] = df2['time_2'].apply(lambda x: x[-5:])  # strip the hours from time_2 (keep mm:ss); I anticipate using it as the duration

# sort to allow merge_asof
df1 = df1.sort_values('datetime')
df2 = df2.sort_values('datetime')

# merge the dataframes, joining on datetime to the nearest hour
df3 = pd.merge_asof(df2, df1, on='datetime', by='id', tolerance=pd.Timedelta('2H'),
                    allow_exact_matches=True, direction='backward').dropna()
# df3 = df2.merge(df1, left_on=df2.datetime.dt.hour, right_on=df1.datetime.dt.hour, how='left').drop(columns=['key_0', 'id_y', 'date_y']).fillna(0)  # alternative merge

df3.set_index('datetime', inplace=True)  # set datetime as index
df3['minutes'] = df3.index.minute  # extract the minute in each row; looks to me like you want the highest minute in each hour

# groupby hour + idxmax selects the index with the highest minutes in each hour, and drops unwanted rows
finalResult = df3.loc[df3.groupby([df3.index.hour, df3.date_x])['minutes'].idxmax()].reset_index().drop(columns=['datetime', 'time_1', 'date_y', 'minutes'])
finalResult.columns = ['id', 'date', 'Duration(min)']
finalResult
Using the solution suggested by @wwnde, I've found one that scales better to my real data set:
import numpy as np
import pandas as pd

df1 = pd.DataFrame({
    'id': ['a']*4,
    'date': ['02-02-2015']*4,
    'time_1': ['08:00:00', '09:00:00', '10:30:00', '12:45:00']
})
df2 = pd.DataFrame({
    'id': ['a']*7,
    'date': ['02-02-2015',
             '02-02-2015',
             '03-02-2015',  # small change here relative to the df in my first post
             '02-02-2015',
             '02-02-2015',
             '02-02-2015',
             '02-02-2015'],
    'time_2': ['08:00:00', '08:09:00', '08:04:01', '08:52:36', '09:34:25', '10:30:00', '11:23:38']
})
----------------------------------------------------
def preproDf(df1, df2, time_1, time_2, _id, date):
    '''
    Preprocess the dataframes for the following operations
    df1: pd.DataFrame, left dataframe
    df2: pd.DataFrame, right dataframe
    time_1: str, name of the time column in the left dataframe
    time_2: str, name of the time column in the right dataframe
    _id: str, name of the id variable. Should be the same for both dataframes
    date: str, name of the date variable. Should be the same for both dataframes
    return: None
    '''
    df2[time_2] = df2[time_2].apply(pd.to_datetime)
    df1[time_1] = df1[time_1].apply(pd.to_datetime)
    # sort to allow merge_asof
    df1 = df1.sort_values([_id, date, time_1])
    df2 = df2.sort_values([_id, date, time_2])


def processDF(df1, df2, time_1, time_2, _id, date):
    # initialisation: run merge_asof on the first (id, date) group
    groupKeys = list(df2.groupby([_id, date]).groups.keys())
    dfGroup = groupKeys[0]
    group = df2.groupby([_id, date]).get_group(dfGroup)
    rslt = pd.merge_asof(group, df1, left_on=time_2, right_on=time_1, by=[_id, date],
                         tolerance=pd.Timedelta('2H'), allow_exact_matches=True,
                         direction='backward')  # .dropna()
    # loop over the remaining groups and stack the values into an array
    for group in groupKeys[1:]:  # iteration starts at the second element
        group = df2.groupby([_id, date]).get_group(group)
        item = pd.merge_asof(group, df1, left_on=time_2, right_on=time_1, by=[_id, date],
                             tolerance=pd.Timedelta('2H'), allow_exact_matches=True,
                             direction='backward')  # .dropna()
        rslt = np.vstack((rslt, item))
    rslt = pd.DataFrame(rslt, columns=item.columns)
    # create the timeDifference variable
    rslt['timeDifference'] = rslt[time_2] - rslt[time_1]
    # get the actual result: the largest difference per (id, date, time_1) period
    rslt = rslt.groupby([_id, date, time_1]).timeDifference.max()
    rslt = pd.DataFrame(rslt).reset_index()
    rslt.rename({time_1: 'openTime'}, axis='columns')
    return rslt
The result:
preproDf(df1, df2, 'time_1', 'time_2', 'id', 'date')
processDF(df1, df2, 'time_1', 'time_2', 'id', 'date')
id date time_1 screenOnDuration
0 a 02-02-2015 2020-05-29 08:00:00 00:52:36
1 a 02-02-2015 2020-05-29 09:00:00 00:34:25
2 a 02-02-2015 2020-05-29 10:30:00 00:53:38
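For what it's worth, once date and time are combined into full datetimes, the whole computation can also be collapsed into a single merge_asof plus a groupby. A rough sketch, assuming the df1/df2 from the first post with all times written as 'HH:MM:SS' (so '12:45:00'); the 12:45 period, which has no matching df2 rows, simply drops out here instead of appearing as NaN:
import pandas as pd

df1['dt1'] = pd.to_datetime(df1['date'] + ' ' + df1['time_1'], dayfirst=True)
df2['dt2'] = pd.to_datetime(df2['date'] + ' ' + df2['time_2'], dayfirst=True)

# attach each df2 row to the closest previous df1 time
merged = pd.merge_asof(df2.sort_values('dt2'), df1.sort_values('dt1'),
                       left_on='dt2', right_on='dt1', by='id', direction='backward')

# duration = latest time_2 in each period minus that period's time_1
durations = merged.groupby(['id', 'dt1'], as_index=False)['dt2'].max()
durations['Duration'] = durations['dt2'] - durations['dt1']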
