Handling real-time data in Python: rolling window

I want to write a function that reads a series of time values from a file (the sampling rate has gaps, which is the problem), selects exactly 200 days of data, and lets me move that window through the entire data length (say 10000 days), like a rolling window.
I am not sure how to code it. Can I add a statement that computes the difference between two values of the time variable (the x axis) until the span is exactly 200 days?
Or can I write a function that takes a starting value, say t0, and finds the element of the array closest to t0 + 200 days (the interval)?
What I have so far is:
f = open(filename)  # reading the file from the data directory
lines = f.readlines()
print(len(lines))

tx = np.array([])  # times
y = np.array([])
interval = 200  # days

for li in lines:
    col = li.split()
    t0 = np.array([])
    t1 = np.array([])
    tx = np.append(tx, float(col[0]))
    y = np.append(y, float(col[1]))
    t0 = np.append(t0, np.max(tx))
    t1 = np.append(t1, tx[np.argmin(tx)])
    print(t0, t1)

days = [t1 + dt.timedelta(days=float(x)) for x in days]
#y = np.random.randn(len(days))

# use pandas for convenient rolling function:
df = pd.DataFrame({"day": tx, "value": y}).set_index("day")

def closest_value(s):
    if s.shape[0] < 2:
        return np.nan
    X = np.empty((s.shape[0]-1, 2))
    X[:, 0] = s[:-1]
    X[:, 1] = np.fabs(s[:-1]-s[-1])
    min_diff = np.min(X[:, 1])
    return X[X[:, 1] == min_diff, 0][0]

df['closest_value'] = df.rolling(window=dt.timedelta(days=200))['value'].apply(closest_value, raw=True)
print(df.tail(5))
Output error:
TypeError: float() argument must be a string or a number, not
'datetime.datetime'
Additionally, here are the first 10 tx and y values, respectively:
0 0.003372722575018
0.015239999629557 0.003366515509113
0.045829999726266 0.003385171061055
0.075369999743998 0.003385171061055
0.993219999596477 0.003366515509113
1.022699999623 0.003378941085299
1.05217999964952 0.003369617612836
1.08166999975219 0.003397665493594
3.0025899996981 0.003378941085299
3.04120999993756 0.003394537568711
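One way to locate the end of each 200-day window directly in the raw arrays (a sketch, not from the original post; it assumes tx is sorted in ascending day order, as in the sample above) is np.searchsorted, which finds the first sample at or beyond t0 + 200 days:
import numpy as np

interval = 200.0  # days
i0 = 0                                   # start of the current window
t0 = tx[i0]
i1 = np.searchsorted(tx, t0 + interval)  # first index with tx >= t0 + 200
window_t, window_y = tx[i0:i1], y[i0:i1]
# slide the window forward by starting the next one at a larger i0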

import numpy as np
import pandas as pd
import datetime as dt
# load data in days and y arrays
# ... or generate them:
N = 1000 # number of days
day_min = dt.datetime.strptime('2000-01-01', '%Y-%m-%d')
day_max = 2000
days = np.sort(np.unique(np.random.uniform(low=0, high=day_max, size=N).astype(int)))
days = [day_min + dt.timedelta(days = int(x)) for x in days]
y = np.random.randn(len(days))
# use pandas for convenient rolling function:
df = pd.DataFrame({"day":days, "value": y}).set_index("day")
def closest_value(s):
    if s.shape[0] < 2:
        return np.nan
    X = np.empty((s.shape[0]-1, 2))
    X[:, 0] = s[:-1]
    X[:, 1] = np.fabs(s[:-1]-s[-1])
    min_diff = np.min(X[:, 1])
    return X[X[:, 1] == min_diff, 0][0]
df['closest_value'] = df.rolling(window=dt.timedelta(days=200))['value'].apply(closest_value, raw=True)
print(df.tail(5))
Output:
value closest_value
day
2005-06-15 1.668638 1.591505
2005-06-16 0.316645 0.304382
2005-06-17 0.458580 0.445592
2005-06-18 -0.846174 -0.847854
2005-06-22 -0.151687 -0.166404
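Adapting this to the float "days since start" values in the question (a sketch, not from the original answer): rolling(window=timedelta) needs a monotonic DatetimeIndex, and the TypeError in the question is the kind raised when a datetime object is passed where a float is expected, so map the float day offsets onto an arbitrary origin date first and reuse the closest_value function defined above:
import datetime as dt
import pandas as pd

origin = dt.datetime(2000, 1, 1)  # any fixed reference date (assumption)
day_index = [origin + dt.timedelta(days=float(d)) for d in tx]
df = pd.DataFrame({"day": day_index, "value": y}).set_index("day")
df['closest_value'] = df.rolling(window=dt.timedelta(days=200))['value'].apply(closest_value, raw=True)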

You could use pandas, set a datetime range and create a while loop to process the data in batches.
import pandas as pd
from datetime import datetime, timedelta
# Load data into pandas dataframe
df = pd.read_csv(filepath)
# Name columns
df.columns = ['dates', 'num_value']
# Convert strings to datetime
df.dates = pd.to_datetime(df['dates'], format='%d/%m/%Y')
# Print dates within a 200 day interval and move on to the next interval
i = 0
while i < len(df.dates):
    start = df.dates[i]
    end = start + timedelta(days=200)
    print(df.dates[(df.dates >= start) & (df.dates < end)])
    i += 200
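Note that i += 200 advances by 200 rows, not 200 days, so with irregular sampling the batches and the date intervals drift apart. A variant (a sketch, assuming df.dates is sorted ascending) advances to the first row outside the current window instead:
i = 0
while i < len(df):
    start = df.dates.iloc[i]
    end = start + timedelta(days=200)
    window = df.iloc[i:]
    window = window[window.dates < end]   # rows from `start` up to (not including) `end`
    print(window.dates)
    i += max(len(window), 1)              # jump past the current 200-day window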
If your file is whitespace-separated with a header row like this:
dates num_value
2004-7-1 1
2004-7-2 5
2004-7-4 8
2004-7-5 11
2004-7-6 17
read it with skiprows=1 so the header row is skipped before the columns are renamed above (header=None keeps pandas from treating the first data row as the header):
df = pd.read_table(filepath, sep=r"\s+", skiprows=1, header=None)
If the columns don't have headers, omit skiprows.
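For a file without a header row, a one-step variant (a sketch) names the columns at read time instead of renaming them afterwards:
df = pd.read_table(filepath, sep=r"\s+", header=None, names=['dates', 'num_value'])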

Related

I have two dataframes with datetime values. How do I get the count of rows in the last 7 days?

import pandas as pd
l1 = ["2021-11-15","2021-11-13","2021-11-10","2021-05-28","2021-06-02","2021-06-02","2021-11-02"]
l2 = ["2021-11-11","2021-03-02","2021-11-05","2021-05-20","2021-05-01","2021-06-01","2021-04-08"]
#convert to dt
l1=pd.to_datetime(l1)
l2= pd.to_datetime(l2)
#put in df
df1=pd.DataFrame(l1)
df2=pd.DataFrame(l2)
df1.columns = ['0']
df2.columns = ['0']
df1=df1.set_index('0')
df2=df2.set_index('0')
#sort asc
df1=df1.sort_index()
df2=df2.sort_index()
How can I get a COUNT from each dataframe based on the number of rows that are within the last 7 days?
You can slice between two timestamps and then get the number of rows with .shape[0]:
def get_count_last_7_days(df):
    stop = df.index.max()
    start = stop - pd.Timedelta('7D')
    return df.loc[start:stop].shape[0]
count1 = get_count_last_7_days(df1)
count2 = get_count_last_7_days(df2)
import pandas as pd
import numpy as np
from datetime import date, timedelta
x = (date.today() - timedelta(days=100))
y = (date.today() - timedelta(days=7))
z = date.today()
dates = pd.date_range(x, periods=100)
d = np.arange(1, 101)
df = pd.DataFrame(data=d, index=pd.DatetimeIndex(dates))
df = df.sort_index()
last_seven_days = df.loc[y:z]
print(last_seven_days.count())
Your "last 7 days" is ambiguous; I assume it is calculated from the current time:
today = datetime.today()
week_ago = today - timedelta(days=7)
Since you already set the date as the index, you can use .loc directly, but you can also use a mask:
df = df1.loc[week_ago:today]
# or
df = df1[(df1.index > week_ago) & (df1.index < today)]
To get the row count, you can use the .shape accessor or sum the boolean mask:
count = df1.loc[week_ago:today].shape[0]
# or
sum((df1.index > week_ago) & (df1.index < today))
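Note that .loc label slicing on a DatetimeIndex includes both endpoints, while the mask above uses strict inequalities, so the two counts can differ at the boundaries. An inclusive mask (a sketch) matches the .loc count exactly:
count = sum((df1.index >= week_ago) & (df1.index <= today))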

Pandas Dataframe masking issues: referring to previous rows and selecting values

I am new to Pandas and am trying to avoid iterating over a DataFrame, using vectorisation instead. I am not getting the results I want, and I need help with the more complicated masking and selection statements.
This is my code:
import random
from datetime import datetime, timedelta
import pandas as pd
dates = []
temp = []
press = []
vel = []
fmt = '%Y-%m-%d %H:%M:%S'
stime = datetime.strptime('2020-01-06 10:28:16', fmt)
etime = datetime.strptime('2020-04-10 03:43:12', fmt)
td = etime - stime
l = set([random.random() for x in range(0, 1000)])
dates = [((td * x) + stime) for x in random.sample(l, 100)]
for i in range(100):
    press.append(random.uniform(14, 95.5))
    temp.append(random.uniform(-15, 45))
    vel.append(random.uniform(50, 153))

measurements = {
    'date': dates,
    'pressure': press,
    'velocity': vel,
    'temperature': temp
}
df = pd.DataFrame(measurements)
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df = df.sort_index()
df2 = pd.DataFrame()
# if temp increased from previous row, set flag
df2['temp_inc'] = df['temperature'] - df.shift(1)['temperature'] > 0
df2['temp_inc'] = df2['temp_inc'].replace({True: 1, False: 0})
# need to fetch velocity where pressure has increased from previous row, else 0
press_up_mask = df.where( (df['pressure'] - df.shift(1)['pressure']) > 0)
#df2['press_spike_velocity'] = df[press_up_mask]['velocity']
# Need to perform calc based on 'temp_inc' column: if 'temp_inc' column is 1: calculate pressure * velocity, else 0
temp_inc_mask = df2['temp_inc'] == 1
df2['boyle_fact'] = df[temp_inc_mask]['pressure'] * df[temp_inc_mask]['velocity']
# Get some stats
df2['short_max_temp'] = df['temperature'].rolling(3).max()
df2['long_min_pressure'] = df['pressure'].rolling(30).min()
print(df.head())
print(df2.head())
How do I correctly calculate columns 'press_spike_velocity' and 'boyle_fact' ?
Starting from the computations:
# if temp increased from previous row, set flag
df2['temp_inc'] = df['temperature'] - df.shift(1)['temperature'] > 0
# setting int type instead of replace
df2['temp_inc'] = df2['temp_inc'].astype(int)
# need to fetch velocity where pressure has increased from previous row, else 0
press_up_mask = (df['pressure'] - df['pressure'].shift(1)) > 0
# set column to velocity, then zero out the rows where pressure did not increase
df2['press_spike_velocity'] = df['velocity'].copy()
df2.loc[~press_up_mask, 'press_spike_velocity'] = 0
# Need to perform calc based on 'temp_inc' column: if 'temp_inc' column is 1: calculate pressure * velocity, else 0
temp_inc_mask = df2['temp_inc'] == 1
# same masking approach as above
df2['boyle_fact'] = df['pressure'] * df['velocity']
df2.loc[~temp_inc_mask, 'boyle_fact'] = 0
This is the simplest way to solve your problem with minimal changes to the code itself. If you dig into pandas more you could probably find methods to do this in 1-2 fewer lines via inplace operations but I don't know how much performance or readability you would gain from that.
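As a further compression (a sketch, not part of the answer above), np.where expresses the same "value where condition, else 0" logic in a single assignment per column:
import numpy as np

press_up = df['pressure'].diff() > 0
temp_up = df['temperature'].diff() > 0
df2['press_spike_velocity'] = np.where(press_up, df['velocity'], 0)
df2['boyle_fact'] = np.where(temp_up, df['pressure'] * df['velocity'], 0)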

Convert hours format to minutes (float) with Pandas

I'm following a tutorial on web scraping and I'm stuck on one part.
I keep getting errors when I try to run the following code:
df7['Time2'] = df7['Time'].str.split(':').apply(lambda x: float(x[0]) * 60 + float(x[1]) + float(x[2])/60)
Get the error:
IndexError: list index out of range
Also tried the following:
time_mins = []
for i in time_list:
    h, m, s = i.split(':')
    math = (int(h) * 3600 + int(m) * 60 + int(s)) / 60
    time_mins.append(math)
Again didn't work.
My cells look like this (screenshot in the original post):
The result that I want looks like this (screenshot in the original post):
Any help would be appreciated.
Thanks in advance.
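The IndexError usually means some entries do not split into three ':'-separated parts (for example empty cells or values like '38:17'). A quick check (a sketch, assuming the column is df7['Time'] as in the question) lists the offending rows:
bad = df7[df7['Time'].astype(str).str.split(':').str.len() != 3]
print(bad)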
Create Sample Dataframe:
# Import packages
import pandas as pd
# Create sample dataframe
time = ['1:38:17','1:38:31','1:38:32']
gender = ['M','F','M']
data = pd.DataFrame({
    'Time': time,
    'Gender': gender
})
data
Out[]:
Time Gender
0 1:38:17 M
1 1:38:31 F
2 1:38:32 M
Convert column into timedelta format:
# Time conversion
data['Time'] = pd.to_timedelta(data['Time'])
# Time in days
data = data.assign(Time_in_days = [x.days for x in data['Time']])
# Time in hour
data = data.assign(Time_in_hour = [(x.seconds)/(60.0*60.0) for x in data['Time']] )
# Time in minutes
data = data.assign(Time_in_minutes = [(x.seconds)/60.0 for x in data['Time']])
# Time in seconds
data = data.assign(Time_in_seconds = [x.seconds * 1.0 for x in data['Time']] )
print(data)
Time Gender Time_in_days Time_in_hour Time_in_minutes Time_in_seconds
0 01:38:17 M 0 1.638056 98.283333 5897.0
1 01:38:31 F 0 1.641944 98.516667 5911.0
2 01:38:32 M 0 1.642222 98.533333 5912.0
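An equivalent shortcut for the minutes column (a sketch, assuming data['Time'] has already been converted to timedelta as above):
data['Time_in_minutes'] = data['Time'].dt.total_seconds() / 60.0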
data['Time2'] = data['Time'].apply(lambda x: sum([a*b for a,b in zip(list(map(int,x.split(':')))[::-1],[1/60,1,60])]))
This assumes data['Time'] has string dtype; if not, make a small change in the line above and split the string form instead, e.g. str(x).split(':').

Efficient (fast) way to group continuous data in one DataFrame based on ranges taken from another DataFrame in Python Pandas?

I have experimental data produced by different programs. One is logging the start and end time of a trial as well as the type of trial (a category).
start trial type end
0 6.002987 2 c 7.574240
1 7.967054 3 b 19.084946
2 21.864419 5 b 23.298480
3 23.656995 7 c 24.087210
4 24.194764 9 c 27.960752
The other one records a continuous data stream and logs the time of each observation.
X Y Z
0.0000 0.324963 -0.642636 -2.305040
0.0333 0.025089 -0.480412 -0.637273
0.0666 0.364149 0.966594 0.789467
0.0999 -0.087334 -0.761769 0.399813
0.1332 0.841872 2.306711 -1.059608
I have the two tables as pandas DataFrames and want to retrieve only those parts of the continuous data that fall between the start and end ranges found in the rows of the trials DataFrame. I managed that with a for-loop that iterates over the rows, but I figured there must be more of a "pandas way" of doing this. So I looked into apply, but what I came up with so far was even considerably slower than the loop.
As I'm working with a lot of large datasets, I'm looking for the most efficient way, in terms of execution time, to solve this.
This is a slice of the expected result for the continuous DataFrame:
X Y Z trial type
13.6863 0.265358 0.116529 1.196689 NaN NaN
13.7196 -0.715096 -0.413416 0.696454 NaN NaN
13.7529 0.714897 -0.158183 1.735958 4.0 b
13.7862 -0.259513 0.194762 -0.531482 4.0 b
13.8195 -0.929080 -1.200593 -1.233834 4.0 b
[EDIT:] Here I test performance of different approaches. I found a way using apply(), but it isn't much faster than using iterrows.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def create_trials_df(num_trials=360, max_start=1400.0):
    # First df holds start and end times (as seconds) of a trial as well as type of trial.
    d = {'trial': pd.Series(np.sort(np.random.choice(np.arange(1, 400), replace=False, size=(360,)))),
         'type': pd.Series(np.random.choice(('a', 'b', 'c', 'd'), size=num_trials)),
         'start': pd.Series(np.sort(np.random.random_sample((num_trials,))) * max_start)}
    trials_df = pd.DataFrame(d)
    # Create column for when the trial ended.
    trials_df['end'] = trials_df['start'].shift(-1)
    trials_df.loc[num_trials-1, 'end'] = trials_df['start'].iloc[-1] + 2.0
    trials_df['diff'] = trials_df['end'] - trials_df['start']
    trials_df['end'] = trials_df['end'] - trials_df['diff'] * 0.2
    del trials_df['diff']
    return trials_df

def create_continuous_df(num_trials=360, max_start=1400.0):
    # Second df has continuously recorded data with time as index.
    time_delta = 1.0/30.0
    rows = int((max_start+2) * 1/time_delta)
    idx_time = pd.Index(np.arange(rows) * time_delta)
    continuous_df = pd.DataFrame(np.random.randn(rows, 3), index=idx_time, columns=list('XYZ'))
    print("continuous rows:", continuous_df.index.size)
    print("continuous last time:", continuous_df.last_valid_index())
    return continuous_df

# I want to group the continuous data by trial and type later on.
def iterrows_test(trials_df, continuous_df):
    for index, row in trials_df.iterrows():
        continuous_df.loc[row['start']:row['end'], 'trial'] = row['trial']
        continuous_df.loc[row['start']:row['end'], 'type'] = row['type']

def itertuples_test(trials_df, continuous_df):
    continuous_df['trial'] = np.NaN
    continuous_df['type'] = np.NaN
    for row in trials_df.itertuples():
        continuous_df.loc[slice(row[1], row[4]), ['trial', 'type']] = [row[2], row[3]]

def apply_test(trials_df, continuous_df):
    trial_series = pd.Series([x[0] for x in zip(trials_df.values)])
    continuous_df['trial'] = np.NaN
    continuous_df['type'] = np.NaN

    def insert_trial_data_to_continuous(vals, con_df):
        con_df.loc[slice(vals[0], vals[3]), ['trial', 'type']] = [vals[1], vals[2]]

    trial_series.apply(insert_trial_data_to_continuous, args=(continuous_df,))

def real_slow_index_map(trials_df, continuous_df):
    # Transform trial_data to new df: merge start and end ordered, make it float index.
    trials_df['pre-start'] = trials_df['start'] - 0.0001
    trials_df['post-end'] = trials_df['end'] + 0.0001
    start_df = pd.DataFrame(data={'type': trials_df['type'].values, 'trial': trials_df['trial'].values},
                            index=trials_df['start'])
    end_df = pd.DataFrame(data={'type': trials_df['type'].values, 'trial': trials_df['trial'].values},
                          index=trials_df['end'])
    # Fill inbetween trials with NaN.
    pre_start_df = pd.DataFrame({'trial': np.NaN, 'type': np.NaN}, index=trials_df['pre-start'])
    post_end_df = pd.DataFrame({'trial': np.NaN, 'type': np.NaN}, index=trials_df['post-end'])
    new_df = start_df.append([end_df, pre_start_df, post_end_df])
    new_df.sort_index(inplace=True)

    # Each start/end index in new_df has corresponding value in type and trial column.
    def get_tuple(idx):
        res = new_df.iloc[new_df.index.get_loc(idx, method='nearest')]
        # return trial and type column values.
        return tuple(res.values)

    # Apply this to all indices.
    idx_series = continuous_df.index.to_series()
    continuous_df['trial'] = idx_series.apply(get_tuple).values
    continuous_df[['trial', 'type']] = continuous_df['trial'].apply(pd.Series)

def jp_data_analysis_answer(trials_df, continuous_df):
    ranges = trials_df[['trial', 'type', 'start', 'end']].values

    def return_trial(n):
        for i, r in enumerate(ranges):
            if r[2] <= n <= r[3]:
                return tuple((i, r[1]))
        else:
            return np.nan, np.nan

    continuous_df['trial'], continuous_df['type'] = list(zip(*continuous_df.index.map(return_trial)))

def performance_test(func, trials_df, continuous_df):
    return_df = continuous_df.copy()
    time_ref = time.perf_counter()
    func(trials_df, return_df)
    time_delta = time.perf_counter() - time_ref
    print("time delta for {}:".format(func.__name__), time_delta)
    return return_df

# Just to illustrate where this is going:
def plot_trial(continuous_df):
    continuous_df['type'] = continuous_df['type'].astype('category')
    continuous_df = continuous_df.groupby('type').filter(lambda x: x is not np.NaN)
    # Without the NaNs in column, let's set the trial column to dtype integer.
    continuous_df['trial'] = continuous_df['trial'].astype('int64')
    # Plot the data by trial.
    for key, group in continuous_df.groupby('trial'):
        group.drop(['trial', 'type'], axis=1).plot()
        plt.title('Trial {}, Type: {}'.format(key, group['type'].iloc[0]))
        plt.show()
        break

if __name__ == '__main__':
    import time
    num_trials = 360
    max_start_time = 1400
    trials_df = create_trials_df(max_start=max_start_time)
    data_df = create_continuous_df(max_start=max_start_time)
    # My original approach with a for-loop over iterrows.
    iterrows_df = performance_test(iterrows_test, trials_df, data_df)
    # itertuples test
    itertuples_df = performance_test(itertuples_test, trials_df, data_df)
    # apply() on trial data, continuous data is manipulated therein
    apply_df = performance_test(apply_test, trials_df, data_df)
    # Mapping on index of continuous data. SLOW!
    map_idx_df = performance_test(real_slow_index_map, trials_df, data_df)
    # method by jp_data_analysis' answer. Works well with small continuous_df, but doesn't scale well.
    jp_df = performance_test(jp_data_analysis_answer, trials_df, data_df)
    plot_trial(apply_df)
I see a factor of ~7x improvement with the logic below. The trick is to use index.map(custom_function) on continuous_df and unpack the results, together with the (in my opinion) underused for..else construct. This is still sub-optimal, but it may be sufficient for your purposes, and it is certainly better than iterating over rows.
import numpy as np
import pandas as pd

def test2():
    # First df holds start and end times (as seconds) of a trial as well as type of trial.
    num_trials = 360
    max_start = 1400.0
    d = {'trial': pd.Series(np.sort(np.random.choice(np.arange(1, 400), replace=False, size=(360,)))),
         'type': pd.Series(np.random.choice(('a', 'b', 'c', 'd'), size=num_trials)),
         'start': pd.Series(np.sort(np.random.random_sample((num_trials,))) * max_start)}
    trials_df = pd.DataFrame(d)
    # Create column for when the trial ended.
    trials_df['end'] = trials_df['start'].shift(-1)
    trials_df.loc[num_trials-1, 'end'] = trials_df['start'].iloc[-1] + 2.0
    trials_df['diff'] = trials_df['end'] - trials_df['start']
    trials_df['end'] = trials_df['end'] - trials_df['diff'] * 0.2
    del trials_df['diff']

    # Second df has continuously recorded data with time as index.
    time_delta = 0.0333
    rows = int(max_start+2/time_delta)
    idx_time = pd.Index(np.arange(rows) * time_delta)
    continuous_df = pd.DataFrame(np.random.randn(rows, 3), index=idx_time, columns=list('XYZ'))

    ranges = trials_df[['trial', 'type', 'start', 'end']].values

    def return_trial(n):
        for r in ranges:
            if r[2] <= n <= r[3]:
                return tuple(r[:2])
        else:
            return (np.nan, '')

    continuous_df['trial'], continuous_df['type'] = list(zip(*continuous_df.index.map(return_trial)))
    return trials_df, continuous_df
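Since the trial intervals do not overlap, a fully vectorised alternative (a sketch, not part of this answer; it assumes trials_df and continuous_df as built above) is to build a pd.IntervalIndex from the start/end columns and look up every timestamp of continuous_df in one call:
import numpy as np
import pandas as pd

# non-overlapping intervals are required for get_indexer to work
intervals = pd.IntervalIndex.from_arrays(trials_df['start'], trials_df['end'], closed='both')
pos = intervals.get_indexer(continuous_df.index)   # -1 where no trial contains the timestamp
matched = pos >= 0
continuous_df['trial'] = np.where(matched, trials_df['trial'].values[pos], np.nan)
continuous_df['type'] = np.where(matched, trials_df['type'].values[pos], None)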

How to include dynamic time?

I am trying to pull logs by time slot. The program below works fine when a number of hours is given, and the logs in that range get extracted.
But now I also want the start and end to be given dynamically, i.e. say between 8 am and 8 pm, or 6 am and 8 am, and so on.
How do I do that? Either an edit to the current program or a separate program will do.
Input: a mini version of the input file (linked in the original post).
Code:
import pandas as pd
from datetime import datetime,time
import numpy as np
fn = r'00_Dart.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']
df = pd.read_csv(fn, header=None, names=cols)
df['m'] = df.StopTime + df.StartTime
df['d'] = df.StopTime - df.StartTime
# 'start' and 'end' for the reporting DF: `r`
# which will contain equal intervals (1 hour in this case)
start = pd.to_datetime(df.StartTime.min(), unit='s').date()
end = pd.to_datetime(df.StopTime.max(), unit='s').date() + pd.Timedelta(days=1)
# building reporting DF: `r`
freq = '1H' # 1 Hour frequency
idx = pd.date_range(start, end, freq=freq)
r = pd.DataFrame(index=idx)
r['start'] = (r.index - pd.datetime(1970,1,1)).total_seconds().astype(np.int64)
# 1 hour in seconds, minus one second (so that we will not count it twice)
interval = 60*60 - 1
r['LogCount'] = 0
r['UniqueIDCount'] = 0
for i, row in r.iterrows():
    # intervals overlap test
    # https://en.wikipedia.org/wiki/Interval_tree#Overlap_test
    # i've slightly simplified the calculations of m and d
    # by getting rid of division by 2,
    # because it can be done eliminating common terms
    u = df[np.abs(df.m - 2*row.start - interval) < df.d + interval].UserID
    r.ix[i, ['LogCount', 'UniqueIDCount']] = [len(u), u.nunique()]
r['Date'] = pd.to_datetime(r.start, unit='s').dt.date
r['Day'] = pd.to_datetime(r.start, unit='s').dt.weekday_name.str[:3]
r['StartTime'] = pd.to_datetime(r.start, unit='s').dt.time
r['EndTime'] = pd.to_datetime(r.start + interval + 1, unit='s').dt.time
#r.to_csv('results.csv', index=False)
#print(r[r.LogCount > 0])
#print (r['StartTime'], r['EndTime'], r['Day'], r['LogCount'], r['UniqueIDCount'])
rout = r[['Date', 'StartTime', 'EndTime', 'Day', 'LogCount', 'UniqueIDCount'] ]
#print rout
rout.to_csv('one_hour.csv', index=False, header=False)
Edit:
In simple words, I should be able to give StartTime and EndTime in the program. The code below is very close to what I am trying to do, but how do I convert this to pandas?
from datetime import datetime,time
start = time(8,0,0)
end = time(20,0,0)
with open('USC28days_0_20', 'r') as infile, open('USC28days_0_20_time', 'w') as outfile:
    for row in infile:
        col = row.split()
        t1 = datetime.fromtimestamp(float(col[2])).time()
        t2 = datetime.fromtimestamp(float(col[3])).time()
        print(t1 >= start and t2 <= end)
Edit Two: working answer in pandas
Taking a part from @MaxU's accepted answer, the code below extracts the required group of logs between the given StartTime and StopTime.
import pandas as pd
from datetime import datetime,time
import numpy as np
fn = r'00_Dart.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']
df = pd.read_csv(fn, header=None, names=cols)
#df['m'] = df.StopTime + df.StartTime
#df['d'] = df.StopTime - df.StartTime
# filter input data set ...
start_hour = 8
end_hour = 9
df = df[(pd.to_datetime(df.StartTime, unit='s').dt.hour >= start_hour) & (pd.to_datetime(df.StopTime, unit='s').dt.hour <= end_hour)]
print(df)
df.to_csv('time_hour.csv', index=False, header=False)
But: it would be a great solution if I could also control minutes and seconds. At present this also includes logs whose StopTime hour equals the end hour, together with all minutes and seconds up to the next hour.
I want something like:
start_hour = 8:0:0
end_hour = 9:0:0 - 1 # -1 to get the logs until 8:59:59
But this gives me an error.
try this:
import pandas as pd
from datetime import datetime,time
import numpy as np
fn = r'D:\data\gDrive\data\.stack.overflow\2016-07\dart_small.csv'
cols = ['UserID','StartTime','StopTime', 'gps1', 'gps2']
df = pd.read_csv(fn, header=None, names=cols)
df['m'] = df.StopTime + df.StartTime
df['d'] = df.StopTime - df.StartTime
# filter input data set ...
start_hour = 8
end_hour = 20
df = df[(pd.to_datetime(df.StartTime, unit='s').dt.hour >= 8) & (pd.to_datetime(df.StartTime, unit='s').dt.hour <= 20)]
# 'start' and 'end' for the reporting DF: `r`
# which will contain equal intervals (1 hour in this case)
start = pd.to_datetime(df.StartTime.min(), unit='s').date()
end = pd.to_datetime(df.StopTime.max(), unit='s').date() + pd.Timedelta(days=1)
# building reporting DF: `r`
freq = '1H' # 1 Hour frequency
idx = pd.date_range(start, end, freq=freq)
r = pd.DataFrame(index=idx)
r = r[(r.index.hour >= start_hour) & (r.index.hour <= end_hour)]
r['start'] = (r.index - pd.datetime(1970,1,1)).total_seconds().astype(np.int64)
# 1 hour in seconds, minus one second (so that we will not count it twice)
interval = 60*60 - 1
r['LogCount'] = 0
r['UniqueIDCount'] = 0
for i, row in r.iterrows():
    # intervals overlap test
    # https://en.wikipedia.org/wiki/Interval_tree#Overlap_test
    # i've slightly simplified the calculations of m and d
    # by getting rid of division by 2,
    # because it can be done eliminating common terms
    u = df[np.abs(df.m - 2*row.start - interval) < df.d + interval].UserID
    r.ix[i, ['LogCount', 'UniqueIDCount']] = [len(u), u.nunique()]
r['Date'] = pd.to_datetime(r.start, unit='s').dt.date
r['Day'] = pd.to_datetime(r.start, unit='s').dt.weekday_name.str[:3]
r['StartTime'] = pd.to_datetime(r.start, unit='s').dt.time
r['EndTime'] = pd.to_datetime(r.start + interval + 1, unit='s').dt.time
#r.to_csv('results.csv', index=False)
#print(r[r.LogCount > 0])
#print (r['StartTime'], r['EndTime'], r['Day'], r['LogCount'], r['UniqueIDCount'])
rout = r[['Date', 'StartTime', 'EndTime', 'Day', 'LogCount', 'UniqueIDCount'] ]
#print rout
OLD answer:
from_time = '08:00'
to_time = '18:00'
rout.between_time(from_time, to_time).to_csv('one_hour.csv', index=False, header=False)
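To get minute/second resolution on the raw logs, as asked in "Edit Two" (a sketch, not part of this answer; it reuses df and rout from above), compare datetime.time objects instead of bare hours; between_time also accepts second-level strings for the report:
from datetime import time

start_t = time(8, 0, 0)
end_t = time(8, 59, 59)
st = pd.to_datetime(df.StartTime, unit='s').dt.time
et = pd.to_datetime(df.StopTime, unit='s').dt.time
df = df[(st >= start_t) & (et <= end_t)]

# the report DF can be trimmed the same way, down to the second:
rout.between_time('08:00:00', '08:59:59').to_csv('one_hour.csv', index=False, header=False)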
