Generate a normal distribution of dates within a range - python

I have a date range - say between 1925-01-01 and 1992-01-01. I'd like to generate a list of x dates between that range, and have those x dates generated follow a 'normal' (bell curve - see image) distribution.
There are many many answers on stackoverflow about doing this with integers (using numpy, scipy, etc), but I can't find a solid example with dates

As per @sascha's comment, converting the dates to timestamp values does the job:
#!/usr/bin/env python3
import time
import numpy
_DATE_RANGE = ('1925-01-01', '1992-01-01')
_DATE_FORMAT = '%Y-%m-%d'
# Standard deviation as a fraction of the range width. With 0.15 each bound
# lies about 3.3 sigma from the mean; the draw is not truncated, so an
# occasional sample can still fall outside the range.
_EMPIRICAL_SCALE_RATIO = 0.15
_DISTRIBUTION_SIZE = 1000


def main():
    """Print a sorted tuple of normally distributed dates.

    The bounds in _DATE_RANGE are converted to local-time timestamps,
    _DISTRIBUTION_SIZE samples are drawn from a normal distribution centred
    on the midpoint of the range, and the sorted samples are formatted back
    to _DATE_FORMAT strings.
    """
    time_range = tuple(time.mktime(time.strptime(d, _DATE_FORMAT))
                       for d in _DATE_RANGE)
    distribution = numpy.random.normal(
        loc=(time_range[0] + time_range[1]) * 0.5,
        scale=(time_range[1] - time_range[0]) * _EMPIRICAL_SCALE_RATIO,
        size=_DISTRIBUTION_SIZE
    )
    date_range = tuple(time.strftime(_DATE_FORMAT, time.localtime(t))
                       for t in numpy.sort(distribution))
    print(date_range)


if __name__ == '__main__':
    main()
Note that instead of the _EMPIRICAL_SCALE_RATIO, you could (should?) use scipy.stats.truncnorm to generate a truncated normal distribution.

Here is an implementation using the datetime module that also generates hours, minutes and seconds, and uses a NumPy/pandas-friendly date format.
from datetime import datetime
import numpy
def main(start, end, date_format, distribution_size, scale_ratio):
    """Print and return normally distributed datetimes centred between start and end.

    Args:
        start: Lower bound as a string in ``date_format``.
        end: Upper bound as a string in ``date_format``.
        date_format: strptime layout used to parse ``start`` and ``end``.
        distribution_size: Number of samples to draw.
        scale_ratio: Standard deviation as a fraction of ``end - start``.

    Returns:
        A tuple of ``datetime`` objects sorted in ascending order. The draw
        is not truncated, so samples can fall outside [start, end].
    """
    # Converting to timestamp
    start = datetime.strptime(start, date_format).timestamp()
    end = datetime.strptime(end, date_format).timestamp()
    # Generate Normal Distribution
    # The mean is the midpoint of the requested range, so the function works
    # for any range and any date_format (it used to be pinned to 1958-01-01).
    mu = (start + end) / 2
    sigma = (end - start) * scale_ratio
    # The file imports the package as `numpy`; the alias `np` is never bound.
    total_distribution = numpy.random.normal(loc=mu, scale=sigma, size=distribution_size)
    # Sort and Convert back to datetime
    sorted_distribution = numpy.sort(total_distribution)
    date_range = tuple(datetime.fromtimestamp(t) for t in sorted_distribution)
    print(date_range)
    return date_range
# Example run: 1000 samples over 1925-1992, with the standard deviation set to
# 5% of the range width (scale_ratio multiplies end - start inside main).
start = '1925-01-01T00:00:00'
end = '1992-01-01T00:00:00'
# Layout shared by both bounds; it includes the time of day.
date_format = '%Y-%m-%dT%H:%M:%S'
main(start=start, end=end, date_format=date_format, distribution_size=1000, scale_ratio=0.05)
Results:
You can also blend multiple distributions like this:
# Fragment: mu_*, sigma_* and size_* are not defined here and must be supplied
# by the caller; `np` presumably stands for `import numpy as np` (the snippet
# above imports the package as `numpy`) — confirm before running.
dist_1 = np.random.normal(loc=mu_1, scale=sigma_1, size=size_1)
dist_2 = np.random.normal(loc=mu_2, scale=sigma_2, size=size_2)
# Pool both samples into a single array (dist_1 values first, then dist_2).
all_distributions = np.concatenate([dist_1, dist_2])

Related

Filter Data Frame by time

I have a large data frame that is being imported through an excel sheet. I already filtered it to exclude weekends but also need to do the same so only daytime hours eg 7:00 - 18:00 will be displayed. Here is what the data frame looks like after successfully taking out weekends.
picture of data
# Bound method used below as a per-timestamp predicate.
# NOTE(review): BDay is not imported in this snippet — presumably
# pandas.tseries.offsets.BDay; confirm.
isBusinessDay = BDay().is_on_offset
# Parse the column to datetimes and apply the predicate to every value.
match_series = pd.to_datetime(df['timestamp(America/New_York)']).map(isBusinessDay)
# Index the frame with the mapped series to keep the matching rows.
df_new = df[match_series]
# Bare expression: displays the frame in a notebook/REPL.
df_new
A simple approach is to use filters on your datetime field using the Series dt accessor.
In this case...
# Keep rows whose time of day lies in 07:00:00-18:00:00 (both ends inclusive).
# Comparing only `.dt.hour <= 18` would also keep 18:01-18:59, so the check is
# done on seconds since midnight instead.
ts = df['timestamp(America/New_York)']
seconds_of_day = ts.dt.hour * 3600 + ts.dt.minute * 60 + ts.dt.second
filt = seconds_of_day.between(7 * 3600, 18 * 3600)
df_filtered = df.loc[filt, :]
More reading: https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.html
For more and a sample of this in action, see the below code block. The random date generator was taken from here and modified slightly.
import random
import time
import pandas as pd
def str_time_prop(start, end, time_format, prop):
    """Get a time at a proportion of a range of two formatted times.

    start and end should be strings specifying times formatted in the
    given format (strftime-style), giving an interval [start, end].
    prop specifies the proportion of the interval to be taken after
    start (0 gives start, 1 gives end). The returned time will be in the
    specified format. Both bounds are interpreted as local time.
    """
    stime = time.mktime(time.strptime(start, time_format))
    etime = time.mktime(time.strptime(end, time_format))
    # Linear interpolation between the two timestamps.
    ptime = stime + prop * (etime - stime)
    return time.strftime(time_format, time.localtime(ptime))
def random_date(start, end, prop):
    """Return the time at proportion ``prop`` between ``start`` and ``end``.

    Both bounds and the result use the 12-hour layout '%Y-%m-%d %I:%M %p';
    the work is delegated to ``str_time_prop``.
    """
    return str_time_prop(start, end, '%Y-%m-%d %I:%M %p', prop)
# 1000 random timestamps between the two bounds, as formatted strings.
dates = {'dtfield': [random_date("2007-1-1 1:30 PM", "2009-1-1 4:50 AM", random.random()) for _ in range(1000)]}
df = pd.DataFrame(data=dates)
df['dtfield'] = pd.to_datetime(df['dtfield'])
# Keep 07:00:00-18:00:00 inclusive. Testing only `.dt.hour <= 18` would also
# keep 18:01-18:59, so compare seconds since midnight instead.
seconds_of_day = (df['dtfield'].dt.hour * 3600
                  + df['dtfield'].dt.minute * 60
                  + df['dtfield'].dt.second)
filt = seconds_of_day.between(7 * 3600, 18 * 3600)
df_filtered = df.loc[filt, :]
# Bare expression: displays the result in a notebook/REPL.
df_filtered

How to find the time interval between two unix timestamps?

I want to check if the time difference between two unix timestamps is close to a given interval and print the timestamp and use this new timestamp to compare further timestamps to see if the difference is close to the interval. The timestamps are in a numpy array.
This is my try at this:
from math import isclose
def check_time_interval(now, update, interval, abs_tol=0.0):
    """Advance the reference time when ``update`` is one interval after ``now``.

    Args:
        now: Current reference timestamp (seconds).
        update: Candidate timestamp (seconds).
        interval: Expected spacing between accepted timestamps (seconds).
        abs_tol: Allowed absolute deviation of ``update - now`` from
            ``interval``, in seconds. The default 0.0 keeps math.isclose's
            default relative tolerance only, which matches near-exact
            differences; pass e.g. 0.5 for timestamps with jitter.

    Returns:
        ``update`` (after printing it) when the spacing matches, otherwise
        ``now`` unchanged.
    """
    if isclose(update - now, interval, abs_tol=abs_tol):
        # do something
        print(update)
        return update
    else:
        return now
interval = 60.0
# `timestamps` is the numeric sequence being scanned; it is defined outside
# this snippet.
now = timestamps[0]
for timestamp in timestamps:
    # Carry the last accepted timestamp forward as the new reference point.
    now = check_time_interval(now, timestamp, interval)
This code doesn't print any timestamps although the difference is close to the interval. What am I doing wrong? Is there a better and efficient way to do this?
Edit:
sample input:
timestamps = [1632267861.212 + i for i in range(100)]
You use isclose incorrectly. Try this:
from math import isclose
def check_time_interval(now, update, interval):
    """Return ``update`` when it lies within ``interval`` seconds of ``now``.

    ``interval`` is passed to math.isclose as the absolute tolerance, so any
    ``update`` no further than ``interval`` from ``now`` (in either direction)
    is printed and returned; otherwise ``now`` is returned unchanged.
    """
    if isclose(update, now, abs_tol=interval):
        # do something
        print(update)
        return update
    else:
        return now
interval = 60.0
timestamps = [1632267861.212 + i for i in range(100)]
now = timestamps[0]
for timestamp in timestamps:
    # The return value is deliberately not assigned back to `now`: every
    # timestamp is compared against the first one.
    check_time_interval(now, timestamp, interval)
As for more efficient way, you could vectorize and check all intervals using numpy, something like this:
import numpy as np
# Vectorised form: build one boolean mask for the whole array instead of
# looping in Python.
timestamps = np.array(timestamps)
# True where a timestamp is at most 60 seconds away from `now` (the reference
# value defined before this snippet).
is_close = np.abs(timestamps - now) <= 60
print(timestamps[is_close])

Have a list of hours between two dates in python

I have two times and I want to make a list of all the hours between them using the same format in Python
from= '2016-12-02T11:00:00.000Z'
to= '2017-06-06T07:00:00.000Z'
hours=to-from
so the result will be something like this
2016-12-02T11:00:00.000Z
2016-12-02T12:00:00.000Z
2016-12-02T13:00:00.000Z
..... and so on
How can I do this and what kind of plugin should I use?
If possible I would recommend using pandas.
import pandas
# One entry per hour from the first timestamp to the second.
# NOTE(review): recent pandas releases prefer the lowercase alias 'h' over 'H'
# — confirm against the installed version.
time_range = pandas.date_range('2016-12-02T11:00:00.000Z', '2017-06-06T07:00:00.000Z', freq='H')
If you need strings then use the following:
# str(x) renders '2016-12-02 11:00:00+00:00', and appending 'Z' to that gives
# a malformed string. Format explicitly instead: '%f' yields six fractional
# digits, so the last three are cut to get milliseconds as in the input.
timestamps = [x.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3] + 'Z' for x in time_range]
# Output
# ['2016-12-02T11:00:00.000Z',
# '2016-12-02T12:00:00.000Z',
# '2016-12-02T13:00:00.000Z',
# '2016-12-02T14:00:00.000Z',
# '2016-12-02T15:00:00.000Z',
# '2016-12-02T16:00:00.000Z',
# ...]
simpler solution using standard library's datetime package:
from datetime import datetime, timedelta
DATE_TIME_STRING_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'


def hourly_timestamps(start, end, fmt=DATE_TIME_STRING_FORMAT):
    """Return formatted timestamps from start to end in one-hour steps.

    ``start`` is always the first entry when ``start <= end``. ``end`` itself
    is included only when it lies a whole number of hours after ``start``;
    the list never runs past ``end``.
    """
    stamps = []
    current = start
    # Test before appending so a partial final hour does not overshoot `end`.
    while current <= end:
        stamps.append(current.strftime(fmt))
        current += timedelta(hours=1)
    return stamps


from_date_time = datetime.strptime('2016-12-02T11:00:00.000Z',
                                   DATE_TIME_STRING_FORMAT)
to_date_time = datetime.strptime('2017-06-06T07:00:00.000Z',
                                 DATE_TIME_STRING_FORMAT)
date_times = hourly_timestamps(from_date_time, to_date_time)
will give us
>>>date_times
['2016-12-02T11:00:00.000000Z',
'2016-12-02T12:00:00.000000Z',
'2016-12-02T13:00:00.000000Z',
'2016-12-02T14:00:00.000000Z',
'2016-12-02T15:00:00.000000Z',
'2016-12-02T16:00:00.000000Z',
'2016-12-02T17:00:00.000000Z',
'2016-12-02T18:00:00.000000Z',
'2016-12-02T19:00:00.000000Z',
'2016-12-02T20:00:00.000000Z',
...]

string convert to datetime timezone

I want to linearly interpolate some points between two time strings.
So I convert the strings to datetime, insert some points, then convert the datetimes back to strings, but the timezone does not seem to be correct.
In the example below, I wish to insert one point between "9-28 11:07:57.435" and "9-28 12:00:00.773".
#!/usr/bin/env python
import numpy as np
from time import mktime
from datetime import datetime
#-----------------------------------------#
def main():
    """Interpolate evenly spaced points between two time strings and print them.

    The strings are parsed to datetimes, converted to float timestamps,
    sampled with numpy.arange, and converted back to strings.
    """
    dtstr = [
        "9-28 11:07:57.435",
        "9-28 12:00:00.773"
    ]
    print("input", dtstr)
    dtlst = str2dt(dtstr)
    floatlst = dt2float(dtlst)
    bins = 3
    # arange starts at the first timestamp and stops before the last one, so
    # the end point is normally not part of the result.
    x1 = list(np.arange(floatlst[0], floatlst[-1], (floatlst[-1] - floatlst[0]) / bins))
    dtlst = float2dt(x1)
    dtstr = dt2str(dtlst)
    print("output", dtstr)
    return
def str2dt(strlst):
    """Parse 'month-day H:M:S.fraction' strings into datetimes in the year 2014."""
    # The inputs carry no year, so a fixed "2014-" prefix is added before parsing.
    dtlst = [datetime.strptime("2014-"+i, "%Y-%m-%d %H:%M:%S.%f") for i in strlst]
    return dtlst
def dt2float(dtlst):
    """Convert naive local-time datetimes to float seconds since the epoch."""
    # timetuple() has whole-second resolution, so the microseconds are added
    # back explicitly to keep sub-second precision.
    floatlst = [mktime(dt.timetuple()) + dt.microsecond / 1e6 for dt in dtlst]
    return floatlst
def dt2str(dtlst):
    """Format datetimes as 'YYYY-MM-DD HH:MM:SS <zone name><utc offset>'."""
    # For naive datetimes %Z and %z expand to empty strings, which leaves a
    # trailing space in the result.
    dtstr = [dt.strftime("%Y-%m-%d %H:%M:%S %Z%z") for dt in dtlst]
    return dtstr
def float2dt(floatlst):
    """Convert float seconds since the epoch to naive local-time datetimes."""
    # The matching dt2float uses time.mktime, which interprets its input as
    # local time. fromtimestamp is the inverse of that; utcfromtimestamp
    # would shift every value by the local UTC offset.
    dtlst = [datetime.fromtimestamp(seconds) for seconds in floatlst]
    return dtlst
#-----------------------------------------#
# Run the demo only when the file is executed as a script.
if __name__ == "__main__":
    main()
The output looks like:
input ['9-28 11:07:57.435', '9-28 12:00:00.773']
output ['2014-09-28 16:07:57 ', '2014-09-28 16:25:18 ', '2014-09-28 16:42:39 ']
Two questions here:
The input and output has 4 hours differ (9-28 16:07:57 to 9-28 11:07:57). I guess it caused by timezone but not sure how to fix it.
I wish the first and last point the same as input, but now it seems the last point is less than the input last point (16:42:39 vs 12:00:00).
Q1. You're right about the timezones, you're using time.mktime which converts struct_time to seconds assuming the input is local time, but then using datetime.utcfromtimestamp which (naturally) converts to utc. Use datetime.fromtimestamp instead to keep everything in local time.
Q2. As with the native Python range/xrange, when you do numpy.arange(x, y, z), the result starts with x and goes up to, but not including, y (except in weird floating-point round-off cases — don't rely on this behaviour). If you want consistent behaviour at the end points with floating-point values, use numpy.linspace.
On the other hand, why convert datetime to seconds, then go back again? datetime objects support addition and subtraction. Below would be my suggestion.
from time import mktime, localtime
from datetime import datetime
from copy import copy
def main():
    """Print evenly spaced timestamps from the first input time to the last."""
    input_timestrings = ["9-28 11:07:57.435", "9-28 12:00:00.773"]
    # The helper is defined as timestrings_to_datetimes; the previous spelling
    # (..._to_datestimes) raised NameError.
    input_datetimes = timestrings_to_datetimes(input_timestrings)
    start_datetime = input_datetimes[0]
    end_datetime = input_datetimes[1]
    # subtraction between datetime objects returns a timedelta object
    period_length = end_datetime - start_datetime
    bins = 3
    # operation w/ timedelta objects and datetime objects work pretty much as you'd expect it to
    delta = period_length / bins
    # The stop value is pushed one step past the end because custom_range
    # excludes its stop value.
    datetimes = list(custom_range(start_datetime, end_datetime + delta, delta))
    output_timestrings = datetimes_to_timestrings(datetimes)
    print(output_timestrings)
    return
def timestrings_to_datetimes(timestrings):
    """Parse 'month-day H:M:S.fraction' strings into datetimes in the year 2014."""
    # The inputs carry no year, so a fixed "2014-" prefix is added before parsing.
    datetimes = [datetime.strptime("2014-"+timestring, "%Y-%m-%d %H:%M:%S.%f") for timestring in timestrings]
    return datetimes
def datetimes_to_timestrings(datetimes):
    """Format datetimes as 'YYYY-MM-DD HH:MM:SS <zone name><utc offset>'."""
    # For naive datetimes %Z and %z expand to empty strings, which leaves a
    # trailing space in the result.
    timestrings = [datetime_.strftime("%Y-%m-%d %H:%M:%S %Z%z") for datetime_ in datetimes]
    return timestrings
def custom_range(start, end, jump):
    """Yield start, start + jump, start + 2*jump, ... while the value is below end.

    Works for any types supporting ``<`` and ``+`` (numbers, or datetime with
    timedelta steps). ``end`` itself is never yielded.

    Raises:
        ValueError: if ``start < end`` but ``jump`` is not positive, which
            would otherwise loop forever. Raised on first iteration.
    """
    zero = jump - jump  # zero of the step's own type (0, timedelta(0), ...)
    if start < end and not jump > zero:
        raise ValueError("jump must be positive")
    x = start
    while x < end:
        yield x
        x = x + jump
# Run the demo only when the file is executed as a script.
if __name__ == "__main__":
    main()

How to calculate the time interval between two time strings

I have two times, a start and a stop time, in the format of 10:33:26 (HH:MM:SS). I need the difference between the two times. I've been looking through documentation for Python and searching online and I would imagine it would have something to do with the datetime and/or time modules. I can't get it to work properly and keep finding only how to do this when a date is involved.
Ultimately, I need to calculate the averages of multiple time durations. I got the time differences to work and I'm storing them in a list. I now need to calculate the average. I'm using regular expressions to parse out the original times and then doing the differences.
For the averaging, should I convert to seconds and then average?
Yes, definitely datetime is what you need here. Specifically, the datetime.strptime() method, which parses a string into a datetime object.
from datetime import datetime
# Layout shared by both readings: hours, minutes, seconds.
FMT = '%H:%M:%S'
s1, s2 = '10:33:26', '11:15:49'  # for example
# Neither string carries a date, so both parse onto the same default day and
# the subtraction is a pure time-of-day difference.
begin = datetime.strptime(s1, FMT)
finish = datetime.strptime(s2, FMT)
tdelta = finish - begin
That gets you a timedelta object that contains the difference between the two times. You can do whatever you want with that, e.g. converting it to seconds or adding it to another datetime.
This will return a negative result if the end time is earlier than the start time, for example s1 = 12:00:00 and s2 = 05:00:00. If you want the code to assume the interval crosses midnight in this case (i.e. it should assume the end time is never earlier than the start time), you can add the following lines to the above code:
if tdelta.days < 0:
    # A negative timedelta is stored as a negative day count plus a positive
    # seconds/microseconds remainder. Dropping the days keeps only that
    # remainder, i.e. the duration measured across midnight.
    tdelta = timedelta(
        days=0,
        seconds=tdelta.seconds,
        microseconds=tdelta.microseconds
    )
(of course you need to include from datetime import timedelta somewhere). Thanks to J.F. Sebastian for pointing out this use case.
Try this -- it's efficient for timing short-term events. If something takes more than an hour, then the final display probably will want some friendly formatting.
import time
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations; time.time() follows the wall clock and can jump if the system
# time is adjusted while the measurement is running.
start = time.perf_counter()
time.sleep(10) # or do something more productive
done = time.perf_counter()
# Elapsed time in (fractional) seconds.
elapsed = done - start
print(elapsed)
The time difference is returned as the number of elapsed seconds.
Here's a solution that supports finding the difference even if the end time is less than the start time (over midnight interval) such as 23:55:00-00:25:00 (a half an hour duration):
#!/usr/bin/env python
from datetime import datetime, time as datetime_time, timedelta
def time_diff(start, end):
    """Return end - start as a timedelta, assuming end is never before start.

    Accepts two ``datetime.time`` or two ``datetime.datetime`` values. When
    ``end`` is earlier than ``start`` the interval is taken to cross midnight
    and one day is added to ``end``.
    """
    if isinstance(start, datetime_time): # convert to datetime
        assert isinstance(end, datetime_time)
        # Attach the same arbitrary date to both so they can be subtracted.
        start, end = [datetime.combine(datetime.min, t) for t in [start, end]]
    if start <= end: # e.g., 10:33:26-11:15:49
        return end - start
    else: # end < start e.g., 23:55:00-00:25:00
        end += timedelta(1) # +day
        assert end > start
        return end - start
for time_range in ['10:33:26-11:15:49', '23:55:00-00:25:00']:
    # Split 'start-end' and parse each half as HH:MM:SS.
    s, e = [datetime.strptime(t, '%H:%M:%S') for t in time_range.split('-')]
    print(time_diff(s, e))
    # datetime inputs and time inputs must give the same answer.
    assert time_diff(s, e) == time_diff(s.time(), e.time())
Output
0:42:23
0:30:00
time_diff() returns a timedelta object that you can pass (as a part of the sequence) to a mean() function directly e.g.:
#!/usr/bin/env python
from datetime import timedelta
def mean(data, start=timedelta(0)):
    """Find arithmetic average.

    ``start`` is the initial value handed to ``sum``; the timedelta(0) default
    lets a sequence of timedeltas be summed. Raises ZeroDivisionError when
    ``data`` is empty.
    """
    return sum(data, start) / len(data)
# Two sample durations to average.
data = [timedelta(minutes=42, seconds=23), # 0:42:23
timedelta(minutes=30)] # 0:30:00
print(repr(mean(data)))
# -> datetime.timedelta(0, 2171, 500000) # days, seconds, microseconds
# (Python 3.7+ prints this as datetime.timedelta(seconds=2171, microseconds=500000).)
The mean() result is also timedelta() object that you can convert to seconds (td.total_seconds() method (since Python 2.7)), hours (td / timedelta(hours=1) (Python 3)), etc.
This site says to try:
import datetime as dt
start = "09:35:23"
end = "10:23:00"
start_dt = dt.datetime.strptime(start, '%H:%M:%S')
end_dt = dt.datetime.strptime(end, '%H:%M:%S')
diff = (end_dt - start_dt)
# total_seconds() covers the whole duration; `.seconds` is only the
# within-day remainder, so it is wrong for negative or multi-day differences.
diff_minutes = diff.total_seconds() / 60
This forum uses time.mktime()
Structure that represent time difference in Python is called timedelta. If you have start_time and end_time as datetime types you can calculate the difference using - operator like:
diff = end_time - start_time
You should do this before converting to a particular string format (e.g. before start_time.strftime(...)). If you already have a string representation, you need to convert it back to time/datetime using the strptime method.
I like how this guy does it — https://amalgjose.com/2015/02/19/python-code-for-calculating-the-difference-between-two-time-stamps.
Not sure if it has some cons.
But looks neat for me :)
from datetime import datetime
from dateutil.relativedelta import relativedelta
t_a = datetime.now()
t_b = datetime.now()
def diff(t_a, t_b):
    """Format the gap from ``t_a`` to ``t_b`` as '<h>h <m>m <s>s'.

    Only the hours, minutes and seconds fields of the relativedelta are
    shown; any larger fields it carries are not included in the string.
    """
    t_diff = relativedelta(t_b, t_a) # later/end time comes first!
    return '{h}h {m}m {s}s'.format(h=t_diff.hours, m=t_diff.minutes, s=t_diff.seconds)
Regarding to the question you still need to use datetime.strptime() as others said earlier.
Try this
import datetime
import time
# Both readings are reduced to 'HH:MM:SS' strings, so the result has
# whole-second resolution and is only meaningful within a single day.
start_time = datetime.datetime.now().time().strftime('%H:%M:%S')
time.sleep(5)
end_time = datetime.datetime.now().time().strftime('%H:%M:%S')
total_time = (datetime.datetime.strptime(end_time, '%H:%M:%S') - datetime.datetime.strptime(start_time, '%H:%M:%S'))
# print() call form: the bare `print total_time` statement is Python 2 only.
print(total_time)
OUTPUT :
0:00:05
import datetime as dt
from dateutil.relativedelta import relativedelta
start = "09:35:23"
end = "10:23:00"
start_dt = dt.datetime.strptime(start, "%H:%M:%S")
end_dt = dt.datetime.strptime(end, "%H:%M:%S")
# NOTE(review): the arguments are (start, end), so the fields come out
# negative (see the printed result below); swap them if a positive
# end-minus-start difference is wanted. Despite its name, this object is a
# relativedelta, not a datetime.timedelta.
timedelta_obj = relativedelta(start_dt, end_dt)
# Print each component of the difference, largest unit first.
print(
timedelta_obj.years,
timedelta_obj.months,
timedelta_obj.days,
timedelta_obj.hours,
timedelta_obj.minutes,
timedelta_obj.seconds,
)
result:
0 0 0 0 -47 -37
Both time and datetime have a date component.
Normally if you are just dealing with the time part you'd supply a default date. If you are just interested in the difference and know that both times are on the same day then construct a datetime for each with the day set to today and subtract the start from the stop time to get the interval (timedelta).
Take a look at the datetime module and the timedelta objects. You should end up constructing a datetime object for the start and stop times, and when you subtract them, you get a timedelta.
you can use pendulum:
import pendulum
# Parse the two time-of-day strings with pendulum.
t1 = pendulum.parse("10:33:26")
t2 = pendulum.parse("10:43:36")
# Difference between the two parsed values.
period = t2 - t1
# Prints the seconds of the difference (610 for these inputs, per the output
# shown below).
print(period.seconds)
would output:
610
import datetime
def _prompt_date():
    """Ask for day, month and year (in that order) and build a date from them."""
    d = int(input("day[1,2,3,..31]: "))
    m = int(input("Month[1,2,3,...12]: "))
    y = int(input("year[0~2020]: "))
    return datetime.date(y, m, d)


# First answer set is the start date, second is the end date.
start_date = _prompt_date()
end_date = _prompt_date()
# Subtracting two dates gives a timedelta; only its whole-day count is shown.
time_difference = end_date - start_date
age = time_difference.days
print("Total days: " + str(age))
Concise if you are just interested in the time elapsed that is under 24 hours. You can format the output as needed in the return statement :
import datetime
def elapsed_interval(start, end):
    """Return ``end - start`` formatted as 'HH:MM:SS'.

    Microseconds are ignored. Hours are not wrapped at 24, so a span longer
    than a day shows a larger hour count.
    """
    elapsed = end - start
    # Whole seconds of the difference, split into minutes and leftover seconds.
    total_minutes, secs = divmod(elapsed.days * 86400 + elapsed.seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%.2d:%.2d:%.2d' % (hours, minutes, secs)
if __name__ == '__main__':
    time_start = datetime.datetime.now()
    # ... do your process ...
    time_end = datetime.datetime.now()
    # Formatted HH:MM:SS duration of the work done between the two readings.
    total_time = elapsed_interval(time_start, time_end)
Usually, you have more than one case to deal with and perhaps have it in a pd.DataFrame(data) format. Then:
import pandas as pd
# Parse both columns to datetimes and subtract them; the result is stored in
# a new 'duration' column. `df` is defined outside this snippet.
df['duration'] = pd.to_datetime(df['stop time']) - pd.to_datetime(df['start time'])
gives you the time difference without any manual conversion.
Taken from Convert DataFrame column type from string to datetime.
If you are lazy and do not mind the overhead of pandas, then you could do this even for just one entry.
Here is the code if the string contains days also [-1 day 32:43:02]:
# Convert a duration string such as '-1 day 32:43:02' into minutes and print it.
# NOTE(review): `time` here is that string (defined outside this snippet), not
# the time module.
print(
# First space-separated field with '-' removed (so the sign is discarded),
# read as a day count and converted to minutes.
(int(time.replace('-', '').split(' ')[0]) * 24) * 60
# Hours field of the trailing H:M:S part, converted to minutes.
+ (int(time.split(' ')[-1].split(':')[0]) * 60)
# Minutes field; the seconds field is not used.
+ int(time.split(' ')[-1].split(':')[1])
)

Categories