Pandas: Generate date intervals between two dates with yearly reset - python

I am trying to generate 8 day intervals between two-time periods using pandas.date_range. In addition, when the 8 day interval exceeds the end of year (i.e., 365/366), I would like the range start to reset to the beginning of respective year. Below is the example code for just two years, however, I do plan to use it across several years, e.g., 2014-01-01 to 2021-01-01.
import pandas as pd
print(pd.date_range(start='2018-12-01', end='2019-01-31', freq='8D'))
Results in,
DatetimeIndex(['2018-12-01', '2018-12-09', '2018-12-17', '2018-12-25','2019-01-02', '2019-01-10', '2019-01-18', '2019-01-26'], dtype='datetime64[ns]', freq='8D')
However, I would like the start of the interval in 2019 to reset to the first day, e.g., 2019-01-01

You could loop creating a date_range up to the start of the next year for each year, appending them until you hit the end date.
import pandas as pd
from datetime import date
def date_range_with_resets(start, end, freq):
start = date.fromisoformat(start)
end = date.fromisoformat(end)
result = pd.date_range(start=start, end=start, freq=freq) # initialize result with just start date
next_year_start = start.replace(year=start.year+1, month=1, day=1)
while next_year_start < end:
result = result.append(pd.date_range(start=start, end=next_year_start, freq=freq))
start = next_year_start
next_year_start = next_year_start.replace(year=next_year_start.year+1)
result = result.append(pd.date_range(start=start, end=end, freq=freq))
return result[1:] # remove duplicate start date
start = '2018-12-01'
end = '2019-01-31'
date_range_with_resets(start, end, freq='8D')
Edit:
Here's a simpler way without using datetime. Create a date_range of years between start and end, then loop through those.
def date_range_with_resets(start, end, freq):
years = pd.date_range(start=start, end=end, freq='YS') # YS=year start
if len(years) == 0:
return pd.date_range(start=start, end=end, freq=freq)
result = pd.date_range(start=start, end=years[0], freq=freq)
for i in range(0, len(years) - 1):
result = result.append(pd.date_range(start=years[i], end=years[i+1], freq=freq))
result = result.append(pd.date_range(start=years[-1], end=end, freq=freq))
return result

Related

Get entire days timestamp with secs in python

I need to trying to get entire days timestamp in array. I need from 00:00:00 to 24:00:00 seconds of timestamp a day i.e 86400 data.
I have used pandas to achieve but could not
m_days=1
# today's date in timestamp
base = pd.Timestamp.today()
timestamp_list = [base + datetime.timedelta(days=x) for x in range(n_days)]
This should do it then:
import pandas as pd
start = pd.Timestamp('2022-02-16') # start of the day
end = start + pd.Timedelta(days=1) # end of the day
timestamps = pd.date_range(start, end, freq='s')

Getting list of months in between two dates according to specific format

start = "Nov20"
end = "Jan21"
# Expected output:
["Nov20", "Dec20", "Jan21"]
What I've tried so far is the following but am looking for more elegant way.
from calendar import month_abbr
from time import strptime
def get_range(a, b):
start = strptime(a[:3], '%b').tm_mon
end = strptime(b[:3], '%b').tm_mon
dates = []
for m in month_abbr[start:]:
dates.append(m+a[-2:])
for mm in month_abbr[1:end + 1]:
dates.append(mm+b[-2:])
print(dates)
get_range('Nov20', 'Jan21')
Note: i don't want to use pandas as that's not logical to import such library for generating dates.
The date range may span different years so one way is to loop from the start date to end date and increment the month by 1 until end date is reached.
Try this:
from datetime import datetime
def get_range(a, b):
start = datetime.strptime(a, '%b%y')
end = datetime.strptime(b, '%b%y')
dates = []
while start <= end:
dates.append(start.strftime('%b%y'))
if start.month == 12:
start = start.replace(month=1, year=start.year+1)
else:
start = start.replace(month=start.month+1)
return dates
dates = get_range("Nov20", "Jan21")
print(dates)
Output:
['Nov20', 'Dec20', 'Jan21']
You can use timedelta to step one month (31 days) forward, but make sure you stay on the 1st of the month, otherwise the days might add up and eventually skip a month.
from datetime import datetime
from datetime import timedelta
def get_range(a, b):
start = datetime.strptime(a, '%b%y')
end = datetime.strptime(b, '%b%y')
dates = []
while start <= end:
dates.append(start.strftime('%b%y'))
start = (start + timedelta(days=31)).replace(day=1) # go to 1st of next month
return dates
dates = get_range("Jan20", "Jan21")
print(dates)

Dates/months calculation

My below working code calculates date/month ranges, but I am using the Pandas library, which I want to get rid of.
import pandas as pd
dates=pd.date_range("2019-12","2020-02",freq='MS').strftime("%Y%m%d").tolist()
#print dates : ['20191101','20191201','20200101','20200201']
df=(pd.to_datetime(dates,format="%Y%m%d") + MonthEnd(1)).strftime("%Y%m%d").tolist()
#print df : ['20191130','20191231','20200131','20200229']
How can I rewrite this code without using Pandas?
I don't want to use Pandas library as I am triggering my job through Oozie and we don't have Pandas installed on all our nodes.
Pandas offers some nice functionalities when using datetimes which the standard library datetime module does not have (like the frequency or the MonthEnd). You have to reproduce these yourself.
import datetime as DT
def next_first_of_the_month(dt):
"""return a new datetime where the month has been increased by 1 and
the day is always the first
"""
new_month = dt.month + 1
if new_month == 13:
new_year = dt.year + 1
new_month = 1
else:
new_year = dt.year
return DT.datetime(new_year, new_month, day=1)
start, stop = [DT.datetime.strptime(dd, "%Y-%m") for dd in ("2019-11", "2020-02")]
dates = [start]
cd = next_first_of_the_month(start)
while cd <= stop:
dates.append(cd)
cd = next_first_of_the_month(cd)
str_dates = [d.strftime("%Y%m%d") for d in dates]
print(str_dates)
# prints: ['20191101', '20191201', '20200101', '20200201']
end_dates = [next_first_of_the_month(d) - DT.timedelta(days=1) for d in dates]
str_end_dates = [d.strftime("%Y%m%d") for d in end_dates]
print(str_end_dates)
# prints ['20191130', '20191231', '20200131', '20200229']
I used here a function to get a datetime corresponding to the first day of the next month of the input datetime. Sadly, timedelta does not work with months, and adding 30 days of course is not feasible (not all months have 30 days).
Then a while loop to get a sequence of fist days of the month until the stop date.
And to the get the end of the month, again get the next first day of the month fo each datetime in your list and subtract a day.

Create monthly time series with arbitrary start dates

Using pandas it is easy to create a monthly series of dates.
import pandas as pd
pd.date_range('2012-04-23', '2013-01-23', freq='BM')
DatetimeIndex(['2012-04-30', '2012-05-31', '2012-06-29', '2012-07-31',
'2012-08-31', '2012-09-28', '2012-10-31', '2012-11-30',
'2012-12-31'],
dtype='datetime64[ns]', freq='BM')
Notice that the dates in the DatetimeIndex are month ends. I know that it should be considering I chose freq='BM' but I don't believe I had a choice that would have accomplished my goal.
I'm often in need for producing a monthly series of dates starting from the last business day going back in time every month.
I'd like to see this instead:
DatetimeIndex(['2012-04-23', '2012-05-23', '2012-06-23', '2012-07-23',
'2012-08-23', '2012-09-23', '2012-10-23', '2012-11-23',
'2012-12-23'],
dtype='datetime64[ns]', freq=None)
or another more complicated example might be to get months from '2012-01-30' to '2012-04-30'. I'd expect to see:
DatetimeIndex(['2012-01-30', '2012-02-29', '2012-03-30', '2012-04-30'],
dtype='datetime64[ns]', freq=None)
You may be looking for something like this:
from pandas.tseries.offsets import Day, BDay
pd.date_range(start = '2012-01-01', periods = 6, freq = 'MS') + Day(22) + BDay(0)
Out[12]:
DatetimeIndex(['2012-01-23', '2012-02-23', '2012-03-23', '2012-04-23',
'2012-05-23', '2012-06-25'],
dtype='datetime64[ns]', freq=None)
Day(22) adds an offset of 22 days and BDay is responsible for business day offset (BDay(0) takes the nearest business day).
It's a bit more difficult with dates starting at 30th. So I had to write a function for this. (For clarity of code it doesn't allow a custom freq parameter.)
def my_business_date_range(day, **kwargs):
assert(isinstance(day, int) & (day > 0) & (day < 32))
rng0 = pd.date_range(freq = 'MS', **kwargs)
rng1 = rng0 + pd.tseries.offsets.Day(day-1) + pd.tseries.offsets.BDay(0)
# Correcting overflows:
overflow_idx, = np.nonzero(rng0.month != rng1.month)
if overflow_idx.size > 0:
# rng1 is not mutable
tmp = rng1.tolist()
bme = pd.tseries.offsets.BusinessMonthEnd(-1)
for i in overflow_idx:
tmp[i] = bme(rng1[i])
rng1 = pd.DatetimeIndex(tmp)
return rng1
my_business_date_range(30, start= '2012-01-01', periods = 6)
Out[13]:
DatetimeIndex(['2012-01-30', '2012-02-29', '2012-03-30', '2012-04-30',
'2012-05-30', '2012-06-29'],
dtype='datetime64[ns]', freq=None)
Pandas has also an experimental CustomBusinessMonth and the like but I couldn't make it work.
I'm not clear on your question, but believe this is a move in the right direction.
start = '2012-04-23'
end = '2013-01-23'
>>> pd.DatetimeIndex([pd.datetime(ts.year, ts.month, int(end.split("-")[-1]))
for ts in pd.date_range(start, end, freq='BM')])
DatetimeIndex(['2012-04-23', '2012-05-23', '2012-06-23', '2012-07-23', '2012-08-23', '2012-09-23', '2012-10-23', '2012-11-23', '2012-12-23'], dtype='datetime64[ns]', freq=None)
Although not optimized for speed, I believe the following function will return the correct values per your requirements.
def foo(date, periods, forward=True):
if isinstance(date, str):
date = pd.Timestamp(date).date()
dates = [date + relativedelta(date, months=n * (1 if forward else -1)) for n in range(1, periods +1)]
result = []
print dates
for date in dates:
month = date.month
iso_day = date.isoweekday()
if iso_day == 6:
date += dt.timedelta(days=2 if forward else -1)
elif iso_day == 7:
date += dt.timedelta(days=1 if forward else -2)
if date.month != month:
# Gone into next/preceding month. Roll back/forward.
date -= dt.timedelta(days=3 if forward else -3)
result.append(date)
return result

Pandas date_range starting from the end date to start date

In am trying to generate a range of semi-annual dates using Python. Pandas provides a function pd.date_range to help with this however I would like my date range to start from the end date and iterate backwards.
For instance given the input:
start = datetime.datetime(2016 ,2, 8)
end = datetime.datetime(2018 , 6, 1)
pd.date_range(start, end, freq='6m')
The result is:
DatetimeIndex(['2016-02-29', '2016-08-31', '2017-02-28', '2017-08-31',
'2018-02-28'])
How can I generate the following:
DatetimeIndex(['2016-02-08', '2016-06-01', '2016-12-01', '2017-06-01',
'2017-12-01', '2018-06-01'])
With the updated output (from the edit you made) you can do something like the following:
from pandas.tseries.offsets import DateOffset
end = datetime.datetime(2018 , 6, 1)
start = datetime.datetime(2016 ,2, 8)
#Get the range of months to cover
months = (end.year - start.year)*12 + end.month - start.month
#The frequency of periods
period = 6 # in months
pd.DatetimeIndex([end - DateOffset(months=e) for e in range(0, months, period)][::-1]).insert(0, start)
This is a fairly concise solution, though I didn't compare runtimes so I'm not sure how fast it is.
Basically this is just creating the dates you need as a list, and then converting it to a datetime index.
This can be done without pandas and using datutil instead. However it is more involved than it perhaps should:
from datetime import date
import math
from dateutil.relativedelta import relativedelta
#set up key dates
start = date(2016 ,2, 8)
end = date(2018 , 6, 1)
#calculate date range and number of 6 month periods
daterange = end-start
periods = daterange.days *2//365
#calculate next date in sequence and check for year roll-over
next_date = date(start.year,math.ceil(start.month/6)*6,1)
if next_date < start: next_date = date(next_date.year+1,next_date.month,1)
#add the first two values to a list
arr = [start.isoformat(),next_date.isoformat()]
#calculate all subsequent dates using 'relativedelta'
for i in range(periods):
next_date = next_date+ relativedelta(months=+6)
arr.append(next_date.isoformat())
#display results
print(arr)

Categories