Is there a shorter or more elegant way to pivot a timeseries by year in pandas? The code below does what I want but I wonder if there is a better way to accomplish this:
import pandas
import numpy

# 10-minute sampling grid covering 2000-2017. "10min" is the supported
# alias; the old "10T" spelling is deprecated in pandas 2.x.
daterange = pandas.date_range(start='2000-01-01', end='2017-12-31', freq='10min')
# generate a fake timeseries of measured wind speeds from 2000 to 2017 in 10min intervals
wind_speed = pandas.Series(data=numpy.random.rand(daterange.size), index=daterange)
# group by year; .groups maps each year to the DatetimeIndex of its rows
wind_speed_groups = wind_speed.groupby(wind_speed.index.year).groups
# assemble data frame with one column of wind speed data per year,
# indexed by the offset from that year's Jan 1 so the years align row-wise
wind_speed_pivot = pandas.DataFrame()
for key, group in wind_speed_groups.items():
    series = wind_speed[group]
    series.name = key
    # re-base this year's timestamps to a Timedelta since its own Jan 1
    series.index = series.index - pandas.Timestamp(f'{key}-01-01')
    wind_speed_pivot = wind_speed_pivot.join(series, how='outer')
print(wind_speed_pivot)
I'm not sure if this is the fastest method, as I'm adding two columns to your initial dataframe (it's possible to add just one if you want to overwrite it).
import pandas as pd
import numpy as np
import datetime as dt

# "10min" replaces the deprecated "10T" alias.
daterange = pd.date_range(start='2000-01-01', end='2017-12-31', freq='10min')
# generate a fake timeseries of measured wind speeds from 2000 to 2017 in 10min intervals
wind_speed = pd.Series(data=np.random.rand(daterange.size), index=daterange)
df = wind_speed.to_frame("windspeed")
df["year"] = df.index.year
# Offset of every sample from Jan 1 of its own year. The vectorized
# pd.to_datetime(...) call replaces the original row-by-row
# .apply(lambda x: dt.datetime(x, 1, 1)), which made one Python call per row.
df["pv_index"] = df.index - pd.to_datetime(df["year"].astype(str), format="%Y")
wind_speed_pivot = df.pivot_table(index=["pv_index"], columns=["year"], values=["windspeed"])
Related
This is what my dataframe looks like:
datetime open high low close
2006-01-02 4566.95 4601.35 4542.00 4556.25
2006-01-03 4531.45 4605.45 4531.45 4600.25
2006-01-04 4619.55 4707.60 4616.05 4694.14
.
.
.
Need to calculate the Monthly Returns in %
Formula: (Month Closing Price - Month Open Price) / Month Open Price
I can't seem to get the open price and closing price of a month, because in my df most months don't have a row for the 1st of the month, so I'm having trouble calculating it.
Any help would be very much appreciated!
You need to use groupby and agg function in order to get the first and last value of each column in each month:
import pandas as pd

# Load the price log, parse the timestamp column and make it the index.
df = (pd.read_csv("dt.txt")
        .assign(datetime=lambda d: pd.to_datetime(d["datetime"]))
        .set_index("datetime"))

# First and last value of every column within each (year, month) pair.
resultDf = df.groupby([df.index.year, df.index.month]).agg(["first", "last"])

# Monthly return = (month's last close - month's first open) / first open.
first_open = resultDf[("open", "first")]
last_close = resultDf[("close", "last")]
resultDf["new_column"] = (last_close - first_open) / first_open

resultDf.index.rename(["year", "month"], inplace=True)
resultDf.reset_index(inplace=True)
resultDf
The code above will result in a dataframe that has multiindex column. So, if you want to get, for example, rows with year of 2010, you can do something like:
# "year" was reset into an ordinary column above, so plain boolean
# filtering selects a single year:
resultDf[resultDf["year"] == 2010]
You can create a custom grouper as follows:
import pandas as pd
import numpy as np
from io import StringIO

# Inline, tab-separated sample of the asker's data.
csvfile = StringIO(
"""datetime\topen\thigh\tlow\tclose
2006-01-02\t4566.95\t4601.35\t4542.00\t4556.25
2006-01-03\t4531.45\t4605.45\t4531.45\t4600.25
2006-01-04\t4619.55\t4707.60\t4616.05\t4694.14""")
df = pd.read_csv(csvfile, sep = '\t', engine='python')
df.datetime = pd.to_datetime(df.datetime, format = "%Y-%m-%d")
# Bucket rows by calendar month. The deprecated axis=0 keyword was
# dropped from pd.Grouper: grouping along the rows is already the default.
dg = df.groupby(pd.Grouper(key='datetime', freq='M'))
Then each group of dg holds one month, and since we converted the datetime column with pandas.to_datetime we can use ordinary datetime arithmetic on it:
def monthly_return(datetime, close_value, open_value):
    """Return (last close - first open) / first open for one month of rows.

    All three arguments are positionally-aligned Series: timestamps,
    closing prices and opening prices. The earliest/latest rows are
    located by timestamp, so the input does not need to be pre-sorted.
    """
    # np.argmin/np.argmax return *positions* within the group, so the
    # prices must be fetched with .iloc. Plain [] indexing would do a
    # *label* lookup and break (KeyError or wrong row) for any group
    # whose integer labels do not start at 0.
    index_start = np.argmin(datetime)
    index_end = np.argmax(datetime)
    return (close_value.iloc[index_end] - open_value.iloc[index_start]) / open_value.iloc[index_start]
# Apply the monthly-return computation to every month's sub-frame.
dg.apply(lambda x : monthly_return(x.datetime, x.close, x.open))
Out[97]:
datetime
2006-01-31 0.02785
Freq: M, dtype: float64
Of course a pure functional approach is possible instead of using monthly_return function
I want to consider only the hourly temperature values of a particular day which are greater than the respective daily threshold values and replace the other values with a NaN value.
For example, the pandas series values are as follows
# Hourly temperature readings for five days, plus one threshold per day.
hours = pd.date_range(start="2018-01-01", freq="H", periods=120)
temperature = pd.Series(data=range(hours.size), index=hours)
days = pd.date_range(start="2018-01-01", freq="d", periods=5)
daily_treshold = pd.Series(data=[5, 10, 6, 25, 30], index=days)
Now I want to replace the hourly temperature values on the first day which are less than 5, second-day values which are less than 10 and so on.
How can I achieve this using pandas groupby and apply. Thanks.
Here is an easy-to-understand double-loop version that does what you want. pandas.Series.iteritems() yields (index, value) pairs of the Series (note: it was removed in pandas 2.0 in favour of the identical Series.items()):
import numpy as np
import pandas as pd

hours = pd.date_range("2018-01-01", periods=120, freq="H")
# Build as float so NaN can be assigned later without the (now
# deprecated) implicit int -> float upcast on item assignment.
temperature = pd.Series(range(len(hours)), index=hours, dtype=float)
days = pd.date_range("2018-01-01", periods=5, freq="d")
daily_treshold = pd.Series([5, 10, 6, 25, 30], index=days)

# For every day, blank out the hourly readings below that day's threshold.
# Series.items() yields (index, value) pairs; the older iteritems() alias
# was removed in pandas 2.0. np.NaN was removed in NumPy 2.0 -> np.nan.
for day_index, treshold in daily_treshold.items():
    for hour_index, temp in temperature.items():
        if day_index.date() == hour_index.date():
            if temp < treshold:
                temperature[hour_index] = np.nan
print(temperature)
The index of a pandas.Series is not available inside pandas.Series.apply(). Since the timestamps of temperature (hourly) and daily_treshold (daily) differ, we need some changes to compare them. For convenience, I change temperature to a pandas.DataFrame.
Here is the code to show how to use apply function on temperature:
import numpy as np
import pandas as pd

hours = pd.date_range("2018-01-01", periods=120, freq="H")
# temperature = pd.Series(range(len(hours)), index=hours)
temperature = pd.DataFrame({'hour': hours,
                            'temp': range(len(hours))})
days = pd.date_range("2018-01-01", periods=5, freq="d")
daily_treshold = pd.Series([5, 10, 6, 25, 30], index=days)

def apply_replace(row, daily_treshold):
    """Return the row's temp, or NaN when it is below that day's threshold."""
    # Look the threshold up by the date part of the row's timestamp
    # (string lookup on the DatetimeIndex matches the exact day).
    treshold = daily_treshold[row['hour'].strftime('%Y-%m-%d')]
    if row['temp'] < treshold:
        return np.nan   # np.NaN was removed in NumPy 2.0; np.nan is canonical
    else:
        return row['temp']

temperature['after_replace'] = temperature.apply(apply_replace, axis=1, args=(daily_treshold,))
Is there a way to create a new data frame from a time series with the daily diffence?
This means, suppose that on October 5 I had 5321 counts and on October 6 5331 counts. This represents the difference of 10; what I want is, for example, that my DataFrame shows 10 on October 6.
Here's my code of the raw dataframe:
import pandas as pd
from datetime import datetime, timedelta

# Confirmed-cases time series per state; columns are dd-mm-YYYY date strings.
url = 'https://raw.githubusercontent.com/mariorz/covid19-mx-time-series/master/data/covid19_confirmed_mx.csv'
raw = pd.read_csv(url, index_col=0)

# Keep only Colima between 18 Mar 2020 and 06 Oct 2020, as a one-column frame.
df = raw.loc['Colima', '18-03-2020':'06-10-2020'].to_frame()
df.index = pd.to_datetime(df.index, format='%d-%m-%Y')
df
This is the raw outcome:
Thank you guys!
There's a built-in diff function just for this kind of operation:
# Series.diff() is the element-wise difference from the previous row
# (the first row becomes NaN).
df['Diff'] = df.Colima.diff()
Yes, you can use the shift method to access the preceding row's value to calculate the difference.
# Equivalent by hand: subtract the previous row, obtained with shift(1).
df['difference'] = df.Colima - df.Colima.shift(1)
I want to use time series with Pandas. I read multiple time series one by one, from a csv file which has the date in the column named "Date" as (YYYY-MM-DD):
Date,Business,Education,Holiday
2005-01-01,6665,8511,86397
2005-02-01,8910,12043,92453
2005-03-01,8834,12720,78846
2005-04-01,8127,11667,52644
2005-05-01,7762,11092,33789
2005-06-01,7652,10898,34245
2005-07-01,7403,12787,42020
2005-08-01,7968,13235,36190
2005-09-01,8345,12141,36038
2005-10-01,8553,12067,41089
2005-11-01,8880,11603,59415
2005-12-01,8331,9175,70736
# Load the table using the "Date" column as the index (header row 0).
# NOTE(review): csv_file is defined by the surrounding context.
df = pd.read_csv(csv_file, index_col = 'Date',header=0)
# Column labels, one per series (Business, Education, Holiday).
Series_list = df.keys()
The time series can have different frequencies: day, week, month, quarter, year and I want to index the time series according to a frequency I decide before I generate the Arima model. Could someone please explain how can I define the frequency of the series?
stepwise_fit = auto_arima(df[Series_name]....
pandas has a built-in function pandas.infer_freq()
import pandas as pd

# Four demo columns with different spacings: monthly starts, daily,
# yearly starts, and an irregular one.
df = pd.DataFrame({'Date': ['2005-01-01', '2005-02-01', '2005-03-01', '2005-04-01'],
                   'Date1': ['2005-01-01', '2005-01-02', '2005-01-03', '2005-01-04'],
                   'Date2': ['2006-01-01', '2007-01-01', '2008-01-01', '2009-01-01'],
                   'Date3': ['2006-01-01', '2006-02-06', '2006-03-11', '2006-04-01']})

# Parse every column to datetime in one pass.
for col in df.columns:
    df[col] = pd.to_datetime(df[col])

pd.infer_freq(df.Date)
#'MS'
pd.infer_freq(df.Date1)
#'D'
pd.infer_freq(df.Date2)
#'AS-JAN'
Alternatively you could also make use of the datetime functionality of the columns.
# Accessor alternative. NOTE(review): for a column freshly parsed with
# to_datetime, .dt.freq is typically None — confirm the 'MS' shown below.
df.Date.dt.freq
#'MS'
Of course if your data doesn't actually have a real frequency, then you won't get anything.
# Date3 is irregular, so no frequency can be inferred (returns None).
pd.infer_freq(df.Date3)
#
The frequency descriptions are documented under offset-aliases.
Given a pandas series indexed by date, I need to slice the series into chunks of n months. The code below slices the data into 12-month chunks. How to generalize this to slice into n-month chunks? Also, please note, not all dates are in the series, so the first and last days of each month might not exist in the series.
# Create a pandas series indexed by date
import pandas as pd
import numpy as np

dates = pd.date_range('2000-01-01', '2009-12-31')
data = np.random.rand(len(dates))
series = pd.Series(data, dates)

# Poke holes in the data, so not all dates are represented
series = series[series > 0.50]

# Slice the series into chunks of 12 months each. Partial-string indexing
# by year keeps the first/last *available* dates even when Jan 1 / Dec 31
# are missing. The local was renamed from "slice", which shadowed the
# builtin, and the Python-2 print statement became a print() call.
for year in range(2000, 2009 + 1):
    chunk = series[str(year):str(year)]
    print("Start date =", chunk.index[0], " End date =", chunk.index[-1])
You can use pd.cut() to cut your time-series index into chunks, and then use groupby to perform your customized calculations.
# Create a pandas series indexed by date
import pandas as pd
import numpy as np

np.random.seed(0)
idx = pd.date_range('2000-01-01', '2009-12-31', freq='D')
series = pd.Series(np.random.rand(len(idx)), idx)

# Poke holes in the data, so not all dates are represented
series = series[series > 0.8]

# Bin edges every 3 months from 2000-01-01; each chunk is labelled by its
# right edge (right=False keeps the left edge inside the chunk).
date_rng = pd.date_range('2000-01-01', periods=50, freq='3MS')
labels = date_rng[1:]

# Cut the DatetimeIndex into those chunks and group the series by them.
grouped = series.groupby(pd.cut(series.index, bins=date_rng, labels=labels, right=False))
start_date = grouped.head(1).index
Out[206]:
DatetimeIndex(['2000-01-08', '2000-04-08', '2000-07-03', '2000-10-02',
'2001-01-03', '2001-04-04', '2001-07-01', '2001-10-02',
'2002-01-11', '2002-04-05', '2002-07-01', '2002-10-02',
'2003-01-02', '2003-04-03', '2003-07-02', '2003-10-04',
'2004-01-01', '2004-04-01', '2004-07-03', '2004-10-03',
'2005-01-07', '2005-04-08', '2005-07-12', '2005-10-05',
'2006-01-01', '2006-04-01', '2006-07-01', '2006-10-04',
'2007-01-05', '2007-04-04', '2007-07-05', '2007-10-06',
'2008-01-01', '2008-04-05', '2008-07-05', '2008-10-01',
'2009-01-02', '2009-04-04', '2009-07-04', '2009-10-02'],
dtype='datetime64[ns]', freq=None, tz=None)
# Last available date inside each non-empty 3-month chunk.
end_date = grouped.tail(1).index
Out[207]:
DatetimeIndex(['2000-03-30', '2000-06-26', '2000-09-30', '2000-12-30',
'2001-03-30', '2001-06-28', '2001-09-27', '2001-12-28',
'2002-03-24', '2002-06-29', '2002-09-24', '2002-12-29',
'2003-03-27', '2003-06-22', '2003-09-28', '2003-12-31',
'2004-03-31', '2004-06-27', '2004-09-17', '2004-12-31',
'2005-03-23', '2005-06-23', '2005-09-30', '2005-12-30',
'2006-03-29', '2006-06-24', '2006-09-30', '2006-12-31',
'2007-03-26', '2007-06-27', '2007-09-29', '2007-12-31',
'2008-03-25', '2008-06-30', '2008-09-28', '2008-12-30',
'2009-03-25', '2009-06-29', '2009-09-26', '2009-12-27'],
dtype='datetime64[ns]', freq=None, tz=None)