I'm trying to calculate a rolling mean, max, min, and std for specific columns inside a time series pandas dataframe. But I keep getting NaN for the lagged values and I'm not sure how to fix it. My MWE is:
import numpy as np
import pandas as pd
# original data
np.random.seed(0)
days = pd.date_range(start='2015-01-01', end='2015-05-01', freq='1D')
df = pd.DataFrame({
    'Date': days,
    'col1': np.random.randn(len(days)),
    'col2': 20 + np.random.randn(len(days)),
    'col3': 50 + np.random.randn(len(days)),
})
df = df.set_index('Date')
print(df.head(10))


def add_lag(dfObj, window):
    """Add rolling mean/max/min/std columns for col2 and col3 to dfObj in place.

    This is the version the question is about, and it still produces
    all-NaN lag columns: ``reset_index()`` moves the ``Date`` index into a
    column and leaves a fresh 0..n-1 index, while ``dfObj`` is indexed by
    ``Date``. Column assignment aligns on index labels, and none match.

    Parameters
    ----------
    dfObj : DataFrame indexed by date, containing 'col2' and 'col3'.
    window : number of rows in each rolling window.
    """
    cols = ['col2', 'col3']
    for col in cols:
        rolled = dfObj[col].rolling(window)
        lag_mean = rolled.mean().reset_index()
        lag_max = rolled.max().reset_index()
        lag_min = rolled.min().reset_index()
        lag_std = rolled.std().reset_index()
        # Index mismatch (integers vs. dates) makes every value below NaN.
        dfObj[f'{col}_mean_lag{window}'] = lag_mean[col]
        dfObj[f'{col}_max_lag{window}'] = lag_max[col]
        dfObj[f'{col}_min_lag{window}'] = lag_min[col]
        dfObj[f'{col}_std_lag{window}'] = lag_std[col]


# add lag feature for 1 day, 3 days
add_lag(df, window=1)
add_lag(df, window=3)
print(df.head(10))
print(df.tail(10))
Just don't do reset_index(). Then it works.
import numpy as np
import pandas as pd
# original data
np.random.seed(0)
days = pd.date_range(start='2015-01-01', end='2015-05-01', freq='1D')
df = pd.DataFrame({
    'Date': days,
    'col1': np.random.randn(len(days)),
    'col2': 20 + np.random.randn(len(days)),
    'col3': 50 + np.random.randn(len(days)),
})
df = df.set_index('Date')
print(df.head(10))


def add_lag(dfObj, window, cols=('col2', 'col3')):
    """Add rolling mean/max/min/std columns to dfObj in place.

    The rolling results keep dfObj's own index, so assigning them back
    aligns row for row. The first ``window - 1`` rows of each new column are
    NaN because their window is not full yet; with ``window=1`` the std
    column is NaN throughout (a sample std needs at least two values).

    Parameters
    ----------
    dfObj : DataFrame to extend; modified in place.
    window : number of rows in each rolling window.
    cols : columns to compute the statistics for.

    Returns
    -------
    None. New columns are named ``{col}_{stat}_lag{window}``.
    """
    for col in cols:
        rolled = dfObj[col].rolling(window)
        for stat in ('mean', 'max', 'min', 'std'):
            dfObj[f'{col}_{stat}_lag{window}'] = getattr(rolled, stat)()


# add lag feature for 1 day, 3 days
add_lag(df, window=1)
add_lag(df, window=3)
print(df.head(10))
print(df.tail(10))
Whenever you use the rolling function, it creates NaN for the values that it cannot calculate.
For example, consider a single column, col1 = [2, 4, 10, 6], and a rolling window of 2.
The output of the rolling window will be NaN, 3, 7, 8.
This is because the rolling average for the first value cannot be calculated: the window covers that index and the previous one, and there is no previous value.
Then, when you calculate the mean, std, etc., you are applying series functions without accounting for the NaN values. In R, you can usually just pass na.rm=T; in Python, however, it is recommended that you drop the NaN values and then apply the series function.
Related
I ran the following code:
import numpy as np
import pandas as pd
# make this example reproducible
np.random.seed(0)

# create dataset: 100 periods, random lead counts, and sales that trend
# upward with noise that grows with the period
period = np.arange(1, 101)
leads = np.random.uniform(1, 20, size=100)
noise = np.random.normal(loc=0, scale=0.5 * period, size=100)
sales = 60 + 2 * period + noise
df = pd.DataFrame({'period': period, 'leads': leads, 'sales': sales})

# view first 10 rows
df.head(10)

# Centered 5-row window: row i averages rows i-2 .. i+2. Near either end some
# of those rows do not exist; min_periods=1 lets the mean use whichever rows
# are available instead of returning NaN.
centered_window = df['sales'].rolling(window=5, center=True, min_periods=1)
df['rolling_sales_5'] = centered_window.mean()
df.head(10)
But I do not understand how the first two obs and last two obs for the rolling_sales_5 variable are generated. Any idea?
I'm not sure if I coded something wrong or just don't seem to understand what an exponential moving average does.
I'm calculating a moving average on SPY 5 minute tick data at ema 15 and ema 200.
They are exactly the same once the 200 starts. This is the code I used.
def EMA(df, column, window, alpha=.2):
    """
    Return a copy of ``df`` with an exponential moving average column added.

    The smoothing is derived from ``window`` (pandas ``span``, i.e. a
    smoothing factor of ``2 / (window + 1)``), so different windows give
    different averages. Previously the factor was fixed at 0.3 for every
    call and ``window`` only controlled how many leading rows were NaN,
    which made e.g. the 15- and 200-period columns identical.

    Parameters
    ----------
    df : dataframe.
    column : column to compute the EMA
    window : the window of the EMA; also the number of observations
        required before the first non-NaN value is emitted.
    alpha : unused. Kept in the signature so existing calls that pass it
        keep working; pandas does not accept ``span`` and ``alpha`` together.

    Returns
    -------
    df with ema column named ``ema_<window>``; the input frame is not modified.
    """
    df = df.copy()
    df[f'ema_{window}'] = (
        df[column].ewm(span=window, min_periods=window, adjust=False).mean()
    )
    return df
# Add the 15-period and then the 200-period EMA column to df.
# NOTE(review): `avg` is presumably the module that holds the EMA function;
# its import is not part of this snippet.
df = avg.EMA(df = df, column = 'adjusted_close', window = 15)
df = avg.EMA(df = df, column = 'adjusted_close', window = 200)
Is there something glaringly obvious or is it doing what it's supposed to and I don't understand the ema like I thought I did?
All,
My dataset looks like the following. I am trying to predict the 'amount' for the next 6 months using either fbProphet or another model. My issue is that I would like to predict the amount for each group, i.e. A, B, C and D, for the next 6 months, and I am not sure how to do that in Python using fbProphet or another model. I referred to the official fbprophet page, but the only information I found is that Prophet takes only two columns: one is "Date" and the other is "amount".
I am new to python, so any help with code explanation is greatly appreciated!
import pandas as pd
# Sample data: one row per month, each tagged with the group it belongs to.
# 'Amount' is stored as strings here.
data = {
    'Date': [
        '2017-01-01', '2017-02-01', '2017-03-01', '2017-04-01',
        '2017-05-01', '2017-06-01', '2017-07-01',
    ],
    'Group': ['A', 'B', 'C', 'D', 'C', 'A', 'B'],
    'Amount': ['12.1', '13', '15', '10', '12', '9.0', '5.6'],
}
df = pd.DataFrame(data)
print(df)
output:
Date Group Amount
0 2017-01-01 A 12.1
1 2017-02-01 B 13
2 2017-03-01 C 15
3 2017-04-01 D 10
4 2017-05-01 C 12
5 2017-06-01 A 9.0
6 2017-07-01 B 5.6
fbprophet requires two columns ds and y, so you need to first rename the two columns
# Map the question's column names onto the two names Prophet expects.
prophet_names = {'Date': 'ds', 'Amount': 'y'}
df = df.rename(columns=prophet_names)
Assuming that your groups are independent from each other and you want to get one prediction for each group, you can group the dataframe by "Group" column and run forecast for each group
from fbprophet import Prophet
# Fit one independent model per group and print the tail of each forecast.
grouped = df.groupby('Group')
for g, group in grouped:
    # Prophet cannot be fitted on fewer than 2 non-NaN rows (group D in the
    # sample data has a single row), so skip such groups instead of letting
    # one of them abort the whole loop.
    if group['y'].notna().sum() < 2:
        print(f'Skipping group {g}: fewer than 2 non-NaN rows')
        continue
    m = Prophet()
    m.fit(group)
    # NOTE(review): periods counts steps at make_future_dataframe's default
    # (daily) frequency; for a 6-month horizon on monthly data something like
    # periods=6, freq='MS' may be a better fit -- confirm against the use case.
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)
    print(forecast.tail())
Note that the input dataframe you supplied in the question is not sufficient for the model, because group D has only a single data point. fbprophet's forecast needs at least 2 non-NaN rows.
EDIT: if you want to merge all predictions into one dataframe, the idea is to give the yhat of each group a different name, do pd.merge() in the loop, and then cherry-pick the columns that you need at the end:
# Build one frame with a 'yhat_<group>' column per group, indexed by forecast date.
# Only the yhat series of each forecast is kept: merging the complete forecast
# frames would repeat every shared column (trend, yhat_lower, ...) once per
# group and run into suffix clashes.
yhat_columns = []
for g in grouped.groups:
    group = grouped.get_group(g)
    # Groups Prophet cannot fit (fewer than 2 non-NaN rows) are left out.
    if group['y'].notna().sum() < 2:
        continue
    m = Prophet()
    m.fit(group)
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)
    yhat_columns.append(forecast.set_index('ds')['yhat'].rename('yhat_' + g))
# Outer-join on the date index; stays an empty frame when no group could be fitted.
final = pd.concat(yhat_columns, axis=1) if yhat_columns else pd.DataFrame()
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
# Before doing any modeling using ARIMA or SARIMAX etc., confirm that
# your time series is stationary by using the Augmented Dickey-Fuller test
# or other tests.

# Create a list of all groups or get it from the data using np.unique or other methods
groups_iter = ['A', 'B', 'C', 'D']
dict_org = {}
dict_pred = {}
group_accuracy = {}

# `data` may be the raw dict from the question or an existing DataFrame;
# either way, work on a DataFrame so boolean filtering is available.
frame = pd.DataFrame(data)

# Iterate over all groups and get data
# from the DataFrame by filtering for the specific group
for group_name in groups_iter:
    # Amounts are strings in the sample data, so convert before modeling.
    X = frame.loc[frame['Group'] == group_name, 'Amount'].astype(float).values
    size = int(len(X) * 0.70)
    train, test = X[:size], X[size:]
    if len(test) == 0:
        print("Group: ", group_name, "has no hold-out rows, skipping")
        continue
    history = list(train)
    predictions = []
    # Walk-forward validation: refit on everything seen so far, predict one
    # step ahead, then reveal the true observation to the next fit.
    # Using ARIMA model here you can also do grid search for best parameters
    for obs in test:
        model = ARIMA(history, order=(5, 1, 0))
        model_fit = model.fit(disp=0)
        output = model_fit.forecast()
        yhat = output[0]
        predictions.append(yhat)
        history.append(obs)
        print("Predicted:%f, expected:%f" % (yhat, obs))
    # NOTE(review): the metric is the mean squared *log* error although the
    # label printed below says MSE.
    error = mean_squared_log_error(test, predictions)
    dict_org.update({group_name: test})
    dict_pred.update({group_name: predictions})
    print("Group: ", group_name, "Test MSE:%f" % error)
    group_accuracy.update({group_name: error})
    plt.plot(test)
    plt.plot(predictions, color='red')
    plt.show()
I know this is old but I was trying to predict outcomes for different clients and I tried to use Aditya Santoso solution above but got into some errors, so I added a couple of modifications and finally this worked for me:
df = pd.read_csv('file.csv')
df = df.rename(columns={'date': 'ds', 'amount': 'y'})
# Drop clients with fewer than 3 records first, to avoid errors: Prophet only
# works with 2+ records per group.
df = df.groupby('client_id').filter(lambda x: len(x) > 2)
df['client_id'] = df['client_id'].astype(str)

grouped = df.groupby('client_id')
# Seed with an empty frame so 'client', 'ds', 'yhat' lead the column order.
forecasts = [pd.DataFrame(columns=['client', 'ds', 'yhat'])]
for g, group in grouped:
    m = Prophet()
    m.fit(group)
    future = m.make_future_dataframe(periods=365)
    forecast = m.predict(future)
    # Tag every forecast row with the client it belongs to.
    forecast['client'] = g
    forecasts.append(forecast)
# Stack all forecasts once at the end (concat instead of merge); concatenating
# inside the loop would re-copy the accumulated frame on every iteration.
final = pd.concat(forecasts, ignore_index=True)
final.head(10)
I am trying to output a dataframe and a chart for each of my 'vals'. I'm struggling to piece together some of these Pythonic basics.
Flow: I take the dataframe, do a groupby, get the percentage of total... Output a table and a chart. However, I want to loop through this process, the first time with a dataframe filter on Reviewed?=='Yes', and then by No.
# Sample data: four US rows, each with a gender and a reviewed flag.
data = {'Region': ["US", "US", "US","US"],
'Gender': ["M","F","F","M"],
'Reviewed?': ["Yes","Yes","No","No"]}
df = pd.DataFrame(data, columns=['Region','Gender','Reviewed?'])
# Intended flow: for each value in vals, filter df on 'Reviewed?', count rows
# per Gender, turn the counts into a share of the total, then plot the table.
# NOTE(review): the indentation of this snippet was lost, so the exact nesting
# of the lines below (in particular of the return) cannot be confirmed.
def func(df):
vals = ['Yes','No']
# NOTE(review): two loops over the same list -- i indexes vals while x iterates it.
for i in range(len(vals)):
for x in vals:
# NOTE(review): gb and total are indexed here but never created in this snippet.
gb[i] = df[df['Reviewed?']==x].groupby(['Gender'])['Region'].count().reset_index()
total[i] = gb[i]['Region'].sum()
gb[i]['Percentage'] = (gb[i]['Region'] / total[i])
gb[i] = gb[i].sort_values(by='Percentage', ascending=False)
# NOTE(review): after the groupby, 'Region' holds the counts; x='Region' puts
# those counts on the x axis rather than the gender labels.
sns.barplot(data=gb[i], x='Region', y='Percentage')
plt.show()
return gb[i]
A few of the error messages:
ValueError: could not broadcast input array from shape (0,2) into shape (0)
ValueError: cannot copy sequence with size 2 to array axis with dimension 0
ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series
Update
Here is a brute force version of what I want. I just want a more efficient and dynamic way to do this.
Note, I wasn't originally explicit that I wanted to keep the counts in the final dataframe...
import pandas as pd
import seaborn as sns
data = {'Region': ["US", "US", "US", "US"],
        'Gender': ["M", "F", "F", "M"],
        'Reviewed?': ["Yes", "Yes", "No", "No"]}
df = pd.DataFrame(data, columns=['Region', 'Gender', 'Reviewed?'])


def func(df):
    """Plot and return the per-gender share of rows, once per review status.

    For 'Reviewed?' == 'No' and then 'Yes': count rows per Gender (the count
    lands in the 'Region' column), add a 'Percentage' column holding each
    count divided by the total, sort by it in descending order and draw a
    bar chart with the y axis fixed to 0..1.

    Returns
    -------
    (notyetreviewed, reviewed) : the two summary frames, counts included.
    """
    gb = df[df['Reviewed?']=='No'].groupby(['Gender'])['Region'].count().reset_index()
    total = gb['Region'].sum()
    gb['Percentage'] = (gb['Region'] / total)
    notyetreviewed = gb.sort_values(by='Percentage', ascending=False)
    sns.barplot(data=notyetreviewed, x='Gender', y='Percentage')
    plt.ylim(0, 1)
    plt.show()

    gb = df[df['Reviewed?']=='Yes'].groupby(['Gender'])['Region'].count().reset_index()
    total = gb['Region'].sum()
    gb['Percentage'] = (gb['Region'] / total)
    reviewed = gb.sort_values(by='Percentage', ascending=False)
    # Same order as the first chart: draw the bars, then pin the axis range.
    sns.barplot(data=reviewed, x='Gender', y='Percentage')
    plt.ylim(0, 1)
    plt.show()
    return notyetreviewed, reviewed


func(df)
You can try something like this:
import pandas as pd
import matplotlib.pyplot as plt

data = {'Region': ["US", "US", "US", "US"],
        'Gender': ["M", "F", "F", "M"],
        'Reviewed?': ["Yes", "Yes", "No", "No"]}
df = pd.DataFrame(data, columns=['Region', 'Gender', 'Reviewed?'])

for outcome in ['Yes', 'No']:
    # Share of each gender among the rows with this 'Reviewed?' value.
    filtered = df[df['Reviewed?'].eq(outcome)]['Gender'].value_counts(normalize=True)
    # Give every outcome its own axes; plotting both series without an explicit
    # target would stack the second chart on top of the first.
    fig, ax = plt.subplots()
    filtered.plot.bar(ax=ax, title=f'Reviewed? = {outcome}')
    plt.show()
In this case, I'm filtering the DF on each loop by the Reviewed? outcome and then getting the proportional values for male and female. Your question poses a binary choice, but I suppose it could be extended by for outcome in df['Reviewed?'].unique():
This is a marginal improvement. It would be nice to see a more Pythonic solution that wouldn't require me to hard code 'Reviewed?' into the function call...
import pandas as pd
import seaborn as sns
data = {'Region': ["US", "US", "US", "US"],
        'Gender': ["M", "F", "F", "M"],
        'Reviewed?': ["Yes", "Yes", "No", "No"]}
df = pd.DataFrame(data, columns=['Region', 'Gender', 'Reviewed?'])


def func(df, group, reviewed, column='Reviewed?'):
    """Plot and return the share of rows per ``group`` value.

    Parameters
    ----------
    df : source frame; must contain ``group``, ``column`` and 'Region'.
    group : column to group by; also used as the x axis of the chart.
    reviewed : list of values of ``column`` to keep before grouping.
    column : column the ``reviewed`` filter applies to.

    Returns
    -------
    Frame with one row per ``group`` value: the row count (in 'Region') and
    its share of the total (in 'Percentage').
    """
    selected = df[df[column].isin(reviewed)]
    counts = selected.groupby([group])['Region'].count().reset_index()
    counts['Percentage'] = counts['Region'] / counts['Region'].sum()
    # Plot against the grouping column that was passed in, not a fixed one.
    sns.barplot(data=counts, x=group, y='Percentage')
    plt.ylim(0, 1)
    plt.show()
    return counts


# NOTE(review): the second call overwrites df1 with the 'No' result.
df1 = func(df, 'Gender', ['Yes'])
df1 = func(df, 'Gender', ['No'])
[Edited to more clearly state root problem, which behaves differently if you use numpy 1.8 as dmvianna points out]
I have a DataFrame that has time stamps and other data. In the end I would like to not use a formatted time as the index because it messes with matplotlib's 3D plotting. I also want to perform a groupby to populate some flag fields. This is causing me to run into a number of weird errors. The first two work as I would expect. Once I bring pd.to_datetime into the picture it starts throwing errors.
runs as expected:
import pandas as pd
import numpy as np
# Variant 1: integer 'time' column, default integer index.
df = pd.DataFrame({'time': np.random.randint(100000, size=1000),
                   'type': np.random.randint(10, size=1000),
                   'value': np.random.rand(1000)})
df['high'] = 0


def high_low(group):
    """Set ``high`` to 1 for every row of a group whose mean ``value`` exceeds 0.5.

    Groups at or below 0.5 are returned untouched, so they keep the 0 that
    was assigned before grouping.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group


grouped = df.groupby('type')
df = grouped.apply(high_low)
works fine:
# Variant 2: 'time' converted to datetimes but kept as an ordinary column.
df = pd.DataFrame({'time': np.random.randint(100000, size=1000),
                   'type': np.random.randint(10, size=1000),
                   'value': np.random.rand(1000)})
df.time = pd.to_datetime(df.time, unit='s')
df['high'] = 0


def high_low(group):
    """Set ``high`` to 1 for every row of a group whose mean ``value`` exceeds 0.5.

    Groups at or below 0.5 are returned untouched, so they keep the 0 that
    was assigned before grouping.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group


grouped = df.groupby('type')
df = grouped.apply(high_low)
throws error:
ValueError: Shape of passed values is (3, 1016), indices imply (3, 1000)
# Variant 3: the datetime 'time' column becomes the index. The timestamps are
# random draws, so the index is not guaranteed to be unique.
df = pd.DataFrame({'time': np.random.randint(100000, size=1000),
                   'type': np.random.randint(10, size=1000),
                   'value': np.random.rand(1000)})
df.time = pd.to_datetime(df.time, unit='s')
df = df.set_index('time')
df['high'] = 0


def high_low(group):
    """Set ``high`` to 1 for every row of a group whose mean ``value`` exceeds 0.5.

    Groups at or below 0.5 are returned untouched, so they keep the 0 that
    was assigned before grouping.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group


grouped = df.groupby('type')
df = grouped.apply(high_low)
throws error:
ValueError: Shape of passed values is (3, 1016), indices imply (3, 1000)
# Variant 4: keep the raw integers in 'epoch', index by the datetime 'time',
# then re-index by 'epoch' (which discards the 'time' index again).
df = pd.DataFrame({'time': np.random.randint(100000, size=1000),
                   'type': np.random.randint(10, size=1000),
                   'value': np.random.rand(1000)})
df['epoch'] = df.time
df.time = pd.to_datetime(df.time, unit='s')
df = df.set_index('time')
df = df.set_index('epoch')
df['high'] = 0


def high_low(group):
    """Set ``high`` to 1 for every row of a group whose mean ``value`` exceeds 0.5.

    Groups at or below 0.5 are returned untouched, so they keep the 0 that
    was assigned before grouping.
    """
    if group.value.mean() > .5:
        group.high = 1
    return group


grouped = df.groupby('type')
df = grouped.apply(high_low)
Anyone know what I'm missing / doing wrong?
Instead of using pd.to_datetime, I would use np.datetime64. It will work in columns and offers the same functionality as you expect from a datetime.index (np.datetime64 is a building block for datetime.index).
import numpy as np
# np.datetime64(...) builds a single scalar and cannot take a whole column, so
# convert through the column's underlying array instead: the integer epoch
# seconds in data['time'] are reinterpreted as datetime64 with second precision.
data['time2'] = data['time'].values.astype('datetime64[s]')
Check the Docs
This would also lead to the same result:
import pandas as pd
# Same conversion through pandas: read the integers in data['time'] as
# seconds since the epoch.
epoch_seconds = data['time']
data['time2'] = pd.to_datetime(epoch_seconds, unit='s')
Notice though that I'm using pandas 0.12.0 and Numpy 1.8.0. Numpy 1.7 has issues referred to in the comments below.