Compute the rolling mean in pandas - python

I ran the following code:
import numpy as np
import pandas as pd

# Fix the RNG seed so the example is reproducible.
np.random.seed(0)

# Build the toy dataset: a linear sales trend whose noise grows with time.
period = np.arange(1, 101)
leads = np.random.uniform(1, 20, 100)
sales = 60 + 2 * period + np.random.normal(loc=0, scale=.5 * period, size=100)
df = pd.DataFrame({'period': period, 'leads': leads, 'sales': sales})

# Peek at the first rows.
df.head(10)

# Centered 5-row rolling mean; min_periods=1 lets the edge windows shrink,
# so the first row averages rows 0-2 and the last row averages rows 97-99.
df['rolling_sales_5'] = df['sales'].rolling(5, center=True, min_periods=1).mean()
df.head(10)
But I do not understand how the first two observations and the last two observations of the rolling_sales_5 variable are generated. Any idea?

Related

Generating and Storing Samples of an Exponential Distribution with a name for each sample using a loop

I've got a weird question for a class project. Assuming X ~ Exp(Lambda), Lambda=1.6, I have to generate 100 samples of X, with the indices corresponding to the sample size of each generated sample (S1, S2 ... S100). I've worked out a simple loop which generates the required samples in an array, but I am not able to rename the array.
First attempt:
import numpy as np

# Generate 100 samples of X ~ Exp(lambda=1.6); the i-th sample holds i draws.
# numpy parameterizes the exponential by scale = 1/lambda.
# (The original snippet also imported matplotlib.pyplot, but never used it.)
samples = []
for i in range(1, 101):
    samples.append(np.random.exponential(scale=1/1.6, size=i))
Second attempt:
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): the loop body below lost its indentation when pasted.
# BUG: `samples` is re-bound on every pass, so only the final draw survives.
# BUG: scale=1/1.2 contradicts the stated Lambda=1.6 (scale should be 1/1.6).
for i in range(1,101,1):
samples = np.random.exponential(scale= 1/1.2, size= i)
col = f'samples {i}'
# BUG: `df_samples` is referenced before it is created on the next line, and
# `exponential_sample` is never assigned — this raises NameError. Also `pd`
# is not imported in this snippet.
df_samples[col] = exponential_sample
df_samples = pd.DataFrame(samples)
An example how I would like to visualize the data:
# drawing 50 random samples of size 2 from the exponentially distributed population
# NOTE(review): `rate` is never defined in this snippet — presumably the
# exponential rate lambda (numpy's scale = 1/lambda); confirm before running.
# NOTE(review): the loop body below lost its indentation when pasted; the
# three lines after `for` should be indented one level.
sample_size = 2
df2 = pd.DataFrame(index= ['x1', 'x2'] )
for i in range(1, 51):
exponential_sample = np.random.exponential((1/rate), sample_size)
col = f'sample {i}'
df2[col] = exponential_sample
# Taking a peek at the samples
df2
But instead of having a simple size = 2, I would like to have sample size = i. This way, I will be able to generate 1 row for the first column (S1), 2 rows for the second column (S2), until I reach 100 rows for the 100th column (S100).
You cannot stick vectors of different lengths easily into a df so your mock-up code would not work, but you can concat one vector at a time:
# Build one DataFrame column per sample; pd.concat(axis=1) aligns on the row
# index, padding the shorter columns with NaN so vectors of different lengths
# can share one frame. (Indentation of the loop body restored.)
df = pd.DataFrame()
for i in range(100, 10100, 100):
    tmp = pd.DataFrame({f'S{i}': np.random.exponential(scale=1/1.2, size=i)})
    df = pd.concat([df, tmp], axis=1)
Use a dict instead maybe?
# Same idea with a dict: key = sample size, value = the drawn vector.
# A dict sidesteps the ragged-column problem entirely.
# (Indentation of the loop body restored.)
samples = {}
for i in range(100, 10100, 100):
    samples[i] = np.random.exponential(scale=1/1.2, size=i)
Then you can convert it into a pandas Dataframe if you like.

Calculate rolling mean, max, min, std of time series pandas dataframe

I'm trying to calculate a rolling mean, max, min, and std for specific columns inside a time series pandas dataframe. But I keep getting NaN for the lagged values and I'm not sure how to fix it. My MWE is:
import numpy as np
import pandas as pd
# original data
df = pd.DataFrame()
np.random.seed(0)
days = pd.date_range(start='2015-01-01', end='2015-05-01', freq='1D')
df = pd.DataFrame({'Date': days, 'col1': np.random.randn(len(days)), 'col2': 20+np.random.randn(len(days)), 'col3': 50+np.random.randn(len(days))})
df = df.set_index('Date')
print(df.head(10))
# NOTE(review): the function body below lost its indentation when pasted.
# Adds rolling mean/max/min/std columns for col2 and col3 over `window` rows.
def add_lag(dfObj, window):
cols = ['col2', 'col3']
for col in cols:
rolled = dfObj[col].rolling(window)
# BUG (the question's NaN problem): reset_index() turns each rolling
# result into a frame with a fresh RangeIndex, so assigning lag_mean[col]
# back into the Date-indexed dfObj aligns RangeIndex against DatetimeIndex
# and fills the new columns with NaN. The answer fixes this by simply not
# calling reset_index().
lag_mean = rolled.mean().reset_index()#.astype(np.float16)
lag_max = rolled.max().reset_index()#.astype(np.float16)
lag_min = rolled.min().reset_index()#.astype(np.float16)
lag_std = rolled.std().reset_index()#.astype(np.float16)
dfObj[f'{col}_mean_lag{window}'] = lag_mean[col]
dfObj[f'{col}_max_lag{window}'] = lag_max[col]
dfObj[f'{col}_min_lag{window}'] = lag_min[col]
dfObj[f'{col}_std_lag{window}'] = lag_std[col]
# add lag feature for 1 day, 3 days
add_lag(df, window=1)
add_lag(df, window=3)
print(df.head(10))
print(df.tail(10))
Just don't do reset_index(). Then it works.
import numpy as np
import pandas as pd

# original data: three noisy columns indexed by a daily DatetimeIndex.
df = pd.DataFrame()
np.random.seed(0)
days = pd.date_range(start='2015-01-01', end='2015-05-01', freq='1D')
df = pd.DataFrame({'Date': days, 'col1': np.random.randn(len(days)), 'col2': 20+np.random.randn(len(days)), 'col3': 50+np.random.randn(len(days))})
df = df.set_index('Date')
print(df.head(10))

def add_lag(dfObj, window):
    """Add rolling mean/max/min/std columns for col2 and col3 in place.

    Assigning the rolling Series directly (no reset_index) keeps the
    DatetimeIndex, so the new values align with dfObj row-for-row instead
    of becoming NaN. The first window-1 rows of each new column are NaN
    because those windows are incomplete.
    """
    cols = ['col2', 'col3']
    for col in cols:
        rolled = dfObj[col].rolling(window)
        dfObj[f'{col}_mean_lag{window}'] = rolled.mean()
        dfObj[f'{col}_max_lag{window}'] = rolled.max()
        dfObj[f'{col}_min_lag{window}'] = rolled.min()
        dfObj[f'{col}_std_lag{window}'] = rolled.std()

# add lag feature for 1 day, 3 days
add_lag(df, window=1)
add_lag(df, window=3)
print(df.head(10))
print(df.tail(10))
Whenever you use the rolling function, it creates NaN for the values that it cannot calculate.
For example, consider a single column, col1 = [2, 4, 10, 6], and a rolling window of 2.
The output of the rolling window will be NaN, 3, 7, 8.
This is because the rolling average of the first value cannot be calculated: the window looks at the given index and the previous value, and for the first index there is no previous value.
Then, when you calculate the mean, std, etc you are calculating a series functions without accounting for the NaN. In R, you can usually just do na.rm=T; however, in Python it is recommended that you drop the NaN values, then calculate the series function.

how to match variables and values to the for loop on pandas dataframe?

import pandas as pd
import numpy as np
from scipy.stats import shapiro
# Three toy variables of five random integers each (0-9, unseeded).
s1 = np.random.randint(10, size = 5)
s2 = np.random.randint(10, size = 5)
s3 = np.random.randint(10, size = 5)
# Column label -> data mapping; the real data has thousands of such columns.
x = {"var1": s1, "var2": s2, "var3": s3}
df = pd.DataFrame(x)
My data is similar to this, but it's made up of thousands of variables. I want to print the name of the variable corresponding to each value.
# Iterating a DataFrame yields its column labels, so `i` is the column name.
# (Indentation of the loop body restored; note the printed message does not
# include `i` — that is exactly what the answer below adds.)
for i in df:
    test_statistics, pvalue = shapiro(df[i])
    print('Test Statistics for xvalue = %.4f, p-value = %.4f' % (test_statistics, pvalue))
I want the output to be like this:
Test statistics for s1...........)
Test statistics for s2...........)
Test statistics for s3...........)
Use f-strings:
# The f-string interpolates the column label `i` into each report line.
# (Indentation of the loop body restored.)
for i in df:
    test_statistics, pvalue = shapiro(df[i])
    print(f'Test statistics for {i}:xvalue = {test_statistics:.4f}, p-value = {pvalue:.4f}')

How to more efficiently calculate a rolling ratio

I have data whose length is over 3000.
Below is the code for computing a 20-day value (Volume Ratio in the stock market).
It took more than 2 minutes to run.
Is there any good way to reduce the running time?
import pandas as pd
import numpy as np
# NOTE(review): pandas.io.data was removed from pandas; this import now
# lives in the separate pandas-datareader package — confirm before reuse.
from pandas.io.data import DataReader
import matplotlib.pylab as plt
# Daily OHLCV prices for one KRX ticker, fetched from Yahoo (network I/O).
data = DataReader('047040.KS','yahoo',start='2010')
data['vr']=0
data['Volume Ratio']=0
data['acend']=0
# vr ends up as: 1 on up days, 0.5 on flat days (sign == 0), 0 on down days.
data['vr'] = np.sign(data['Close']-data['Open'])
data['vr'] = np.where(data['vr']==0,0.5,data['vr'])
data['vr'] = np.where(data['vr']<0,0,data['vr'])
# acend: volume attributed to advancing/flat days via the vr weight.
data['acend'] = np.multiply(data['Volume'],data['vr'])
# NOTE(review): the loop body below lost its indentation when pasted.
# This is the slow part: chained assignment data['Volume Ratio'][i] (which
# also triggers SettingWithCopy warnings on modern pandas) plus re-summing
# the slices on every iteration, i.e. O(n * window) work overall.
# NOTE(review): the slice [i-19:i] covers 19 rows and excludes row i —
# presumably a 20-day window was intended; verify against the rolling(20)
# rewrite in the answer.
for i in range(len(data['Open'])):
if i<19:
data['Volume Ratio'][i]=0
else:
data['Volume Ratio'][i] = ((sum(data['acend'][i-19:i]))/((sum(data['Volume'][i-19:i])-sum(data['acend'][i-19:i]))))*100
Consider using conditional row selection and rolling.sum():
# Zero out the warm-up rows, then assign the vectorised rolling ratio.
# Fixed from the original snippet: the expression was missing a closing
# parenthesis (SyntaxError), and `data.loc[:20:, 'acend']` looks like a typo
# for the full column — the row restriction already happens on the left side.
data.loc[data.index[:20], 'Volume Ratio'] = 0
data.loc[data.index[20:], 'Volume Ratio'] = (
    data['acend'].rolling(window=20).sum()
    / (data['Volume'].rolling(window=20).sum() - data['acend'].rolling(window=20).sum())
) * 100
or, simplified - .rolling.sum() will create np.nan for the first 20 values so just use .fillna(0):
# Fixed from the original snippet: `.div(` was never closed (SyntaxError).
# fillna(0) replaces the NaNs produced for the first 19 (incomplete) windows.
data['new_col'] = data['acend'].rolling(window=20).sum().div(
    data['Volume'].rolling(window=20).sum().subtract(data['acend'].rolling(window=20).sum())
).mul(100).fillna(0)

Pandas Time Series and groupby

[Edited to more clearly state root problem, which behaves differently if you use numpy 1.8 as dmvianna points out]
I have a DataFrame that has time stamps and other data. In the end I would like to not use a formatted time as the index because it messes with matplotlib's 3d plotting. I also want to perform a groupby to populate some flag fields. This is causing me to run into a number of weird errors. The first two work as I would expect. Once I bring pd.to_datetime into the picture it starts throwing errors.
runs as expected:
import pandas as pd
import numpy as np

# 1000 rows: integer epoch-like 'time', 10 group labels, uniform values.
df = pd.DataFrame({'time': np.random.randint(100000, size=1000),
                   'type': np.random.randint(10, size=1000),
                   'value': np.random.rand(1000)})
df['high'] = 0

def high_low(group):
    # Set the whole 'high' column to 1 for groups whose mean value exceeds 0.5.
    # (Indentation of this body restored from the mangled paste.)
    if group.value.mean() > .5:
        group.high = 1
    return group

grouped = df.groupby('type')
df = grouped.apply(high_low)
works fine:
# Second variant: same as above but 'time' is parsed to datetimes first.
# (pd/np are imported by the preceding example; function body indentation
# restored from the mangled paste.)
df = pd.DataFrame({'time': np.random.randint(100000, size=1000),
                   'type': np.random.randint(10, size=1000),
                   'value': np.random.rand(1000)})
df.time = pd.to_datetime(df.time, unit='s')
df['high'] = 0

def high_low(group):
    # Set the whole 'high' column to 1 for groups whose mean value exceeds 0.5.
    if group.value.mean() > .5:
        group.high = 1
    return group

grouped = df.groupby('type')
df = grouped.apply(high_low)
throws error:
ValueError: Shape of passed values is (3, 1016), indices imply (3, 1000)
# Third variant from the question: like the second, but the parsed datetimes
# are promoted to the index before grouping.
df = pd.DataFrame({'time':np.random.randint(100000, size=1000),
'type':np.random.randint(10, size=1000),
'value':np.random.rand(1000)})
df.time = pd.to_datetime(df.time, unit='s')
# Random epoch seconds can repeat, so this index may contain duplicates.
df = df.set_index('time')
df['high'] = 0
# NOTE(review): the function body below lost its indentation when pasted.
def high_low(group):
if group.value.mean() > .5:
group.high = 1
return group
grouped = df.groupby('type')
# Reportedly raises "ValueError: Shape of passed values is (3, 1016),
# indices imply (3, 1000)" — the accepted explanation is a NumPy 1.7
# version issue; see the version note at the end of the thread.
df = grouped.apply(high_low)
throws error:
ValueError: Shape of passed values is (3, 1016), indices imply (3, 1000)
# Fourth variant from the question: keep the raw epoch in its own column,
# then index by it.
df = pd.DataFrame({'time':np.random.randint(100000, size=1000),
'type':np.random.randint(10, size=1000),
'value':np.random.rand(1000)})
df['epoch'] = df.time
df.time = pd.to_datetime(df.time, unit='s')
# NOTE(review): the second set_index call replaces (and discards) the
# 'time' index set on the previous line, so that line is effectively dead.
df = df.set_index('time')
df = df.set_index('epoch')
df['high'] = 0
# NOTE(review): the function body below lost its indentation when pasted.
def high_low(group):
if group.value.mean() > .5:
group.high = 1
return group
grouped = df.groupby('type')
df = grouped.apply(high_low)
Anyone know what I'm missing / doing wrong?
Instead of using pd.to_datetime, I would use np.datetime64. It will work in columns and offers the same functionality as you expect from a datetime.index (np.datetime64 is a building block for datetime.index).
import numpy as np
# NOTE(review): np.datetime64 is a scalar constructor; on modern NumPy,
# passing a whole Series with a unit raises — the surrounding answer says it
# relies on NumPy 1.8 behavior. Verify before reuse. `data` is assumed to be
# a DataFrame with an epoch-seconds 'time' column — confirm against caller.
data['time2'] = np.datetime64(data.time, 's')
Check the Docs
This would also lead to the same result:
import pandas as pd
# Vectorized conversion: interpret data.time as epoch seconds and store the
# resulting datetime64 values in a new 'time2' column.
data['time2'] = pd.to_datetime(data.time, unit='s')
Notice though that I'm using pandas 0.12.0 and Numpy 1.8.0. Numpy 1.7 has issues referred to in the comments below.

Categories