Xarray resample inter annually - python

I am trying to resample my data annually, but struggle to set the start day of resampling.
import xarray as xr
import numpy as np
import pandas as pd
# 36 consecutive daily timestamps beginning on 15 December 1999.
time_index = pd.date_range("15/12/1999", periods=36)
# 36 evenly spaced sample values from 0 to 11, one per day.
values = np.linspace(0, 11, num=36)
da = xr.DataArray(values, coords=[time_index], dims="time")
# Annual means using the default yearly bins; the bin edges are not aligned
# to 15 December, which is the problem the question is about.
da.resample(time="1Y").mean()
What I am trying to achieve is to get the means of the following periods: 15/12/1999-15/12/2000, 15/12/2000-15/12/2001, 15/12/2001-15/12/2002, ...

I solved it by shifting the time axis back to the first day of the month, resampling with the corresponding pandas anchored offset, and then shifting the time axis forward again.
import xarray as xr
import numpy as np
import pandas as pd
# Two years of daily data starting mid-month: 365 zeros followed by 365 ones.
da = xr.DataArray(
    np.concatenate([np.zeros(365), np.ones(365)]),
    coords=[pd.date_range("06/15/2017", "06/14/2019", freq='D')],
    dims="time",
)
# Anchored yearly offsets can only start on the first day of a month, so move
# the whole time axis back by the day-of-month offset of the first timestamp.
days_to_first_of_month = pd.Timedelta(days=int(da.time.dt.day[0]) - 1)
da['time'] = da.time - days_to_first_of_month
# Fixed upper-case English abbreviations: strftime("%b") depends on the active
# locale and returns mixed case, neither of which is safe in an offset alias.
month_abbreviations = (
    "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
)
month = month_abbreviations[int(da.time.dt.month[0]) - 1]
# Year-start bins anchored at the (shifted) starting month.
resampled = da.resample(time=f'AS-{month}').sum()
# Undo the shift so each bin label is the real start day of its period.
resampled['time'] = resampled.time + days_to_first_of_month
print(resampled)
Is there a more efficient or clean way?

Related

How can I speed up xarray resample (much slower than pandas resample)

Here is an MWE for resampling a time series in xarray vs. pandas. The 10Min resample takes 6.8 seconds in xarray and 0.003 seconds in pandas. Is there some way to get the Pandas speed in xarray? Pandas resample seems to be independent of the period, while xarray scales with the period.
import numpy as np
import xarray as xr
import pandas as pd
import time
def make_ds(freq, size=100000):
    """Build a Dataset with one random variable ``foo`` on a regular time axis.

    Parameters
    ----------
    freq : str
        Spacing between consecutive timestamps, passed to ``pd.date_range``.
    size : int, optional
        Number of timestamps (and samples); defaults to 100000.

    Returns
    -------
    xr.Dataset
        Dataset with variable ``foo`` along dimension ``time``, starting at
        2000-01-01.
    """
    times = pd.date_range('2000-01-01', periods=size, freq=freq)
    foo = xr.DataArray(
        data=np.random.random(size),
        dims=['time'],
        coords={'time': times},
    )
    return xr.Dataset({'foo': foo})
# Time an hourly-mean resample in xarray and in pandas for three input
# spacings; a coarser spacing means the same number of samples spans more
# hourly bins.
for f in ["1s", "1Min", "10Min"]:
    ds = make_ds(f)
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() can jump if the wall clock is adjusted.
    start = time.perf_counter()
    ds_r = ds.resample({'time': "1H"}).mean()
    print(f, 'xr', str(time.perf_counter() - start))
    # The pandas timing includes the Dataset -> DataFrame conversion.
    start = time.perf_counter()
    ds_r = ds.to_dataframe().resample("1H").mean()
    print(f, 'pd', str(time.perf_counter() - start))
: 1s xr 0.040313720703125
: 1s pd 0.0033435821533203125
: 1Min xr 0.5757267475128174
: 1Min pd 0.0025794506072998047
: 10Min xr 6.798743486404419
: 10Min pd 0.0029947757720947266
As per the xarray GH issue this is an implementation issue. The solution is to do the resampling (actually a GroupBy) in other code. My solution is to use the fast Pandas resample and then rebuild the xarray dataset:
# Hourly means computed with the fast pandas resample (result is a DataFrame).
df_h = ds.to_dataframe().resample("1H").mean()
# Wrap every resampled column in a DataArray on the new hourly time index,
# carrying over the attributes of the matching variable in ``ds``.
vals = [
    xr.DataArray(
        data=df_h[c],
        dims=['time'],
        coords={'time': df_h.index},
        attrs=ds[c].attrs,
    )
    for c in df_h.columns
]
# Reassemble the Dataset, keeping the dataset-level attributes.
ds_h = xr.Dataset(
    {name: array for name, array in zip(df_h.columns, vals)},
    attrs=ds.attrs,
)

How to apply euclidean distance to dataframe. Calculate each row

Please help me with a problem I have been stuck on for about two weeks.
I have a DataFrame that I got from the Alpha Vantage API, and I want to use "apply" on it.
Specifically, I want to compute the Euclidean distance for each row of the DataFrame.
import math
import numpy as np
import pandas as pd
from scipy.spatial import distance
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.neighbors import KNeighborsRegressor
from alpha_vantage.timeseries import TimeSeries
from services.KEY import getApiKey
# Create the Alpha Vantage time-series client using the key returned by
# getApiKey(); output_format='pandas' presumably makes query results come
# back as DataFrames -- confirm against the alpha_vantage documentation.
ts = TimeSeries(key=getApiKey(), output_format='pandas')
And in my picture I got this
My chart (sorry can't post image because of my reputation)
In my code
stock, meta_data = ts.get_daily_adjusted(symbol, outputsize='full')
stock = stock.sort_values('date')
# Trailing underscore so the built-in open() is not shadowed.
open_ = stock['1. open'].values
low = stock['3. low'].values
high = stock['2. high'].values
close = stock['4. close'].values
sorted_date = stock.index.get_level_values(level='date')
# Build the frame column by column so every column keeps its own dtype.
# Stacking dates and prices into one NumPy array forces a single common
# dtype, which breaks the numeric mean()/std() used for normalisation below.
df = pd.DataFrame({
    'date': sorted_date,
    'open': open_,
    'low': low,
    'high': high,
    'close': close,
})
df = df[df['open'] > 0]
df = df[(df['date'] >= "2016-01-01") & (df['date'] <= "2018-12-31")]
df = df.reset_index(drop=True)
df['close_next'] = df['close'].shift(-1)
# Assign the filled result back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to modify ``df``.
df['daily_return'] = df['close'].pct_change(1).fillna(0)
# A list of labels selects several columns; a bare tuple is looked up as one
# (non-existent) column label and raises KeyError.
stock_numeric_close_dailyreturn = df[['close', 'daily_return']]
stock_normalized = (
    stock_numeric_close_dailyreturn - stock_numeric_close_dailyreturn.mean()
) / stock_numeric_close_dailyreturn.std()
# Reference day that every other day is compared with. The original never
# defined ``date_normalized``; .iloc[0] turns the one-row selection into the
# 1-D vector that distance.euclidean expects.
date_normalized = stock_normalized[df['date'] == "2016-06-29"].iloc[0]
euclidean_distances = stock_normalized.apply(
    lambda row: distance.euclidean(row, date_normalized), axis=1
)
distance_frame = pd.DataFrame(
    data={"dist": euclidean_distances, "idx": euclidean_distances.index}
)
distance_frame.sort_values("dist", inplace=True)
# Row 0 is the reference day itself (distance 0); row 1 is its nearest neighbour.
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_date = df.loc[int(second_smallest)]["date"]
And I want that my chart like this
The chart that I want
And the code from this picture
# Reference version: find the day most similar to 2016-06-29 by Euclidean
# distance over the z-score-normalised 'Close' and 'DailyReturn' columns.
# NOTE(review): ``stock`` is not defined in this snippet; presumably a
# DataFrame with 'Date', 'Close' and 'DailyReturn' columns -- confirm.
distance_columns = ['Close', 'DailyReturn']
stock_numeric = stock[distance_columns]
# Column-wise z-score: subtract the mean, divide by the standard deviation.
stock_normalized = (stock_numeric - stock_numeric.mean()) / stock_numeric.std()
stock_normalized.fillna(0, inplace = True)
# One-row selection for the reference date.
# NOTE(review): this is a DataFrame, not a 1-D vector; distance.euclidean may
# reject it depending on the SciPy version -- confirm.
date_normalized = stock_normalized[stock["Date"] == "2016-06-29"]
euclidean_distances = stock_normalized.apply(lambda row: distance.euclidean(row, date_normalized), axis = 1)
# NOTE(review): uses the name ``pandas`` rather than ``pd`` -- requires a
# matching ``import pandas``.
distance_frame = pandas.DataFrame(data = {"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values("dist", inplace=True)
# Row 0 is expected to be the reference day itself; row 1 is the closest other day.
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_date = stock.loc[int(second_smallest)]["Date"]
I tried to figure it out: it seems that "apply" behaves differently on a DataFrame returned by the API (pandas output format) than on one loaded with pandas from a CSV file.
Is there an alternative that gives the same output for both data sources (pandas and CSV)?
Thank you!
nb: sorry if my english bad.

Biggest rise in a time window in a time series

Wondering if there is a fast way of getting the biggest rise in a time series within a window.
Intended code is...
import datetime
import numpy as np
import pandas as pd
# Build a year of daily sample data, newest first: today, yesterday, ...
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
# One random integer per date, drawn from 1..9 (``high`` is exclusive).
data = np.random.randint(low=1, high=10, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
def biggest_rise(df, windowsize=10):
    '''Return the biggest rise in ``value`` within any window of rows.

    A rise is a later value minus an earlier one, where both fall inside the
    same window of ``windowsize`` consecutive rows after sorting by ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column (used for ordering) and a numeric
        ``value`` column.
    windowsize : int, optional
        Number of consecutive rows in a window; defaults to 10.

    Returns
    -------
    float
        The largest rise found; 0 if the series never rises inside a window,
        NaN for an empty frame.
    '''
    values = df.sort_values('date')['value']
    # Lowest value in the window ending at each row (the row itself included);
    # min_periods=1 lets the first, shorter windows take part as well.
    trailing_low = values.rolling(window=windowsize, min_periods=1).min()
    # Subtracting the trailing low only ever measures low-then-high moves, so
    # falls are never reported as rises.
    return (values - trailing_low).max()
I don't really get what you mean by 'biggest rise', but using rolling may be helpful. For example, with this code you can get the difference between the maximum and minimum value within a 10-day window:
# Sort and index by date once instead of repeating the work for each operand.
by_date = df.sort_values(['date']).set_index('date')
# Spread (max - min) of every trailing 10-day window.
by_date.rolling('10d').max() - by_date.rolling('10d').min()
I think I found the answer... as per code below. Upped the high to 10K to really see the changes:
import datetime
import numpy as np
import pandas as pd

# A year of daily sample data, newest first, with random values in 1..9999.
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(365)]
data = np.random.randint(low=1, high=10000, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
window = 10
# One slice per full window. The last full window starts at row
# len(df) - window, so the range must include that start position.
dfs = [df.iloc[i:i + window] for i in range(len(df) - window + 1)]
# Largest max-min spread over all windows (does not tell a rise from a fall).
biggest_rise = max(d.value.max() - d.value.min() for d in dfs)
Takes 112 ms for 365 datapoints. Anything better is welcome.
The biggest_rise could be the biggest_fall in the window. Don't know how to differentiate.
Here is a better answer to get the maximum rise, using @TywinLannister88's suggestion:
import datetime

import numpy as np
import pandas as pd

# A year of daily sample data, newest first, with random values in 1..9999.
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
data = np.random.randint(low=1, high=10000, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
# Sort and index by date once; every step below works on this frame.
by_date = df.sort_values(['date']).set_index('date')
# 10-day rolling window: spread between the window maximum and minimum
df1 = by_date.rolling('10d').max() - by_date.rolling('10d').min()
# percent change over 10 rows to see if there is a rise or fall
df2 = by_date.value.pct_change(periods=10)
# filter out the rises (pctchange > 0) and find the maximum rise
# (df1 is a one-column frame, so pass its 'value' column as the new column)
df3 = by_date.assign(delta=df1['value'], pctchange=df2)
biggest_rise = df3[df3.pctchange > 0].pctchange.max()

datetime groupby on a multiindex

If I have a multiindex set up like:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from io import StringIO
# Inline sample data: a label, a US-style timestamp and a number per row.
csv = u"""string,date,number
a string1,2/5/11 9:16am,1.0
a string2,3/5/11 10:44pm,2.0
a string3,4/22/11 12:07pm,3.0
a string4,4/22/11 12:10pm,4.0
a string5,4/29/11 11:59am,1.0
a string6,5/2/11 1:41pm,2.0
a string7,5/2/11 2:02pm,3.0
a string8,5/2/11 2:56pm,4.0
a string9,5/2/11 3:00pm,5.0
a string10,5/2/14 3:02pm,6.0
a string11,5/2/14 3:18pm,7.0"""
df = pd.read_csv(StringIO(csv))
# Parse "month/day/2-digit-year hour:minute am/pm" strings into timestamps.
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%y %I:%M%p')
# Two-level index: level 'alpha' holds the timestamp, 'bravo' the label.
index_pairs = list(zip(df['date'], df['string']))
df.index = pd.MultiIndex.from_tuples(index_pairs, names=['alpha', 'bravo'])
How can I do a groupby on the alpha index by month and then sum? What I've tried is:
df.groupby(level='alpha').sum().groupby(df.index.month).sum()
which clearly doesn't work.
Like this?
df.groupby(df.index.get_level_values('alpha').month).number.sum()

How to more efficiently calculate a rolling ratio

I have a data set with more than 3000 rows.
Below is the code that computes a 20-day value (the Volume Ratio used in the stock market).
It takes more than 2 minutes to run.
Is there a good way to reduce the running time?
import pandas as pd
import numpy as np
from pandas.io.data import DataReader
import matplotlib.pylab as plt
data = DataReader('047040.KS','yahoo',start='2010')
# Per-day weight: 1 for an up day (Close > Open), 0.5 for an unchanged day,
# 0 for a down day.
direction = np.sign(data['Close'] - data['Open'])
data['vr'] = np.where(direction == 0, 0.5, np.where(direction < 0, 0, direction))
# Volume attributed to rising days.
acend = data['Volume'] * data['vr']
# 20-day Volume Ratio: rising volume over the remaining volume, in percent.
# The rolling window covers the current row and the 19 before it; the earlier
# slice [i-19:i] covered only 19 rows and left out the current day. Rolling
# sums also replace the per-row Python loop and its chained assignment.
window = 20
acend_sum = acend.rolling(window).sum()
volume_sum = data['Volume'].rolling(window).sum()
# Rows without a full window yet are NaN from rolling(); report them as 0.
data['Volume Ratio'] = (acend_sum / (volume_sum - acend_sum) * 100).fillna(0)
data['acend'] = acend
Consider using conditional row selection and rolling.sum():
window = 20
acend_sum = data['acend'].rolling(window=window).sum()
volume_sum = data['Volume'].rolling(window=window).sum()
ratio = acend_sum / (volume_sum - acend_sum) * 100
# rolling(window) has no full window for the first window - 1 rows: set those
# to 0 and fill the rest from the rolling ratio (aligned on the index).
data.loc[data.index[:window - 1], 'Volume Ratio'] = 0
data.loc[data.index[window - 1:], 'Volume Ratio'] = ratio.iloc[window - 1:]
or, simplified - .rolling(window=20).sum() yields np.nan for the first 19 rows (until the window is full), so just use .fillna(0):
# Rising volume over the remaining volume for each 20-row window, in percent;
# rows without a full window are NaN from rolling() and become 0.
data['new_col'] = (
    data['acend'].rolling(window=20).sum()
    .div(
        data['Volume'].rolling(window=20).sum()
        .subtract(data['acend'].rolling(window=20).sum())
    )
    .mul(100)
    .fillna(0)
)

Categories