How can I speed up xarray resample (much slower than pandas resample) - python

Here is an MWE for resampling a time series in xarray vs. pandas. The 10Min resample takes 6.8 seconds in xarray and 0.003 seconds in pandas. Is there some way to get the Pandas speed in xarray? The Pandas resample time seems to be independent of the source data's period, while xarray's grows with it.
import numpy as np
import xarray as xr
import pandas as pd
import time

def make_ds(freq):
    size = 100000
    times = pd.date_range('2000-01-01', periods=size, freq=freq)
    ds = xr.Dataset({
        'foo': xr.DataArray(
            data=np.random.random(size),
            dims=['time'],
            coords={'time': times}
        )})
    return ds

for f in ["1s", "1Min", "10Min"]:
    ds = make_ds(f)
    start = time.time()
    ds_r = ds.resample({'time': "1H"}).mean()
    print(f, 'xr', str(time.time() - start))
    start = time.time()
    ds_r = ds.to_dataframe().resample("1H").mean()
    print(f, 'pd', str(time.time() - start))
1s    xr 0.040313720703125
1s    pd 0.0033435821533203125
1Min  xr 0.5757267475128174
1Min  pd 0.0025794506072998047
10Min xr 6.798743486404419
10Min pd 0.0029947757720947266

As per the xarray GH issue, this is an implementation issue; the fix is to do the resampling (which is really a GroupBy) outside of xarray. My workaround is to use the fast Pandas resample and then rebuild the xarray Dataset:
df_h = ds.to_dataframe().resample("1H").mean() # what we want (quickly), but in Pandas form
vals = [xr.DataArray(data=df_h[c], dims=['time'], coords={'time': df_h.index}, attrs=ds[c].attrs)
        for c in df_h.columns]
ds_h = xr.Dataset(dict(zip(df_h.columns, vals)), attrs=ds.attrs)
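If every column shares the same 1-D time index, xr.Dataset.from_dataframe may do the rebuild in one call (a sketch; attrs still have to be copied back by hand):

ds_h = xr.Dataset.from_dataframe(df_h)
ds_h.attrs = ds.attrs
for c in df_h.columns:
    ds_h[c].attrs = ds[c].attrs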

Related

how to convert pandas dataframe to binary file in python

import numpy as np
import pandas as pd

def get_values_for_frequency(freq):
    # sampling information
    Fs = 100          # sample rate, number of samples per second
    T = 1 / Fs        # sampling period, seconds per sample
    t = 1             # seconds of sampling
    N = Fs * t        # total points in signal
    # signal information
    # freq = 100      # in hertz
    omega = 2 * np.pi * freq   # angular frequency for sine waves
    t_vec = np.arange(N) * T   # time vector for plotting
    y = np.sin(omega * t_vec)
    return y

df = pd.DataFrame(columns=['1Hz', '2Hz', '3Hz', '4Hz', '5Hz', '6Hz', '7Hz'])
df['1Hz'] = pd.Series(get_values_for_frequency(1))
df['2Hz'] = pd.Series(get_values_for_frequency(2))
df['3Hz'] = pd.Series(get_values_for_frequency(3))
df['4Hz'] = pd.Series(get_values_for_frequency(4))
df['5Hz'] = pd.Series(get_values_for_frequency(5))
df['6Hz'] = pd.Series(get_values_for_frequency(6))
df['7Hz'] = pd.Series(get_values_for_frequency(7))
# df.to_csv('samplepersecond.csv')
ndary = df.to_records(index=False)
This is the code to generate the sine waves. Here I generate 7 columns (1 Hz to 7 Hz) with 100 rows each and store them in a pandas DataFrame. The requirement is to convert this DataFrame into a binary file with a datatype of int16, i.e. each value in the DataFrame should be converted to a 16-bit signed integer and then written to a binary file.
You can convert your data frame values to int16 by using the astype function.
import numpy as np
df = df.astype(np.int16)
Then you can save your data frame in HDF5 format by using to_hdf.
df.to_hdf('tmp.hdf','df', mode='w')
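If a plain headerless binary file of int16 is what you need rather than HDF5, one option is to go through NumPy and tofile (a sketch; note the sine values lie in [-1, 1], so they should be scaled before the cast or they will all round to 0 or ±1):

import numpy as np
scaled = (df.to_numpy() * 32767).astype(np.int16)  # scale [-1, 1] onto the int16 range
scaled.tofile('samples.bin')                       # raw row-major int16 bytes, no header
# read it back with: np.fromfile('samples.bin', dtype=np.int16).reshape(-1, 7)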

Speeding Up NumPy Array Reshape

I am looking for a way to speed up the reshaping of a 1-column text file to a 3D numpy array.
Python Code:
import numpy as np
nx = 1799
ny = 1059
num = 6
in_file = "data.txt"
arr = np.loadtxt(in_file)
reshaped_data = np.reshape(arr, (nx, ny, num))
print(reshaped_data)
data.txt: https://drive.google.com/file/d/198RLSR5JpDn3nESWQiOPiYBWxzazbUCs/view?usp=sharing
data.txt is a 1-column text file (11,430,846 lines): six concatenated text files, each originally of shape 1799x1059. My goal is to get those 11,430,846 values into the shape (1799, 1059, 6).
Currently, this reshaping process is taking about 25 seconds to complete. I would like to get it down to <10 seconds.
Alright, I did some testing. In the code you provided, almost all of the execution time is spent in arr = np.loadtxt(in_file); the reshape completes almost instantaneously. Test code:
import numpy as np
from time import perf_counter
nx = 1799
ny = 1059
num = 6
t = perf_counter()
arr = np.loadtxt('data.txt')
print(f'Loading time: {round(perf_counter() - t, 3)}')
t = perf_counter()
reshaped_data = np.reshape(arr, (nx, ny, num))
print(f'Reshaping time: {round(perf_counter() - t,3)}')
Outputs:
Loading time: 32.54
Reshaping time: 0.006
So, the problem is loading the data. You can improve that part by using pandas:
import numpy as np
from time import perf_counter
import pandas as pd
nx = 1799
ny = 1059
num = 6
t = perf_counter()
arr = pd.read_csv('data.txt', header=None).to_numpy().T
print(f'Loading time: {round(perf_counter() - t, 3)}')
t = perf_counter()
reshaped_data = np.reshape(arr, (nx, ny, num))
print(f'Reshaping time: {round(perf_counter() - t, 3)}')
Outputs:
Loading time: 0.753
Reshaping time: 0.007
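Another option worth timing (an untested sketch for this particular file): np.fromfile can parse whitespace-separated text directly, skipping the CSV machinery entirely:

t = perf_counter()
arr = np.fromfile('data.txt', sep='\n')  # text mode: whitespace-separated floats
print(f'Loading time: {round(perf_counter() - t, 3)}')
reshaped_data = arr.reshape((nx, ny, num))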

Xarray resample inter annually

I am trying to resample my data annually, but I am struggling to set the start day of the resampling periods.
import xarray as xr
import numpy as np
import pandas as pd
da = xr.DataArray(
    np.linspace(0, 11, num=36),
    coords=[
        pd.date_range(
            "15/12/1999", periods=36,
        )
    ],
    dims="time",
)
da.resample(time="1Y").mean()
What I am trying to achieve is to get the means of the following periods: 15/12/1999-15/12/2000, 15/12/2000-15/12/2001, 15/12/2001-15/12/2002, ...
I have solved it by shifting the time index to the first of the month, using the corresponding pandas anchored offset, and then shifting the time back afterwards.
import xarray as xr
import numpy as np
import pandas as pd
da = xr.DataArray(
    np.concatenate([np.zeros(365), np.ones(365)]),
    coords=[
        pd.date_range(
            "06/15/2017", "06/14/2019", freq='D'
        )
    ],
    dims="time",
)
days_to_first_of_month = pd.Timedelta(days=int(da.time.dt.day[0])-1)
da['time'] = da.time - days_to_first_of_month
month = da.time.dt.strftime("%b")[0].values
resampled = da.resample(time=f'AS-{month}').sum()
resampled['time'] = resampled.time + days_to_first_of_month
print(resampled)
Is there a more efficient or cleaner way?
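One alternative that avoids the shift-and-shift-back dance (a sketch, not benchmarked against the above) is to build explicit yearly bin edges anchored on the first timestamp and aggregate with groupby_bins:

start = pd.Timestamp(da.time.values[0])
end = pd.Timestamp(da.time.values[-1])
# yearly bin edges starting exactly at the first timestamp
bins = pd.date_range(start, end + pd.DateOffset(years=1), freq=pd.DateOffset(years=1))
binned = da.groupby_bins('time', bins, right=False).sum()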

Biggest rise in a time window in a time series

Wondering if there is a fast way of getting the biggest rise in a time series within a window.
Intended code is...
import datetime
import numpy as np
import pandas as pd
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
data = np.random.randint(low=1, high=10, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
def biggest_rise(df, windowsize=10):
    '''gets the biggest rise within the specified window size
    '''
    # Some magic code here
    return df.rolling_max(window=10, ...)
I don't really get what you mean by 'biggest rise', but using rolling may be helpful. For example, with the code below you can get the difference between the maximum and minimum value within a 10-day window:
df.sort_values(['date']).set_index('date').rolling('10d').max() - df.sort_values(['date']).set_index('date').rolling('10d').min()
I think I found the answer... as per code below. Upped the high to 10K to really see the changes:
import datetime
import numpy as np
import pandas as pd
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
data = np.random.randint(low=1, high=10000, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
window = 10
dfs = [df.iloc[i: i+window] for i in range(0, len(df)) if i+window < len(df)]
biggest_rise = max([d.value.max()-d.value.min() for d in dfs])
Takes about 112 ms for 365 data points; anything better is welcome. One problem: the biggest rise found this way could actually be the biggest fall within the window, and I don't know how to differentiate between the two.
Here is a better answer to get the maximum rise, using @TywinLannister88's suggestion:
import datetime
import numpy as np
import pandas as pd
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(0, 365)]
data = np.random.randint(low=1, high=10000, size=len(date_list))
df = pd.DataFrame({'date': date_list, 'value': data})
# 10-day rolling window
df1 = df.sort_values(['date']).set_index('date').rolling('10d').max() - \
df.sort_values(['date']).set_index('date').rolling('10d').min()
# percent change to see if there is a rise or fall
df2 = df.sort_values(['date']).set_index('date').value.pct_change(periods=10)
# filter out the rises (pctchange > 0) and find the maximum rise
df3 = df.sort_values(['date']).set_index('date').assign(delta=df1, pctchange=df2)
biggest_rise = df3[df3.pctchange>0].pctchange.max()
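To measure only upward moves, a simpler sketch (not part of the original answers) is to subtract the rolling minimum: each point then records the rise from the lowest value seen in the preceding 10 days, and falls never contribute because the result is non-negative.

s = df.sort_values('date').set_index('date')['value']
rise = s - s.rolling('10d').min()  # rise from the 10-day low up to each point
biggest_rise = rise.max()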

How to more efficiently calculate a rolling ratio

My data is over 3000 rows long. Below is the code for computing a 20-day value (the Volume Ratio used in stock markets). It takes more than 2 minutes to run. Is there a good way to reduce the running time?
import pandas as pd
import numpy as np
from pandas.io.data import DataReader  # moved out of pandas; now provided by pandas_datareader
import matplotlib.pylab as plt
data = DataReader('047040.KS','yahoo',start='2010')
data['vr']=0
data['Volume Ratio']=0
data['acend']=0
data['vr'] = np.sign(data['Close']-data['Open'])
data['vr'] = np.where(data['vr']==0,0.5,data['vr'])
data['vr'] = np.where(data['vr']<0,0,data['vr'])
data['acend'] = np.multiply(data['Volume'],data['vr'])
for i in range(len(data['Open'])):
    if i < 19:
        data['Volume Ratio'][i] = 0
    else:
        data['Volume Ratio'][i] = ((sum(data['acend'][i-19:i])) / ((sum(data['Volume'][i-19:i]) - sum(data['acend'][i-19:i])))) * 100
Consider using conditional row selection and rolling.sum():
data.loc[data.index[:20], 'Volume Ratio'] = 0
data.loc[data.index[20:], 'Volume Ratio'] = (data['acend'].rolling(window=20).sum() /
    (data['Volume'].rolling(window=20).sum() - data['acend'].rolling(window=20).sum())) * 100
Or, simplified: .rolling(window=20).sum() produces np.nan for the first 19 values, so just use .fillna(0):
data['new_col'] = data['acend'].rolling(window=20).sum().div(
    data['Volume'].rolling(window=20).sum().subtract(data['acend'].rolling(window=20).sum())
).mul(100).fillna(0)
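Since pandas.io.data no longer ships with pandas, a self-contained way to try the rolling version is on synthetic data (a sketch; the random values and column names just mirror the question, they are not real prices):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 3000
data = pd.DataFrame({'Open': rng.random(n),
                     'Close': rng.random(n),
                     'Volume': rng.integers(1, 1_000_000, n).astype(float)})
vr = np.sign(data['Close'] - data['Open'])
vr = np.where(vr == 0, 0.5, vr)
vr = np.where(vr < 0, 0, vr)
data['acend'] = data['Volume'] * vr
up = data['acend'].rolling(window=20).sum()
total = data['Volume'].rolling(window=20).sum()
data['Volume Ratio'] = (up / (total - up) * 100).fillna(0)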
