Returning two values from pandas.rolling_apply - python

I am using pandas.rolling_apply to fit data to a distribution and get a value from it, but I also need it to report a rolling goodness of fit (specifically, the p-value). Currently I'm doing it like this:
def func(sample):
    fit = genextreme.fit(sample)
    return genextreme.isf(0.9, *fit)

def p_value(sample):
    fit = genextreme.fit(sample)
    return kstest(sample, 'genextreme', fit)[1]

values = pd.rolling_apply(data, 30, func)
p_values = pd.rolling_apply(data, 30, p_value)
results = pd.DataFrame({'values': values, 'p_value': p_values})
The problem is that I have a lot of data, and the fit function is expensive, so I don't want to call it twice for every sample. What I'd rather do is something like this:
def func(sample):
    fit = genextreme.fit(sample)
    value = genextreme.isf(0.9, *fit)
    p_value = kstest(sample, 'genextreme', fit)[1]
    return {'value': value, 'p_value': p_value}

results = pd.rolling_apply(data, 30, func)
Where results is a DataFrame with two columns. If I try to run this, I get an exception:
TypeError: a float is required
Is it possible to achieve this, and if so, how?

I had a similar problem and solved it by calling a member function of a separate helper class from apply. The member function returns a single value, as required, but I store the other calculation results as members of the class and use them afterwards.
Simple Example:
class CountCalls:
    def __init__(self):
        self.counter = 0

    def your_function(self, window):
        retval = f(window)
        self.counter = self.counter + 1
        return retval

TestCounter = CountCalls()

pandas.Series.rolling(your_seriesOrDataframeColumn, window=your_window_size).apply(TestCounter.your_function)

print(TestCounter.counter)
Assume your function f returns a tuple of two values v1, v2. Then you can return v1 and assign it as column_v1 of your dataframe. The second value v2 you simply accumulate in a Series series_val2 within the helper class. Afterwards you just assign that series as a new column of your dataframe.
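A minimal sketch of that accumulation pattern (f and the column names here are illustrative stand-ins, not part of the original answer):
import numpy as np
import pandas as pd

def f(window):
    # stand-in for the expensive two-output computation; returns (v1, v2)
    return np.mean(window), np.std(window)

class TwoValueHelper:
    def __init__(self):
        self.second_values = []  # accumulates v2 for each evaluated window

    def windowed(self, window):
        v1, v2 = f(window)
        self.second_values.append(v2)
        return v1  # rolling.apply must receive a single float

helper = TwoValueHelper()
df = pd.DataFrame({'x': np.random.rand(100)})
df['column_v1'] = df['x'].rolling(window=30).apply(helper.windowed)
# the first window-1 rows trigger no call, so align the collected values at the tail:
df.loc[df.index[-len(helper.second_values):], 'column_v2'] = helper.second_values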

I had a similar problem before. Here's my solution for it:
from collections import deque

class your_multi_output_function_class:
    def __init__(self):
        self.deque_2 = deque()
        self.deque_3 = deque()

    def f1(self, window):
        self.k = somefunction(window)
        self.deque_2.append(self.k[1])
        self.deque_3.append(self.k[2])
        return self.k[0]

    def f2(self, window):
        return self.deque_2.popleft()

    def f3(self, window):
        return self.deque_3.popleft()
func = your_multi_output_function_class()

output = your_pandas_object.rolling(window=10).agg(
    {'a': func.f1, 'b': func.f2, 'c': func.f3}
)
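For instance, with a hypothetical three-output somefunction the pattern looks like this. Note that it relies on agg evaluating 'a' before 'b' and 'c' (the dict's insertion order), since f1 fills the deques that f2 and f3 drain:
import numpy as np
import pandas as pd

def somefunction(window):
    # stand-in for an expensive function computing three values in one pass
    return np.mean(window), np.min(window), np.max(window)

s = pd.Series(np.random.rand(100))
func = your_multi_output_function_class()
output = s.rolling(window=10).agg({'a': func.f1, 'b': func.f2, 'c': func.f3})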

I used and loved @yi-yu's answer so I made it generic:
from collections import deque
from functools import partial

def make_class(func, dim_output):
    class your_multi_output_function_class:
        def __init__(self, func, dim_output):
            assert dim_output >= 2
            self.func = func
            self.deques = {i: deque() for i in range(1, dim_output)}

        def f0(self, *args, **kwargs):
            k = self.func(*args, **kwargs)
            for queue in sorted(self.deques):
                self.deques[queue].append(k[queue])
            return k[0]

        def accessor(self, index, *args, **kwargs):
            return self.deques[index].popleft()

    klass = your_multi_output_function_class(func, dim_output)
    for i in range(1, dim_output):
        f = partial(your_multi_output_function_class.accessor, klass, i)
        setattr(klass, 'f' + str(i), f)
    return klass
and given a function f of a pandas Series (windowed or not) returning n values, you use it this way:
rolling_func = make_class(f, n)

# dict to map the function's outputs to new columns, e.g.:
agger = {'output_' + str(i): getattr(rolling_func, 'f' + str(i)) for i in range(n)}
windowed_series.agg(agger)
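As a concrete sketch (the two-output f below is illustrative):
import numpy as np
import pandas as pd

def f(window):
    return np.mean(window), np.std(window)  # n = 2 outputs

rolling_func = make_class(f, 2)
s = pd.Series(np.random.rand(100))
result = s.rolling(window=10).agg({'output_0': rolling_func.f0,
                                   'output_1': rolling_func.f1})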

I also had the same issue. I solved it by writing into a global data frame from inside the rolling function. In the following example script, I generate some random input data, then calculate the min, the max, and the mean with a single rolling apply call.
import pandas as pd
import numpy as np

global outputDF
global index

def myFunction(array):
    global index
    global outputDF
    # Write directly into the underlying arrays (as with inputDF below),
    # which avoids chained-assignment pitfalls
    outputDF['min'].values[index] = np.nanmin(array)
    outputDF['max'].values[index] = np.nanmax(array)
    outputDF['mean'].values[index] = np.nanmean(array)
    index += 1
    # Returning a dummy value, since rolling.apply requires a float
    return 0

if __name__ == "__main__":
    global outputDF
    global index
    # A random window size
    windowSize = 10
    # Preparing some random input data
    inputDF = pd.DataFrame({'randomValue': [np.nan] * 500})
    for i in range(len(inputDF)):
        inputDF['randomValue'].values[i] = np.random.rand()
    # Pre-allocate memory
    outputDF = pd.DataFrame({'min': [np.nan] * len(inputDF),
                             'max': [np.nan] * len(inputDF),
                             'mean': [np.nan] * len(inputDF)})
    # Set the starting index (due to the centered window)
    d = (windowSize - 1) / 2
    index = int(np.floor(d))
    # Do the rolling apply here
    inputDF['randomValue'].rolling(window=windowSize, center=True).apply(myFunction, args=())
    assert index + int(np.ceil(d)) == len(inputDF), 'Length mismatch'
    outputDF.index = inputDF.index
    # Optional: drop the rows that were never filled
    outputDF.dropna(inplace=True)
    print(outputDF)

Related

Python class call a method from a class object

How can I fix the attribute error in this situation? I have a pandas dataframe on which I do some slicing and transformation, and I want to plot the results of the persistence_model function as below.
Edit:
I want to customize the plot with a specific title, x and y axis labels, and a horizontal line drawn on the same plot from the results of the persistence_model function.
class ResidualErrors():
    def __init__(self, data: pd.Series):
        self.data = data

    def _persistence_forecast_model_nrows(self, test_rows):
        slicer = test_rows + 1
        errors = self.data[-slicer:].diff().dropna()
        return errors

    def _persistence_forecast_model_percrows(self, train_perc):
        n = len(self.data)
        ntrain = int(n * train_perc)
        errors = self.data[ntrain:].diff().dropna()
        return errors

    def persistence_model(self, test_rows=None, train_perc=None):
        if (not test_rows) and (not train_perc):
            raise TypeError("Please provide 'test_rows' or 'train_perc' arguments.")
        if test_rows and train_perc:
            raise TypeError("Please choose one argument, either 'test_rows' or 'train_perc'.")
        if test_rows:
            return self._persistence_forecast_model_nrows(test_rows)
        else:
            return self._persistence_forecast_model_percrows(train_perc)

    @classmethod
    def plot_residuals(obj):
        obj.plot()
        plt.show()
Desired output
res = ResidualErrors(data).persistence_model(test_rows=10)
res.plot_residuals()
>> AttributeError: 'Series' object has no attribute 'plot_residuals'
You need to be more aware of what methods return. The first step creates a ResidualErrors object:
res = ResidualErrors(data)
The second step creates a DataFrame or Series:
obj = res.persistence_model(test_rows=10)
You can call plot_residuals on res, not on obj as you are currently doing:
res.plot_residuals(obj)
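A minimal sketch of one way to get the chained style the question attempts, assuming plot_residuals is meant to be an ordinary method that computes and plots in one step (a hypothetical rewrite, not the asker's code):
import matplotlib.pyplot as plt

class ResidualErrors:
    # __init__, persistence_model and its helpers as in the question ...

    def plot_residuals(self, test_rows=None, train_perc=None):
        # compute the residuals first, then plot them in one call
        errors = self.persistence_model(test_rows=test_rows, train_perc=train_perc)
        errors.plot()
        plt.show()

ResidualErrors(data).plot_residuals(test_rows=10)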

How to return variables to multiple methods at once?

I am able to return multiple variables to a method as:
def groupby(self, df, groupbyvalue, fieldstoaggregate):
    datamin = df.loc[df.groupby([groupbyvalue])[fieldstoaggregate].idxmin()]
    datamax = df.loc[df.groupby([groupbyvalue])[fieldstoaggregate].idxmax()]
    return plotsview.weatherplot(self, datamin, datamax)
Is it possible to pass the same datamin, datamax to another method at the same time, as in:
return plotsview.newplot(self, datamin, datamax)
I am returning them to the below methods:
class plotsview:
    def weatherplot(self, datamax1, datamin1):
        ...

and

class plotsviewnew:
    def newplot(self, datamax, datamin):
        ...
If you want to return 2 values you can return a tuple. Just calculate your values into variables and then return them:
def groupby(self, df, groupbyvalue, fieldstoaggregate):
    datamin = df.loc[df.groupby([groupbyvalue])[fieldstoaggregate].idxmin()]
    datamax = df.loc[df.groupby([groupbyvalue])[fieldstoaggregate].idxmax()]
    weather_res = plotsview.weatherplot(self, datamin, datamax)
    new_res = plotsviewnew.newplot(self, datamin, datamax)
    return weather_res, new_res
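Alternatively, a sketch that returns the raw tuple and lets the caller hand it to both plotting methods (the column names here are illustrative):
def groupby(self, df, groupbyvalue, fieldstoaggregate):
    grouped = df.groupby([groupbyvalue])[fieldstoaggregate]
    datamin = df.loc[grouped.idxmin()]
    datamax = df.loc[grouped.idxmax()]
    return datamin, datamax

# caller side:
datamin, datamax = obj.groupby(df, 'station', 'temperature')
plotsview.weatherplot(obj, datamin, datamax)
plotsviewnew.newplot(obj, datamin, datamax)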

Replacing multiple `if-else` with single `if-else` in a class [duplicate]

This is more a question of writing clean Python 3 code. Let's say I have a class that can create many function types based on user input.
import numpy as np

class functions(object):
    def __init__(self, typeOfFunction, amplitude, omega, start=None, stop=None,
                 pulsewidth=None):
        self.typeOfFunction = typeOfFunction
        self.amplitude = amplitude
        self.omega = omega
        self.period = 2 * np.pi / omega
        self.start = start
        self.stop = stop
        self.pulsewidth = pulsewidth

    def sine_function(self, t):
        func = self.amplitude * np.sin(self.omega * t)
        return func

    def cosine_function(self, t):
        func = self.amplitude * np.cos(self.omega * t)
        return func

    def unit_step_function(self, t):
        func = self.amplitude * np.where(t > self.start, 1, 0)
        return func
Now let us say we want to write three other functions:
Differentiation
Integration
Evaluation at a given time
My problem is that in each of these functions I have to write conditions such as these:
def evaluate_function(self, time):
    if self.typeOfFunction == 'sine':
        funcValue = self.sine_function(time)
    elif self.typeOfFunction == 'cosine':
        funcValue = self.cosine_function(time)
    elif self.typeOfFunction == 'unit_step_function':
        funcValue = self.unit_step_function(time)
I want to do it only once in the __init__ method and at subsequent steps just pass the arguments instead of writing if-else:
def __init__(self, typeOfFunction, amplitude, omega, start=None, stop=None,
             pulsewidth=None):
    self.typeOfFunction = typeOfFunction
    self.amplitude = amplitude
    self.omega = omega
    self.period = 2 * np.pi / omega
    self.start = start
    self.stop = stop
    self.pulsewidth = pulsewidth
    # DO SOMETHING THAT EMBEDS THE TYPE OF FUNCTION
    # IN A CLASS VARIABLE
And then:
def evaluate_function(self, time):
    value = self.doSomething(time)
    return value
How can this be done? If duplicate question exists please inform me in the comments.
You can use the built-in getattr(CLASS_OBJECT, METHOD_OR_VARIABLE_NAME), like this:
method = getattr(self, self.typeOfFunction)
and then call method:
method()
or, for short:
getattr(self, self.typeOfFunction)()
You can also check whether the attribute you are getting exists:
if hasattr(self, self.typeOfFunction):
    getattr(self, self.typeOfFunction)()
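Applied to the question's class, a minimal sketch (this assumes typeOfFunction stores the full method name, e.g. 'sine_function'):
class functions(object):
    # __init__ and the function methods as defined in the question ...

    def evaluate_function(self, time):
        # look up the method named by typeOfFunction and call it
        return getattr(self, self.typeOfFunction)(time)

f = functions('sine_function', amplitude=2.0, omega=1.5)
print(f.evaluate_function(0.5))  # dispatches to f.sine_function(0.5)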
I think you want a mapping with a dict. Something like this:
class functions(object):
    def evaluate_function(self, which, time):
        mapping = {'sine': self.sine_function,
                   'cosine': self.cosine_function,
                   # ...more functions here...
                   }
        return mapping[which](time)

    # rest of class here...
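To resolve the lookup only once, as the question asks, the same mapping can be built in __init__ (a sketch against the question's class; the function methods are as defined there):
class functions(object):
    def __init__(self, typeOfFunction, amplitude, omega, start=None, stop=None,
                 pulsewidth=None):
        self.amplitude = amplitude
        self.omega = omega
        self.start = start
        self.stop = stop
        self.pulsewidth = pulsewidth
        # resolve the implementation exactly once
        mapping = {'sine': self.sine_function,
                   'cosine': self.cosine_function,
                   'unit_step_function': self.unit_step_function}
        self._implementation = mapping[typeOfFunction]

    def evaluate_function(self, time):
        return self._implementation(time)  # no if-else needed

    # sine_function, cosine_function, unit_step_function as in the question ...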

Python possible to short circuit function call

I am building a function to construct objects with set attributes (similar to a namedtuple); however, the output length must be variable.
I would like to build a function that allows the user to append additional attributes through a function call. Importantly, I would like to find a way to 'short-circuit' parameters and am unsure if Python is powerful enough to do this.
To explain, take this trivial example:
def foo():
    print("foo")
    return False

def bar():
    print("bar")
    return True

if foo() and bar():
    pass
foo() returns False, so the and expression short-circuits: the console prints only foo, and bar is never executed.
Is there a way to mimic this behavior with inspection or reflection with respect to function calls? My implementation so far is shown below:
from inspect import stack

cache = {}

def fooFormat(**kwargs):
    caller = stack()[1][3]
    if caller not in cache:
        class fooOut(object):
            def __init__(self, **kwargs):
                self.__dict__.update(kwargs)

            def optional(self, opt, **kwargs):
                if opt:
                    self.__dict__.update(kwargs)
                return self

            def __str__(self):
                return caller + str(self.__dict__)

        cache[caller] = fooOut
    return cache[caller](**kwargs)
def stdev(nums, avg=None):
    print("\tStdev call")
    if avg is None:
        avg = sum(nums) / len(nums)
    residuals = sum((i - avg)**2 for i in nums)
    return residuals**.5

def stats(nums, verbose=False):
    if verbose:
        print("Stats call with verbose")
    else:
        print("Stats call without verbose")
    total = sum(nums)
    N = len(nums)
    avg = total / N
    return fooFormat(
        avg=avg,
        lowerB=min(nums),
        upperB=max(nums)).optional(verbose, stdev=stdev(nums, avg))
In the function 'stats', the fooFormat return value should of course carry avg, lowerB, and upperB; additionally, it should carry stdev if verbose is set to True. Moreover, the function 'stdev' should NOT be called if verbose is set to False.
stats([1,2,3,4], False)
stats([1,2,3,4], True)
Of course, a way around this is:
if verbose:
    return fooFormat(
        avg=avg,
        lowerB=min(nums),
        upperB=max(nums),
        stdev=stdev(nums, avg))
else:
    return fooFormat(
        avg=avg,
        lowerB=min(nums),
        upperB=max(nums))
However, I am hoping to implement this behavior without a branch.
This doesn't quite answer the short-circuiting point, but here is a more efficient way of writing it:
out_dic = {  # these items will always be calculated
    'avg': avg,
    'lowerB': min(nums),
    'upperB': max(nums)
}
if verbose:  # this is calculated only if verbose
    out_dic['stdev'] = stdev(nums, avg)
return fooFormat(**out_dic)
In other words, you can expand a dictionary into the kwargs and add to the dictionary dynamically.
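For actual short-circuiting at the call site, one sketch changes optional to accept zero-argument callables and invoke them only when opt is true (note this alters optional's contract; it is not the asker's original API):
def optional(self, opt, **lazy_kwargs):
    # each value is a callable; it is evaluated only when opt is true
    if opt:
        self.__dict__.update({k: v() for k, v in lazy_kwargs.items()})
    return self

# caller side: stdev(...) is wrapped in a lambda, so it never runs when verbose is False
fooFormat(
    avg=avg,
    lowerB=min(nums),
    upperB=max(nums)).optional(verbose, stdev=lambda: stdev(nums, avg))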

Python: Unpack a list of objects to Dictionary

I have a list of objects that need to be unpacked into a dictionary efficiently. There are more than 2,000,000 objects in the list, and the operation takes more than 1.5 hours to complete. I would like to know if this can be done more efficiently.
The objects in the list are based on this class.
class ResObj:
    def __init__(self, index, result):
        self.loc = index    # the location where the values should go in the final result dictionary
        self.res = result   # a dictionary that has the values for this location

# example instance data:
# self.loc = 2
# self.res = {'value1': 5.4, 'value2': 2.3,
#             'valuen': {'sub_value1': 4.5, 'sub_value2': 3.4, 'sub_value3': 7.6}}
Currently I use this method to perform this operation.
def make_final_result(list_of_results):
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    sub_value_variables = ['sub_value1', 'sub_value2', 'sub_value3']
    final_result = {}
    num_of_results = len(list_of_results)
    for var in no_sub_result_variables:
        final_result[var] = numpy.zeros(num_of_results)
    for var in sub_result_variables:
        final_result[var] = {sub_var: numpy.zeros(num_of_results) for sub_var in sub_value_variables}
    for obj in list_of_results:
        i = obj.loc
        result = obj.res
        for var in no_sub_result_variables:
            final_result[var][i] = result[var]
            for var in sub_result_variables:
                for name in sub_value_variables:
                    try:
                        final_result[var][name][i] = result[var][name]
                    except KeyError:
                        ## TODO: add a debug check
                        pass
I have tried using multiprocessing.Manager().dict and Manager().Array() to parallelize this; however, I could only get 2 processes to work (even though I manually set the number of processes to the number of CPUs, 24).
Can you please help me find a faster method to improve the performance?
Thank you.
Nested numpy arrays don't seem to be the best way to structure your data. You can use numpy's structured arrays to create a more intuitive data structure.
import numpy as np

# example values
values = [
    {
        "v1": 0,
        "v2": 1,
        "vs": {
            "x": 2,
            "y": 3,
            "z": 4,
        }
    },
    {
        "v1": 5,
        "v2": 6,
        "vs": {
            "x": 7,
            "y": 8,
            "z": 9,
        }
    }
]

def value_to_record(value):
    """Take a dictionary and convert it to an array-like format"""
    return (
        value["v1"],
        value["v2"],
        (
            value["vs"]["x"],
            value["vs"]["y"],
            value["vs"]["z"]
        )
    )

# define what a record looks like -- f8 is an 8-byte float
dtype = [
    ("v1", "f8"),
    ("v2", "f8"),
    ("vs", [
        ("x", "f8"),
        ("y", "f8"),
        ("z", "f8")
    ])
]

# create the actual array
arr = np.fromiter(map(value_to_record, values), dtype=dtype, count=len(values))

# access an individual record
print(arr[0])  # prints (0.0, 1.0, (2.0, 3.0, 4.0))
# access a specific value
assert arr[0]['vs']['x'] == 2
# access all values of a specific field
print(arr['v2'])  # prints [ 1.  6.]
assert arr['v2'].sum() == 7
Generating the data this way created a 2,000,000-element array in about 2 seconds on my machine.
To make it work for your ResObj objects, sort them by the loc attribute and pass the res attribute of each one to the value_to_record function.
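A sketch of that adaptation with the question's key names (the dtype fields are assumptions based on the example data):
import numpy as np

dtype = [
    ("value1", "f8"),
    ("value2", "f8"),
    ("valuen", [("sub_value1", "f8"), ("sub_value2", "f8"), ("sub_value3", "f8")]),
]

def res_to_record(res):
    vn = res["valuen"]
    return (res["value1"], res["value2"],
            (vn["sub_value1"], vn["sub_value2"], vn["sub_value3"]))

records = sorted(list_of_results, key=lambda r: r.loc)
arr = np.fromiter((res_to_record(r.res) for r in records),
                  dtype=dtype, count=len(records))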
You can distribute the work among processes by key names.
Here I create a pool of workers and pass each of them a var name and an optional subvar name.
The huge dataset is shared with the workers via a cheap fork.
Unpacker.unpack picks the specified vars from the ResObj list and returns them as an np.array.
The main loop in make_final_result combines the arrays into final_result.
Py2:
from collections import defaultdict
from multiprocessing import Process, Pool
import numpy as np

class ResObj(object):
    def __init__(self, index=None, result=None):
        self.loc = index    # the location where the values should go in the final result dictionary
        self.res = result   # a dictionary that has values for this location
        self.loc = 2
        self.res = {'value1': 5.4, 'value2': 2.3,
                    'valuen': {'sub_value1': 4.5, 'sub_value2': 3.4, 'sub_value3': 7.6}}

class Unpacker(object):
    @classmethod
    def cls_init(cls, list_of_results):
        cls.list_of_results = list_of_results

    @classmethod
    def unpack(cls, var, name):
        list_of_results = cls.list_of_results
        result = np.zeros(len(list_of_results))
        if name is None:
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var]
        else:
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var][name]
        return var, name, result

# Pool.map doesn't accept instance methods, hence the wrapper
def Unpacker_unpack((var, name),):
    return Unpacker.unpack(var, name)

def make_final_result(list_of_results):
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    sub_value_variables = ['sub_value1', 'sub_value2', 'sub_value3']
    pool = Pool(initializer=Unpacker.cls_init, initargs=(list_of_results, ))
    final_result = defaultdict(dict)

    def key_generator():
        for var in no_sub_result_variables:
            yield var, None
        for var in sub_result_variables:
            for name in sub_value_variables:
                yield var, name

    for var, name, result in pool.imap(Unpacker_unpack, key_generator()):
        if name is None:
            final_result[var] = result
        else:
            final_result[var][name] = result
    return final_result

if __name__ == '__main__':
    print make_final_result([ResObj() for x in xrange(10)])
Make sure that you are not on Windows: it lacks fork, so multiprocessing has to pipe the entire dataset to each of the 24 worker processes.
Hope this helps.
Remove some indentation to make your loops non-nested:
for obj in list_of_results:
    i = obj.loc
    result = obj.res
    for var in no_sub_result_variables:
        final_result[var][i] = result[var]
    for var in sub_result_variables:
        for name in sub_value_variables:
            try:
                final_result[var][name][i] = result[var][name]
            except KeyError:
                ## TODO: add a debug check
                pass
