Python: Unpack a list of objects to a dictionary

I have a list of objects that need to be unpacked to a dictionary efficiently. There are more than 2,000,000 objects in the list, and the operation takes more than 1.5 hours to complete. I would like to know if this can be done more efficiently.
The objects in the list are based on this class.
class ResObj:
    def __init__(self, index, result):
        self.loc = index    ### This is the location, where the values should go in the final result dictionary
        self.res = result   ### This is a dictionary that has values for this location.

# Example of what a single object holds:
#   obj.loc = 2
#   obj.res = {'value1': 5.4, 'value2': 2.3,
#              'valuen': {'sub_value1': 4.5, 'sub_value2': 3.4, 'sub_value3': 7.6}}
Currently I use this method to perform this operation.
import numpy

def make_final_result(list_of_results):
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    sub_value_variables = ['sub_value1', 'sub_value2', 'sub_value3']

    final_result = {}
    num_of_results = len(list_of_results)
    for var in no_sub_result_variables:
        final_result[var] = numpy.zeros(num_of_results)
    for var in sub_result_variables:
        final_result[var] = {sub_var: numpy.zeros(num_of_results) for sub_var in sub_value_variables}
        for obj in list_of_results:
            i = obj.loc
            result = obj.res
            for var in no_sub_result_variables:
                final_result[var][i] = result[var]
            for var in sub_result_variables:
                for name in sub_value_variables:
                    try:
                        final_result[var][name][i] = result[var][name]
                    except KeyError as e:
                        ##TODO Add a debug check
                        pass
    return final_result
I have tried using multiprocessing.Manager().dict and Manager().Array() to parallelise this, however I could only get 2 processes to work (even though I manually set the number of processes to the number of CPUs, 24).
Can you please suggest a faster way to do this?
Thank you.

Having nested numpy arrays doesn't seem the best way to structure your data. You can use numpy's structured arrays to create a more intuitive data structure.
import numpy as np

# example values
values = [
    {
        "v1": 0,
        "v2": 1,
        "vs": {
            "x": 2,
            "y": 3,
            "z": 4,
        }
    },
    {
        "v1": 5,
        "v2": 6,
        "vs": {
            "x": 7,
            "y": 8,
            "z": 9,
        }
    }
]

def value_to_record(value):
    """Take a dictionary and convert it to an array-like format"""
    return (
        value["v1"],
        value["v2"],
        (
            value["vs"]["x"],
            value["vs"]["y"],
            value["vs"]["z"]
        )
    )

# define what a record looks like -- f8 is an 8-byte float
dtype = [
    ("v1", "f8"),
    ("v2", "f8"),
    ("vs", [
        ("x", "f8"),
        ("y", "f8"),
        ("z", "f8")
    ])
]

# create the actual array
arr = np.fromiter(map(value_to_record, values), dtype=dtype, count=len(values))

# access an individual record
print(arr[0])            # prints (0.0, 1.0, (2.0, 3.0, 4.0))

# access a specific value
assert arr[0]['vs']['x'] == 2

# access all values of a specific field
print(arr['v2'])         # prints [ 1.  6.]
assert arr['v2'].sum() == 7
Generating the data this way created a 2,000,000-element array in about 2 seconds on my machine.
To make this work for your ResObj objects, sort them by the loc attribute and then pass the res attribute of each object to the value_to_record function.
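A minimal sketch of that adaptation, assuming the value1/value2/valuen keys from the question (res_to_record and the dtype below are illustrative, not a tested drop-in):

import numpy as np

dtype = [
    ("value1", "f8"),
    ("value2", "f8"),
    ("valuen", [
        ("sub_value1", "f8"),
        ("sub_value2", "f8"),
        ("sub_value3", "f8"),
    ]),
]

def res_to_record(res):
    """Flatten one ResObj.res dictionary into a tuple matching the dtype."""
    sub = res["valuen"]
    return (
        res["value1"],
        res["value2"],
        (sub["sub_value1"], sub["sub_value2"], sub["sub_value3"]),
    )

# sort by loc so record i in the array corresponds to location i
ordered = sorted(list_of_results, key=lambda obj: obj.loc)
arr = np.fromiter((res_to_record(obj.res) for obj in ordered),
                  dtype=dtype, count=len(ordered))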

You can distribute the work among worker processes by key names.
Here I create a pool of workers and pass each of them a var name and an optional subvar name.
The huge dataset is shared with the workers using a cheap fork.
Unpacker.unpack picks the specified vars from the ResObj objects and returns them as an np.array.
The main loop in make_final_result combines the arrays into final_result.
Py2:
from collections import defaultdict
from multiprocessing import Process, Pool
import numpy as np

class ResObj(object):
    def __init__(self, index=None, result=None):
        self.loc = index    ### This is the location, where the values should go in the final result dictionary
        self.res = result   ### This is a dictionary that has values for this location.
        self.loc = 2
        self.res = {'value1': 5.4, 'value2': 2.3,
                    'valuen': {'sub_value1': 4.5, 'sub_value2': 3.4, 'sub_value3': 7.6}}

class Unpacker(object):
    @classmethod
    def cls_init(cls, list_of_results):
        cls.list_of_results = list_of_results

    @classmethod
    def unpack(cls, var, name):
        list_of_results = cls.list_of_results
        result = np.zeros(len(list_of_results))
        if name is None:
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var]
        else:
            for i, it in enumerate(list_of_results):
                result[i] = it.res[var][name]
        return var, name, result

# Pool.map doesn't accept instance methods, hence the use of a wrapper
def Unpacker_unpack((var, name),):
    return Unpacker.unpack(var, name)

def make_final_result(list_of_results):
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    sub_value_variables = ['sub_value1', 'sub_value2', 'sub_value3']

    pool = Pool(initializer=Unpacker.cls_init, initargs=(list_of_results, ))
    final_result = defaultdict(dict)

    def key_generator():
        for var in no_sub_result_variables:
            yield var, None
        for var in sub_result_variables:
            for name in sub_value_variables:
                yield var, name

    for var, name, result in pool.imap(Unpacker_unpack, key_generator()):
        if name is None:
            final_result[var] = result
        else:
            final_result[var][name] = result
    return final_result

if __name__ == '__main__':
    print make_final_result([ResObj() for x in xrange(10)])
Make sure that you are not on Windows: it lacks fork, so multiprocessing would have to pipe the entire dataset to each of the 24 worker processes.
Hope this will help.
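For reference, a rough Python 3 sketch of the same per-key fan-out (tuple-parameter unpacking in function signatures was removed in Python 3, so the wrapper takes a single key argument; this assumes the ResObj and Unpacker classes above and is a sketch rather than a tested drop-in):

from multiprocessing import Pool

def unpacker_unpack(key):
    # plain one-argument wrapper so Pool.imap can pickle and call it
    var, name = key
    return Unpacker.unpack(var, name)

def make_final_result_py3(list_of_results):
    no_sub_result_variables = ['value1', 'value2']
    sub_result_variables = ['valuen']
    sub_value_variables = ['sub_value1', 'sub_value2', 'sub_value3']

    keys = [(var, None) for var in no_sub_result_variables]
    keys += [(var, name) for var in sub_result_variables
                         for name in sub_value_variables]

    final_result = {}
    with Pool(initializer=Unpacker.cls_init, initargs=(list_of_results,)) as pool:
        for var, name, result in pool.imap(unpacker_unpack, keys):
            if name is None:
                final_result[var] = result
            else:
                final_result.setdefault(var, {})[name] = result
    return final_result

if __name__ == '__main__':
    print(make_final_result_py3([ResObj() for _ in range(10)]))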

Remove some indentation to make your loops non-nested:
for obj in list_of_results:
    i = obj.loc
    result = obj.res
    for var in no_sub_result_variables:
        final_result[var][i] = result[var]
    for var in sub_result_variables:
        for name in sub_value_variables:
            try:
                final_result[var][name][i] = result[var][name]
            except KeyError as e:
                ##TODO Add a debug check
                pass

Related

Avoid multiple if's inside for loop

I have a function that creates data:
from faker import Faker
import pandas as pd
import random

fake = Faker()

def create_rows_faker(num=1, name_col=True, address_col=True, email_col=False):
    output = []
    for x in range(num):
        out = {}
        if name_col:
            out["name"] = fake.name()
        if address_col:
            out["address"] = fake.address()
        if email_col:
            out["email"] = fake.email()
        output.append(out)
    return output
but I want to remove the multiple if statements inside the for loop. What is the best method to improve this?
You can use kwargs and a dictionary:
def create_rows_faker(num=1, **kwargs):
    output = []
    # note: es is built once here, so every generated row reuses the same fake values
    es = {"name": fake.name(), "address": fake.address(), "email": fake.email()}
    for x in range(num):
        output.append({key: es[key] for key in kwargs.keys() if kwargs.get(key)})
    return output

create_rows_faker(num=1, name=True, address=True, email=True)
Instead of taking the columns as separate arguments, use a list of column names. You can then loop over this list, and fill in out with the corresponding fakes, using getattr() to call the methods dynamically.
def create_rows_faker(columns, num=1):
    output = []
    for _ in range(num):
        out = {col: getattr(fake, col)() for col in columns}
        output.append(out)
    return output
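A possible call, assuming the column names match Faker method names:

rows = create_rows_faker(["name", "address"], num=5)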
I'm not sure if this really is going to be any faster, because copying dictionaries takes at least as much time as doing the if statements, but you can create the dictionary once and then copy it into your output as needed.
def create_mock_rows(num: int = 1,
                     name_col: bool = True,
                     address_col: bool = True,
                     email_col: bool = True) -> list:
    out = {
        "name": fake.name() if name_col else None,
        "address": fake.address() if address_col else None,
        "email": fake.email() if email_col else None,
    }
    return [{k: v for k, v in out.items() if v is not None} for _ in range(num)]
Another option is to leverage **kwargs:
def create_mock_rows(num: int = 1, **kwargs) -> list:
    return [{k: getattr(fake, v)() for k, v in kwargs.items()} for _ in range(num)]
I admit I don't love this, though, because kwargs could be anything, and there is some chance of this just failing or giving you a weird result if improperly called.
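For illustration, with this version each keyword value names the Faker method to call, so a call might look like:

rows = create_mock_rows(3, name="name", address="address", email="email")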
Almost every language provides a switch statement:
switch (col.key) {
    case "name":
        col.val = fake.name();
        break;
    case "address":
        col.val = fake.address();
        break;
    ...
    default:
        throw new InvalidDataException();
}
As of version 3.10, Python added something similar: PEP 634 - Structural Pattern Matching.
match subject:
    case <pattern_1>:
        <action_1>
    case <pattern_2>:
        <action_2>
    case <pattern_3>:
        <action_3>
    case _:
        <action_wildcard>
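Applied to the column dispatch above, a minimal sketch (Python 3.10+, assuming fake is a Faker instance; fake_value is an illustrative helper name) could be:

def fake_value(col):
    # dispatch on the column name with structural pattern matching
    match col:
        case "name":
            return fake.name()
        case "address":
            return fake.address()
        case "email":
            return fake.email()
        case _:
            raise ValueError(f"unknown column: {col}")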

How to convert an object back into the code used to create it?

For example, if I have a custom Python object like this:
#!/usr/bin/env python3
import os

base_dir = os.path.abspath(".")

class MyFile(dict):
    def __init__(self, name, size=None, dir=base_dir):
        self.name = name
        self.path = os.path.join(dir, name)
        self.bytes = size
and somewhere in my program I initialize an instance of it:
a = MyFile(name = "foo", size = 10)
I want to be able to return the code used to create the object in the first place. For example:
print(a)
# <__main__.MyFile object at 0x102b84470>
# should instead print:
# MyFile(name = "foo", size = 10)
But since my object has some default attribute values, I only want those to show up in the output if they were explicitly included when the object was initialized:
b = MyFile(name = "bar", dir = "/home")
print(b)
# <__main__.MyFile object at 0x102b845c0>
# should instead print:
# MyFile(name = "bar", dir = "/home")
And to be clear, I am not trying to pull this from the source code, because a lot of my objects will be created dynamically, and I want to be able to return the same thing for them as well:
l = [ ("baz", 4), ("buzz", 12) ]
f = [ MyFile(name = n, size = s) for n, s in l ]
print(f)
# [<__main__.MyFile object at 0x1023844a8>, <__main__.MyFile object at 0x102384828>]
# should instead print:
# [ MyFile(name = "baz", size = 4), MyFile(name = "buzz", size = 12) ]
I saw the inspect library (https://docs.python.org/3/library/inspect.html) but it does not seem to have anything that does this. What am I missing? This functionality would be pretty analogous to R's dput function.
At a very basic level you can do this:
class MyClass:
    def __init__(self, a, b):
        self.a = a
        self.b = b

    def __repr__(self):
        return f'{self.__class__.__name__}({self.a}, {self.b})'

class MyOtherClass(MyClass):
    def method(self):
        pass

c = MyClass(1, 2)
oc = MyOtherClass(3, 4)
print(c, oc)
Result:
MyClass(1, 2) MyOtherClass(3, 4)
This does what you ask, as well as taking subclassing into account to provide the correct class name. But of course things can get complicated for several reasons:
class MyClass:
    def __init__(self, a, b):
        self.a = a + 1
        self.b = b if b < 10 else a
        self.c = 0

    def inc_c(self):
        self.c += 1

    def __repr__(self):
        return f'{self.__class__.__name__}({self.a - 1}, {self.b})'
The value of c isn't covered by the constructor, so the proposed call would set it to 0. And although you could compensate for the + 1 applied to a, the value of b is harder to reconstruct, even more so when you realise someone could have changed it later.
And then you need to consider that subclasses can override behaviour, etc. So, doing something like this only makes sense in very limited use cases.
As simple as replacing your code snippet with the following:
import os

base_dir = os.path.abspath(".")

class MyFile(object):
    def __init__(self, name, size=None, dir=base_dir):
        self.name = name
        self.path = os.path.join(dir, name)
        self.bytes = size
        self.remember(name, size, dir)

    def remember(self, name, size, dir):
        self.s = '{}(name = \'{}\'{}{})'.format(
            self.__class__.__name__,
            name,
            ", size=" + str(size) if size is not None else "",
            ', dir="' + dir + '"' if dir != base_dir else "")

    def __repr__(self):
        return self.s
a) for a it returns:
MyFile(name = 'foo', size=10)
b) for b it returns:
MyFile(name = 'bar', dir="/home")
c) for f it returns:
[MyFile(name = 'baz', size=4), MyFile(name = 'buzz', size=12)]
Thanks to everyone who commented and answered. Ultimately, I incorporated their ideas and feedback into the following method, which allowed me to preserve the object's native __repr__ while still getting the behaviors I wanted.
#!/usr/bin/env python3
import os

base_dir = os.path.abspath(".")

class MyFile(dict):
    """
    A custom dict class that auto-populates some keys based on simple input args;
    compatible with unittest.TestCase.assertDictEqual
    """
    def __init__(self, name, size=None, dir=base_dir):
        """
        standard init method
        """
        self.name = name
        self.path = os.path.join(dir, name)
        self.bytes = size
        # auto-populate this key
        self['somekey'] = self.path + ' ' + str(self.bytes)
        # more logic for more complex keys goes here...
        # use these later with `init` and `repr`
        self.args = None
        self.kwargs = None

    @classmethod
    def init(cls, *args, **kwargs):
        """
        alternative method to initialize the object while retaining the args passed
        """
        obj = cls(*args, **kwargs)
        obj.args = args
        obj.kwargs = kwargs
        return obj

    def repr(self):
        """
        returns a text representation of the object that can be used to
        create a new copy of an identical object, displaying only the
        args that were originally used to create the current object instance
        (do not show args that were not passed e.g. default value args)
        """
        n = 'MyFile('
        if self.args:
            for i, arg in enumerate(self.args):
                n += arg.__repr__()
                if i < len(self.args) - 1 or self.kwargs:
                    n += ', '
        if self.kwargs:
            for i, (k, v) in enumerate(self.kwargs.items()):
                n += str(k) + '=' + v.__repr__()
                if i < len(self.kwargs.items()) - 1:
                    n += ', '
        n += ')'
        return n
Usage:
# normal object initialization
obj1 = MyFile('foo', size=10)
print(obj1) # {'somekey': '/Users/me/test/foo 10'}
# initialize with classmethod instead to preserve args
obj2 = MyFile.init("foo", size = 10)
print(obj2) # {'somekey': '/Users/me/test/foo 10'}
# view the text representation
repr = obj2.repr()
print(repr) # MyFile('foo', size=10)
# re-load a copy of the object from the text representation
obj3 = eval(repr)
print(obj3) # {'somekey': '/Users/me/test/foo 10'}
The use case for this is where I need to represent large, simple data structures (dicts) in my Python code (integration tests), where the data values are dynamically generated from a smaller set of variables. When I have many hundreds of such data structures to include in a test case, it becomes infeasible to write out the MyFile(...) calls by hand. This method allows me to use a script to ingest the data and then print out the compact Python code needed to recreate it with my custom object class, which I can then copy/paste into my test cases.

How do I combine N dictionaries in a list of dictionaries based on a matching key:value pair?

I want to achieve the following. It's essentially the composition or merging of N dictionaries, accumulating all data for duplicate ids and appending all values (except id and updated_at) from all dictionaries across multiple data sources into the final result.
class A:
    def __init__(self):
        pass

    def run(self):
        return {"data":[{"id":"ID-2002-0201","updated_at":"2018-05-14T22:25:51Z","html_url":["https://github.com/ID-2002-0201"],"source":"github"},{"id":"ID-2002-0200","updated_at":"2018-05-14T21:49:15Z","html_url":["https://github.com/ID-2002-0200"],"source":"github"},{"id":"ID-2002-0348","updated_at":"2018-05-11T14:13:28Z","html_url":["https://github.com/ID-2002-0348"],"source":"github"}]}

class B:
    def __init__(self):
        pass

    def run(self):
        return {"data":[{"id":"ID-2002-0201","updated_at":"2006-03-28","html_url":["http://sample.com/files/1622"],"source":"sample"},{"id":"ID-2002-0200","updated_at":"2006-06-05","html_url":["http://sample.com/files/1880"],"source":"sample"},{"id":"ID-2002-0348","updated_at":"2007-03-09","html_url":["http://sample.com/files/3441"],"source":"sample"}]}

results = {}
data_sources = [A(), B()]
for data in data_sources:
    data_stream = data.run()
    for data in data_stream.get('data'):
        for key, value in data.items():
            if key in ['html_url']:
                results.setdefault(key, []).extend(value)
            elif key in ['source']:
                results.setdefault(key, []).append(value)
            else:
                results[key] = value

print(results)
Desired output:
[
    {
        "id": "ID-2002-0201",
        "updated_at": "2018-05-14T22:25:51Z",
        "html_url": [
            "https://github.com/ID-2002-0201",
            "https://github.com/ID-2002-0202",
            "https://github.com/ID-2002-0203",
            "https://github.com/ID-2002-0204"
        ],
        "source": [
            "github",
            "xxx",
            "22aas"
        ]
    },
]
I am a little confused because the desired output you have given does not match the sample classes you provided in the code. However, I think I get what you want; correct me if I have interpreted your question incorrectly.
I am using your results object as a dictionary of dictionaries. The outer dictionary contains all the unique ids as keys, and the inner dictionaries contain the data you wanted in your output. After the loop completes, I just return list(results.values()) to get the list of N combined dictionaries.
Here is the code:
class A:
    def __init__(self):
        pass

    def run(self):
        return {"data":[{"id":"ID-2002-0201","updated_at":"2018-05-14T22:25:51Z","html_url":["https://github.com/ID-2002-0201"],"source":"github"},{"id":"ID-2002-0200","updated_at":"2018-05-14T21:49:15Z","html_url":["https://github.com/ID-2002-0200"],"source":"github"},{"id":"ID-2002-0348","updated_at":"2018-05-11T14:13:28Z","html_url":["https://github.com/ID-2002-0348"],"source":"github"}]}

class B:
    def __init__(self):
        pass

    def run(self):
        return {"data":[{"id":"ID-2002-0201","updated_at":"2006-03-28","html_url":["http://sample.com/files/1622"],"source":"sample"},{"id":"ID-2002-0200","updated_at":"2006-06-05","html_url":["http://sample.com/files/1880"],"source":"sample"},{"id":"ID-2002-0348","updated_at":"2007-03-09","html_url":["http://sample.com/files/3441"],"source":"sample"}]}

results = {}
data_sources = [A(), B()]
for data in data_sources:
    data_stream = data.run()
    for data in data_stream.get('data'):
        curr_id = data["id"]
        result = results.setdefault(curr_id, {})
        for key, value in data.items():
            if key in ['html_url']:
                result.setdefault(key, []).extend(value)
            elif key in ['source']:
                result.setdefault(key, []).append(value)
            else:
                result[key] = value

print(list(results.values()))
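For the A and B sources above, the merged entry for ID-2002-0201 should come out roughly as follows (updated_at keeps whichever source was processed last):

{'id': 'ID-2002-0201',
 'updated_at': '2006-03-28',
 'html_url': ['https://github.com/ID-2002-0201', 'http://sample.com/files/1622'],
 'source': ['github', 'sample']}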

Dynamically adding nested dictionaries

I want to dynamically add values in a nested dictionary. I am trying to cache the similarity score of two words together with their part-of-speech tags.
In short, I want to store values like this:
synset_cache[word1][word1_tag][word2][word2_tag] = score
class MyClass(Object):
    def __init__(self):
        MyClass.synset_cache = {}  # dict

    def set_cache(self, word1, word1_tag, word2, word2_tag, score)
        try:
            MyClass.synset_cache[word1]
        except:
            MyClass.synset_cache[word1] = {}  # create new dict
        try:
            MyClass.synset_cache[word1][word1_tag]
        except:
            MyClass.synset_cache[word1][word1_tag] = {}  # create new dict
        try:
            MyClass.synset_cache[word1][word1_tag][word2]
        except:
            MyClass.synset_cache[word1][word1_tag][word2] = {}  # create new dict

        # store the value
        MyClass.synset_cache[word1][word1_tag][word2][word2_tag] = score
But I am getting this error.
TypeError: list indices must be integers, not unicode
The line number it points to is the MyClass.synset_cache[word1][word1_tag]={} #create new dict line.
How can I get this working?
EDIT:
According to @Robᵩ's comments on his answer: I was assigning a list to MyClass.synset_cache in another method (note that it is at the class level), so this part of the code had no errors.
Use dict.setdefault.
This might work:
# UNTESTED
d = MyClass.synset_cache.setdefault(word1, {})
d = d.setdefault(word1_tag, {})
d = d.setdefault(word2, {})
d[word2_tag] = score
Alternatively, you can use this handy recursive defaultdict that springs up new levels of dict automatically.
import collections

def tree():
    return collections.defaultdict(tree)

class MyClass(Object):
    def __init__(self):
        MyClass.synset_cache = tree()

    def set_cache(self, word1, word1_tag, word2, word2_tag, score):
        MyClass.synset_cache[word1][word1_tag][word2][word2_tag] = score
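A quick illustration of the autovivifying behaviour (the words and score here are made up):

cache = tree()
cache['cat']['NN']['dog']['NN'] = 0.8   # intermediate levels are created automatically
print(cache['cat']['NN']['dog']['NN'])  # 0.8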
This will be data dependent; at least for some test data (see below), the code does not produce that error. How are you calling it?
Also, note that as written above, it won't compile due to some syntax errors (i.e. no colon to end the def set_cache line).
Below is some tweaked-to-compile code with some example calling data and how that pretty-prints:
#!/usr/bin/env python
import pprint

class MyClass():
    def __init__(self):
        MyClass.synset_cache = {}  # dict

    def set_cache(self, word1, word1_tag, word2, word2_tag, score):
        try:
            MyClass.synset_cache[word1]
        except:
            MyClass.synset_cache[word1] = {}  # create new dict
        try:
            MyClass.synset_cache[word1][word1_tag]
        except:
            MyClass.synset_cache[word1][word1_tag] = {}  # create new dict
        try:
            MyClass.synset_cache[word1][word1_tag][word2]
        except:
            MyClass.synset_cache[word1][word1_tag][word2] = {}  # create new dict

        # store the value
        MyClass.synset_cache[word1][word1_tag][word2][word2_tag] = score

x = MyClass()
x.set_cache('foo', 'foo-tag', 'bar', 'bar-tag', 100)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(x.synset_cache)
Which outputs:
{ 'foo': { 'foo-tag': { 'bar': { 'bar-tag': 100}}}}
A couple other things of note...
I'd recommend using the in syntax to check for key presence rather than try/except; it's more compact and more Pythonic.
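For example, a sketch of set_cache using in checks, with the same behaviour as the try/except version above:

def set_cache(self, word1, word1_tag, word2, word2_tag, score):
    if word1 not in MyClass.synset_cache:
        MyClass.synset_cache[word1] = {}
    if word1_tag not in MyClass.synset_cache[word1]:
        MyClass.synset_cache[word1][word1_tag] = {}
    if word2 not in MyClass.synset_cache[word1][word1_tag]:
        MyClass.synset_cache[word1][word1_tag][word2] = {}
    MyClass.synset_cache[word1][word1_tag][word2][word2_tag] = score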
Also, your main variable, synset_cache, is class-level (i.e. static). Did you mean for that to be the case?

Returning two values from pandas.rolling_apply

I am using pandas.rolling_apply to fit data to a distribution and get a value from it, but I also need it to report a rolling goodness of fit (specifically, a p-value). Currently I'm doing it like this:
from scipy.stats import genextreme, kstest
import pandas as pd

def func(sample):
    fit = genextreme.fit(sample)
    return genextreme.isf(0.9, *fit)

def p_value(sample):
    fit = genextreme.fit(sample)
    return kstest(sample, 'genextreme', fit)[1]

values = pd.rolling_apply(data, 30, func)
p_values = pd.rolling_apply(data, 30, p_value)
results = pd.DataFrame({'values': values, 'p_value': p_values})
The problem is that I have a lot of data, and the fit function is expensive, so I don't want to call it twice for every sample. What I'd rather do is something like this:
def func(sample):
    fit = genextreme.fit(sample)
    value = genextreme.isf(0.9, *fit)
    p_value = kstest(sample, 'genextreme', fit)[1]
    return {'value': value, 'p_value': p_value}

results = pd.rolling_apply(data, 30, func)
where results is a DataFrame with two columns. If I try to run this, I get an exception:
TypeError: a float is required
Is it possible to achieve this, and if so, how?
I had a similar problem and solved it by using a member function of a separate helper class during apply. That member function returns a single value, as required, but I store the other calculation results as members of the class and can use them afterwards.
Simple Example:
class CountCalls:
    def __init__(self):
        self.counter = 0

    def your_function(self, window):
        retval = f(window)   # f is whatever you actually compute on the window
        self.counter = self.counter + 1
        return retval

TestCounter = CountCalls()

pandas.Series.rolling(your_seriesOrDataframeColumn, window=your_window_size).apply(TestCounter.your_function)

print(TestCounter.counter)
Assume your function f returns a tuple of two values, v1 and v2. You can then return v1 from apply and assign it to a column column_v1 of your dataframe. The second value, v2, you simply accumulate in a Series series_val2 within the helper class. Afterwards you just assign that series as a new column of your dataframe, as in the sketch below.
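A minimal sketch of that idea, assuming f returns a (v1, v2) tuple and df is the dataframe being rolled over (all names here are illustrative):

import pandas as pd

class TwoValueHelper:
    def __init__(self):
        self.series_val2 = []          # accumulates the second return value per window

    def apply_v1(self, window):
        v1, v2 = f(window)             # f is the expensive two-value function
        self.series_val2.append(v2)
        return v1                      # rolling().apply() needs a single float back

helper = TwoValueHelper()
df['column_v1'] = df['x'].rolling(window=30).apply(helper.apply_v1, raw=True)

# the first window-1 rows have no result, so pad before assigning the second column
pad = [float('nan')] * (len(df) - len(helper.series_val2))
df['column_v2'] = pad + helper.series_val2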
I had a similar problem before. Here's my solution for it:
from collections import deque

class your_multi_output_function_class:
    def __init__(self):
        self.deque_2 = deque()
        self.deque_3 = deque()

    def f1(self, window):
        self.k = somefunction(window)   # computes all three values at once
        self.deque_2.append(self.k[1])
        self.deque_3.append(self.k[2])
        return self.k[0]

    def f2(self, window):
        return self.deque_2.popleft()

    def f3(self, window):
        return self.deque_3.popleft()

func = your_multi_output_function_class()

output = your_pandas_object.rolling(window=10).agg(
    {'a': func.f1, 'b': func.f2, 'c': func.f3}
)
I used and loved @yi-yu's answer, so I made it generic:
from collections import deque
from functools import partial

def make_class(func, dim_output):
    class your_multi_output_function_class:
        def __init__(self, func, dim_output):
            assert dim_output >= 2
            self.func = func
            self.deques = {i: deque() for i in range(1, dim_output)}

        def f0(self, *args, **kwargs):
            k = self.func(*args, **kwargs)
            for queue in sorted(self.deques):
                self.deques[queue].append(k[queue])
            return k[0]

        def accessor(self, index, *args, **kwargs):
            return self.deques[index].popleft()

    klass = your_multi_output_function_class(func, dim_output)

    for i in range(1, dim_output):
        f = partial(klass.accessor, i)
        setattr(klass, 'f' + str(i), f)

    return klass
and given a function f of a pandas Series (windowed or not) returning n values, you use it this way:
rolling_func = make_class(f, n)

# dict to map the function's outputs to new columns, e.g.:
agger = {'output_' + str(i): getattr(rolling_func, 'f' + str(i)) for i in range(n)}

windowed_series.agg(agger)
I also had the same issue. I solved it by creating a global data frame and feeding it from the rolling function. In the following example script, I generate some random input data. Then, with a single rolling apply function, I calculate the min, the max and the mean.
import pandas as pd
import numpy as np

global outputDF
global index

def myFunction(array):
    global index
    global outputDF

    # Some random operation
    outputDF['min'][index] = np.nanmin(array)
    outputDF['max'][index] = np.nanmax(array)
    outputDF['mean'][index] = np.nanmean(array)

    index += 1

    # Returning a useless variable
    return 0

if __name__ == "__main__":
    global outputDF
    global index

    # A random window size
    windowSize = 10

    # Preparing some random input data
    inputDF = pd.DataFrame({'randomValue': [np.nan] * 500})
    for i in range(len(inputDF)):
        inputDF['randomValue'].values[i] = np.random.rand()

    # Pre-allocate memory
    outputDF = pd.DataFrame({'min': [np.nan] * len(inputDF),
                             'max': [np.nan] * len(inputDF),
                             'mean': [np.nan] * len(inputDF)})

    # Compute the starting index (due to the window size)
    d = (windowSize - 1) / 2
    index = int(np.floor(d))

    # Do the rolling apply here
    inputDF['randomValue'].rolling(window=windowSize, center=True).apply(myFunction, args=())

    assert index + int(np.ceil(d)) == len(inputDF), 'Length mismatch'

    outputDF.index = inputDF.index

    # Optional: drop the nulls
    outputDF.dropna(inplace=True)

    print(outputDF)
