E.g. if I have
import polars as pl
df = pl.DataFrame({'a': [1,2,3], 'b': [4,5,6]})
how would I find the cumulative sum of each row?
Expected output:
a b
0 1 5
1 2 7
2 3 9
Here's the equivalent in pandas:
>>> import pandas as pd
>>> pd.DataFrame({'a': [1,2,3], 'b': [4,5,6]}).cumsum(axis=1)
a b
0 1 5
1 2 7
2 3 9
but I can't figure out how to do it in polars
Edit: Polars 0.14.18 and later
As of Polars 0.14.18, we can use the new polars.cumsum function to simplify this. (Note: this is slightly different than the polars.Expr.cumsum Expression, in that it acts as a root Expression.)
Using the same DataFrame as below:
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.select([
pl.exclude(my_cols),
pl.cumsum(my_cols).alias('result')
])
.unnest('result')
)
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 5 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 7 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 9 ┆ 18 │
└─────┴─────┴─────┴─────┘
Before Polars 0.14.18
Polars is column-oriented, and as such does not have the concept of a axis. Still, we can use the list evaluation context to solve this.
First, let's expand you data slightly:
df = pl.DataFrame({
"id": ['a', 'b', 'c'],
"a": [1, 2, 3],
"b": [4, 5, 6],
"c": [7, 8, 9],
})
df
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 4 ┆ 7 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 │
└─────┴─────┴─────┴─────┘
The Algorithm
Here's a general-purpose performant algorithm that will solve this. We'll walk through it below.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.arr.to_struct(name_generator=lambda idx: my_cols[idx])
.alias('result')
)
.drop(my_cols)
.unnest('result')
)
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 5 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 7 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 9 ┆ 18 │
└─────┴─────┴─────┴─────┘
How it works
First, we'll select the names of the numeric columns. You can name these explicitly if you like, e.g., my_cols=['a','b','c'].
Next, we'll gather up the column values into a list using polars.concat_list.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.alias('result')
)
)
shape: (3, 5)
┌─────┬─────┬─────┬─────┬───────────┐
│ id ┆ a ┆ b ┆ c ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ list[i64] │
╞═════╪═════╪═════╪═════╪═══════════╡
│ a ┆ 1 ┆ 4 ┆ 7 ┆ [1, 4, 7] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 ┆ [2, 5, 8] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 ┆ [3, 6, 9] │
└─────┴─────┴─────┴─────┴───────────┘
From here, we'll use the arr.eval context to run our cumsum on the list.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.alias('result')
)
)
shape: (3, 5)
┌─────┬─────┬─────┬─────┬────────────┐
│ id ┆ a ┆ b ┆ c ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ list[i64] │
╞═════╪═════╪═════╪═════╪════════════╡
│ a ┆ 1 ┆ 4 ┆ 7 ┆ [1, 5, 12] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 ┆ [2, 7, 15] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 ┆ [3, 9, 18] │
└─────┴─────┴─────┴─────┴────────────┘
Next, we'll break the list into a struct using arr.to_struct, and name the fields the corresponding names from our selected numeric columns.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.arr.to_struct(name_generator=lambda idx: my_cols[idx])
.alias('result')
)
)
shape: (3, 5)
┌─────┬─────┬─────┬─────┬───────────┐
│ id ┆ a ┆ b ┆ c ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ struct[3] │
╞═════╪═════╪═════╪═════╪═══════════╡
│ a ┆ 1 ┆ 4 ┆ 7 ┆ {1,5,12} │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 ┆ {2,7,15} │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 ┆ {3,9,18} │
└─────┴─────┴─────┴─────┴───────────┘
And finally, we'll use unnest to break the struct into columns. (But first we must drop the original columns or else we'll get two columns with the same name.)
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.arr.to_struct(name_generator=lambda idx: my_cols[idx])
.alias('result')
)
.drop(my_cols)
.unnest('result')
)
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 5 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 7 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 9 ┆ 18 │
└─────┴─────┴─────┴─────┘
There may be a simpler and faster way, but here is the programmatic solution.
Concatenate the values along the columns into a list
Calculate the cumulative sum over the list (the result is still a list)
Get values for each column in the result
import polars as pl
df = pl.DataFrame({'a': [1,2,3], 'b': [4,5,6]})
df.select([
pl.concat_list(pl.all())
.arr.eval(pl.element().cumsum())
.alias('cs')
]).select([
pl.col('cs').arr.get(i).alias(name)
for i, name in enumerate(df.columns)
])
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 5 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 9 │
└─────┴─────┘
I have a Polars dataframe:
┌───────────┬────────────┬────────────┬────────────┬
│ Name ┆ Purchase ┆ Size ┆ Color ┆
│ ┆ Time ┆ ┆ ┆
│ --- ┆ --- ┆ --- ┆ ┆
│ str ┆ datetime[μ ┆ i64 ┆ --- ┆
│ ┆ s] ┆ ┆ str ┆
╞═══════════╪════════════╪════════════╪════════════╪
│ T-Shirt ┆ 2022-02-14 ┆ 12 ┆ Blue ┆
│ ┆ 14:40:09.1 ┆ ┆ ┆
│ ┆ 00 ┆ ┆ ┆
└───────────┴────────────┴────────────┴────────────
And I would like to convert each row of this dataframe into an object which contains information extracted from the rows, e.g:
PurchasedObject(Name, Size, Color)
Is it possible with polars to create a new column in the dataframe, which contains for each row the corresponding object?
Which would be the best way to achieve this in Polars?
Thank you!
Polars has a struct datatype that can be used to pack columns together in a single datatype.
import polars as pl
from datetime import datetime
pl.DataFrame({
"name": ["t-shirt"],
"purchased": [datetime(2022, 2, 4)],
"size": [12],
"color": ["blue"]
}).with_column(
pl.struct(["name", "size", "color"]).alias("purchased_struct")
)
shape: (1, 5)
┌─────────┬─────────────────────┬──────┬───────┬───────────────────────┐
│ name ┆ purchased ┆ size ┆ color ┆ purchased_struct │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ datetime[μs] ┆ i64 ┆ str ┆ struct[3] │
╞═════════╪═════════════════════╪══════╪═══════╪═══════════════════════╡
│ t-shirt ┆ 2022-02-04 00:00:00 ┆ 12 ┆ blue ┆ {"t-shirt",12,"blue"} │
└─────────┴─────────────────────┴──────┴───────┴───────────────────────┘
So I have a Polars dataframe looking as such
df = pl.DataFrame(
{
"ItemId": [15148, 15148, 24957],
"SuffixFactor": [19200, 200, 24],
"ItemRand": [254, -1, -44],
"Stat0": ['+5 Defense', '+$i Might', '+9 Vitality'],
"Amount": ['', '7', '']
}
)
I want to replace $i in the column "Stat0" with Amount whenever Stat0 contains i$
I have tried a couple different things such as:
df = df.with_column(
pl.col('Stat0').str.replace(r'\$i', pl.col('Amount'))
)
Expected result
result = pl.DataFrame(
{
"ItemId": [15148, 15148, 24957],
"SuffixFactor": [19200, 200, 24],
"ItemRand": [254, -1, -44],
"Stat0": ['+5 Defense', '+7 Might', '+9 Vitality'],
"Amount": ['', '7', '']
}
)
But this doesn't seem to work.
I hope someone can help.
Best regards
Edit: Polars >= 0.14.4
As of Polars 0.14.4, the replace and replace_all expressions allow an Expression for the value parameter. Thus, we can solve this more simply as:
df.with_column(
pl.col('Stat0').str.replace(r'\$i', pl.col('Amount'))
)
shape: (3, 5)
┌────────┬──────────────┬──────────┬─────────────┬────────┐
│ ItemId ┆ SuffixFactor ┆ ItemRand ┆ Stat0 ┆ Amount │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str ┆ str │
╞════════╪══════════════╪══════════╪═════════════╪════════╡
│ 15148 ┆ 19200 ┆ 254 ┆ +5 Defense ┆ │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 15148 ┆ 200 ┆ -1 ┆ +7 Might ┆ 7 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 24957 ┆ 24 ┆ -44 ┆ +9 Vitality ┆ │
└────────┴──────────────┴──────────┴─────────────┴────────┘
Polars < 0.14.4
The problem is that the replace method does not take an Expression, only a constant. Thus, we cannot use a column as replacement values, only a constant.
We can get around this in two ways.
Slow: using apply
This method uses python code to perform the replacement. Since we are executing python bytecode using apply, it will be slow. If your DataFrame is small, then this won't be too painfully slow.
(
df
.with_column(
pl.struct(['Stat0', 'Amount'])
.apply(lambda cols: cols['Stat0'].replace('$i', cols['Amount']))
.alias('Stat0')
)
)
shape: (3, 5)
┌────────┬──────────────┬──────────┬─────────────┬────────┐
│ ItemId ┆ SuffixFactor ┆ ItemRand ┆ Stat0 ┆ Amount │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str ┆ str │
╞════════╪══════════════╪══════════╪═════════════╪════════╡
│ 15148 ┆ 19200 ┆ 254 ┆ +5 Defense ┆ │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 15148 ┆ 200 ┆ -1 ┆ +7 Might ┆ 7 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 24957 ┆ 24 ┆ -44 ┆ +9 Vitality ┆ │
└────────┴──────────────┴──────────┴─────────────┴────────┘
Fast: using split_exact and when/then/otherwise
This method uses all Polars Expressions. As such, it will be much faster, especially for large DataFrames.
(
df
.with_column(
pl.col('Stat0').str.split_exact('$i', 1)
)
.unnest('Stat0')
.with_column(
pl.when(pl.col('field_1').is_null())
.then(pl.col('field_0'))
.otherwise(pl.concat_str(['field_0', 'Amount', 'field_1']))
.alias('Stat0')
)
.drop(['field_0', 'field_1'])
)
shape: (3, 5)
┌────────┬──────────────┬──────────┬────────┬─────────────┐
│ ItemId ┆ SuffixFactor ┆ ItemRand ┆ Amount ┆ Stat0 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str ┆ str │
╞════════╪══════════════╪══════════╪════════╪═════════════╡
│ 15148 ┆ 19200 ┆ 254 ┆ ┆ +5 Defense │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 15148 ┆ 200 ┆ -1 ┆ 7 ┆ +7 Might │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 24957 ┆ 24 ┆ -44 ┆ ┆ +9 Vitality │
└────────┴──────────────┴──────────┴────────┴─────────────┘
How it works: we first split the Stat0 column on $i using split_exact. This will produce a struct.
(
df
.with_column(
pl.col('Stat0').str.split_exact('$i', 1)
)
)
shape: (3, 5)
┌────────┬──────────────┬──────────┬──────────────────────┬────────┐
│ ItemId ┆ SuffixFactor ┆ ItemRand ┆ Stat0 ┆ Amount │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ struct[2] ┆ str │
╞════════╪══════════════╪══════════╪══════════════════════╪════════╡
│ 15148 ┆ 19200 ┆ 254 ┆ {"+5 Defense",null} ┆ │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 15148 ┆ 200 ┆ -1 ┆ {"+"," Might"} ┆ 7 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 24957 ┆ 24 ┆ -44 ┆ {"+9 Vitality",null} ┆ │
└────────┴──────────────┴──────────┴──────────────────────┴────────┘
Notice that when Stat0 does not contain $i, the second member of the struct is null. We'll use this fact to our advantage.
In the next step, we break the struct into separate columns, using unnest.
(
df
.with_column(
pl.col('Stat0').str.split_exact('$i', 1)
)
.unnest('Stat0')
)
shape: (3, 6)
┌────────┬──────────────┬──────────┬─────────────┬─────────┬────────┐
│ ItemId ┆ SuffixFactor ┆ ItemRand ┆ field_0 ┆ field_1 ┆ Amount │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str ┆ str ┆ str │
╞════════╪══════════════╪══════════╪═════════════╪═════════╪════════╡
│ 15148 ┆ 19200 ┆ 254 ┆ +5 Defense ┆ null ┆ │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 15148 ┆ 200 ┆ -1 ┆ + ┆ Might ┆ 7 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 24957 ┆ 24 ┆ -44 ┆ +9 Vitality ┆ null ┆ │
└────────┴──────────────┴──────────┴─────────────┴─────────┴────────┘
This creates two new columns: field_0 and field_1.
From here, we use when/then/otherwise and concat_str to construct the final result
Basically:
when $i does not appear in the Stat0 column, then the string is not split, and field_1 is null, so we can use the value in field_0 as is.
when $i does appear in Stat0, then the string is split into two parts: field_0 and field_1. We simply concatenate the parts back together, putting Amount in the middle.
(
df
.with_column(
pl.col('Stat0').str.split_exact('$i', 1)
)
.unnest('Stat0')
.with_column(
pl.when(pl.col('field_1').is_null())
.then(pl.col('field_0'))
.otherwise(pl.concat_str(['field_0', 'Amount', 'field_1']))
.alias('Stat0')
)
)
shape: (3, 7)
┌────────┬──────────────┬──────────┬─────────────┬─────────┬────────┬─────────────┐
│ ItemId ┆ SuffixFactor ┆ ItemRand ┆ field_0 ┆ field_1 ┆ Amount ┆ Stat0 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str ┆ str ┆ str ┆ str │
╞════════╪══════════════╪══════════╪═════════════╪═════════╪════════╪═════════════╡
│ 15148 ┆ 19200 ┆ 254 ┆ +5 Defense ┆ null ┆ ┆ +5 Defense │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 15148 ┆ 200 ┆ -1 ┆ + ┆ Might ┆ 7 ┆ +7 Might │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 24957 ┆ 24 ┆ -44 ┆ +9 Vitality ┆ null ┆ ┆ +9 Vitality │
└────────┴──────────────┴──────────┴─────────────┴─────────┴────────┴─────────────┘
I have a data frame as follow
┌────────────┬──────────┬──────────┬──────────┬──────────┐
│ time ┆ open ┆ high ┆ low ┆ close │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════╪══════════╪══════════╪══════════╪══════════╡
│ 1649016000 ┆ 46405.49 ┆ 47444.11 ┆ 46248.84 ┆ 46407.35 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1649030400 ┆ 46407.36 ┆ 46461.14 ┆ 45744.77 ┆ 46005.44 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1649044800 ┆ 46005.43 ┆ 46293.38 ┆ 45834.39 ┆ 46173.99 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1649059200 ┆ 46174.0 ┆ 46287.97 ┆ 45787.0 ┆ 46160.09 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ ... ┆ ... ┆ ... ┆ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1653278400 ┆ 30171.32 ┆ 30670.51 ┆ 30101.07 ┆ 30457.01 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1653292800 ┆ 30457.01 ┆ 30616.18 ┆ 30281.89 ┆ 30397.11 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1653307200 ┆ 30397.12 ┆ 30625.98 ┆ 29967.07 ┆ 30373.53 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1653321600 ┆ 30373.53 ┆ 30529.9 ┆ 30042.09 ┆ 30121.02 │
└────────────┴──────────┴──────────┴──────────┴──────────┘
I want to count how many times each price (low and high) were local minimum/maximum in a window range of 2 to 50.
first I add two columns for count of being local min/max for each row and fill by zeros
raw_data["lmin_count"] = np.zeros(len(raw_data), dtype=np.int16)
raw_data["lmax_count"] = np.zeros(len(raw_data), dtype=np.int16)
then I iterate window length from 2 to 50 and find index of each local min/max by using:
for _order in range(2, 51):
local_minima = argrelextrema(raw_data["low"].to_numpy(), np.less, order=_order)[0]
local_maxima = argrelextrema(raw_data["high"].to_numpy(), np.greater, order=_order)[0]
which order is window length.
and in each iteration over window length I want to increase value of lmin_count and lmax_count by indices found in local_minima and local_maxima
I tried increasing value by this code:
if len(local_minima) > 1:
raw_data[local_minima,5] += 1
if len(local_maxima) > 1:
raw_data[local_minima,6] += 1
which local_minima and local_maxima are array of indices and 5,6 are index of lmin_count and lmax_count columns.
but got error not implemented.
So what is the best way to increase (or assign) value of column by row indices?
Update 2022/05/24
As answers were very helpful now I have other issues.
I changed my code as follow:
min_expr_list = [
(
pl.col("price").rolling_min(
window_size=_order * 2 + 1, min_periods=_order + 2, center=True
)
== pl.col("price")
).cast(pl.UInt32)
for _order in range(200, 1001)
]
max_expr_list = [
(
pl.col("price").rolling_max(
window_size=_order * 2 + 1, min_periods=_order + 2, center=True
)
== pl.col("price")
).cast(pl.UInt32)
for _order in range(200, 1001)
]
raw_data = raw_data.with_columns(
[
pl.sum(min_expr_list).alias("min_freq"),
pl.sum(max_expr_list).alias("max_freq"),
]
)
first: is it possible to merge both min_expr_list and max_expr_list into one list? and if it is possible, in with_columns expression how can I add separate columns based on each element of list?
another issue I am facing is memory usage of this approach.
In previous example _order were limited but in action it is more wider than example.
currently I have datasets with millions of records (some of them have more than 10 million records) and _orders range can be from 2 to 1500 so calculating needs lots of GB of ram.
is there any better way to do that?
and one more side problem. when increasing _order to more than 1000 it seems it doesn't work. is there any limitation in source code?
Let me see if we can build on #ritchie46 response and nudge you closer to the finish line.
Data
I've concatenated the 'open', 'high', and 'low' columns in your sample data, just to give us some data to work with. I've also added a row_nr column, just for discussion. (It won't be used in any calculations whatsoever, so you don't need to include it in your final code.)
import numpy as np
import polars as pl
from scipy.signal import argrelextrema
df = pl.DataFrame(
{
"col1": [
46405.49, 46407.36, 46005.43, 46174.00, 30171.32, 30457.01, 30397.12, 30373.53,
47444.11, 46461.14, 46293.38, 46287.97, 30670.51, 30616.18, 30625.98, 30529.90,
46248.84, 45744.77, 45834.39, 45787.00, 30101.07, 30281.89, 29967.07, 30042.09,
]
}
).with_row_count()
df
shape: (24, 2)
┌────────┬──────────┐
│ row_nr ┆ col1 │
│ --- ┆ --- │
│ u32 ┆ f64 │
╞════════╪══════════╡
│ 0 ┆ 46405.49 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 46407.36 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 46005.43 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 46174.0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 30457.01 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 30397.12 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 30373.53 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 47444.11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ 46461.14 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ 46293.38 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 11 ┆ 46287.97 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 12 ┆ 30670.51 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 13 ┆ 30616.18 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 14 ┆ 30625.98 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 15 ┆ 30529.9 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 16 ┆ 46248.84 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 17 ┆ 45744.77 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 18 ┆ 45834.39 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 19 ┆ 45787.0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 20 ┆ 30101.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 21 ┆ 30281.89 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 22 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 23 ┆ 30042.09 │
└────────┴──────────┘
Now, let's run the scipy.signal.argrelextrema code on this data.
for _order in range(1, 7):
print(
"order:", _order, ":", argrelextrema(df["col1"].to_numpy(), np.less, order=_order)
)
order: 1 : (array([ 2, 4, 7, 13, 15, 17, 20, 22]),)
order: 2 : (array([ 4, 7, 15, 22]),)
order: 3 : (array([ 4, 15, 22]),)
order: 4 : (array([ 4, 15, 22]),)
order: 5 : (array([ 4, 22]),)
order: 6 : (array([ 4, 22]),)
From the output, it looks like you're trying to find the index of any row that is the minimum value of a window centered on that row, for various window sizes.
For example, row_nr 2 is a local minimum of a window of size 3, centered on row_nr 2. (Here, order=1 in the call to argrelextrema means "including one value above and below", and hence "window size" = (order * 2) + 1) = 3.
Let's replicate this in Polars. We'll take it in steps.
rolling_min
First, let's use the rolling_min expression to calculate rolling minimums corresponding to order from 1 to 6. Notice that Polars allows us to generate a list of expressions outside of the with_columns context. (This often helps keep code more readable.)
I'm converting the scipy order keyword to the equivalent window_size for rolling_min. Also, I'm setting the min_periods to make sure that there is at least one value on each side of the center value of any window (to replicate the scipy calculations).
expr_list = [
pl.col("col1").rolling_min(
window_size=_order * 2 + 1,
min_periods=_order + 2,
center=True
).alias("roll_min" + str(_order))
for _order in range(1, 7)
]
df.with_columns(expr_list)
shape: (24, 8)
┌────────┬──────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ row_nr ┆ col1 ┆ roll_min1 ┆ roll_min2 ┆ roll_min3 ┆ roll_min4 ┆ roll_min5 ┆ roll_min6 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0 ┆ 46405.49 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 46407.36 ┆ 46005.43 ┆ 46005.43 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 46005.43 ┆ 46005.43 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 46174.0 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 30457.01 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 30397.12 ┆ 30373.53 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 30373.53 ┆ 30373.53 ┆ 30373.53 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 47444.11 ┆ 30373.53 ┆ 30373.53 ┆ 30373.53 ┆ 30171.32 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ 46461.14 ┆ 46293.38 ┆ 30373.53 ┆ 30373.53 ┆ 30373.53 ┆ 30171.32 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ 46293.38 ┆ 46287.97 ┆ 30670.51 ┆ 30373.53 ┆ 30373.53 ┆ 30373.53 ┆ 30171.32 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 11 ┆ 46287.97 ┆ 30670.51 ┆ 30616.18 ┆ 30616.18 ┆ 30373.53 ┆ 30373.53 ┆ 30373.53 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 12 ┆ 30670.51 ┆ 30616.18 ┆ 30616.18 ┆ 30529.9 ┆ 30529.9 ┆ 30373.53 ┆ 30373.53 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 13 ┆ 30616.18 ┆ 30616.18 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30373.53 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 14 ┆ 30625.98 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30101.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 15 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30101.07 ┆ 30101.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 16 ┆ 46248.84 ┆ 30529.9 ┆ 30529.9 ┆ 30529.9 ┆ 30101.07 ┆ 30101.07 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 17 ┆ 45744.77 ┆ 45744.77 ┆ 30529.9 ┆ 30101.07 ┆ 30101.07 ┆ 29967.07 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 18 ┆ 45834.39 ┆ 45744.77 ┆ 30101.07 ┆ 30101.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 19 ┆ 45787.0 ┆ 30101.07 ┆ 30101.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 20 ┆ 30101.07 ┆ 30101.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 21 ┆ 30281.89 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 22 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 ┆ 29967.07 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 23 ┆ 30042.09 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │
└────────┴──────────┴───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘
Looking at the output in roll_min_1 (the equivalent of an order=1 call for argrelextrema), we see that the values in roll_min_1 equal the values in col1 for row_nr 2, 4, 7, 13, 15, 17, 20, 22 ... which corresponds exactly to the output of argrelextrema for order=1. Likewise, for the other roll_min_X columns. We'll use this fact in the next step.
Obtaining the row index
As #ritchie46 points out, in Polars, we use conditions (not indexing). We'll modify the above code to identify whether the value in col1 equals it's rolling min, for each of our window sizes.
expr_list = [
(
pl.col("col1").rolling_min(
window_size=_order * 2 + 1,
min_periods=_order + 2,
center=True
)
== pl.col("col1")
).alias("min_idx_" + str(_order))
for _order in range(1, 7)
]
df.with_columns(expr_list)
shape: (24, 8)
┌────────┬──────────┬───────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
│ row_nr ┆ col1 ┆ min_idx_1 ┆ min_idx_2 ┆ min_idx_3 ┆ min_idx_4 ┆ min_idx_5 ┆ min_idx_6 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ f64 ┆ bool ┆ bool ┆ bool ┆ bool ┆ bool ┆ bool │
╞════════╪══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0 ┆ 46405.49 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 46407.36 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 46005.43 ┆ true ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 46174.0 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 30171.32 ┆ true ┆ true ┆ true ┆ true ┆ true ┆ true │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 30457.01 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 30397.12 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 30373.53 ┆ true ┆ true ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 47444.11 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ 46461.14 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ 46293.38 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 11 ┆ 46287.97 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 12 ┆ 30670.51 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 13 ┆ 30616.18 ┆ true ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 14 ┆ 30625.98 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 15 ┆ 30529.9 ┆ true ┆ true ┆ true ┆ true ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 16 ┆ 46248.84 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 17 ┆ 45744.77 ┆ true ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 18 ┆ 45834.39 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 19 ┆ 45787.0 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 20 ┆ 30101.07 ┆ true ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 21 ┆ 30281.89 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 22 ┆ 29967.07 ┆ true ┆ true ┆ true ┆ true ┆ true ┆ true │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 23 ┆ 30042.09 ┆ false ┆ false ┆ false ┆ false ┆ false ┆ false │
└────────┴──────────┴───────────┴───────────┴───────────┴───────────┴───────────┴───────────┘
Notice that for min_idx_1, the values are true for row_nr 2, 4, 7, 13, 15, 17, 20, 22, which corresponds to the output of argrelextrema for order=1. Likewise, for the other columns.
Summing
We can now use the cast function and the polars.sum function to sum row-wise across our columns. (Indeed, we won't keep our rolling min columns -- we'll just keep the sums).
expr_list = [
(
pl.col("col1").rolling_min(
window_size=_order * 2 + 1,
min_periods=_order + 2,
center=True
)
== pl.col("col1")
).cast(pl.UInt32)
for _order in range(1, 7)
]
df.with_columns(pl.sum(expr_list).alias("min_freq"))
┌────────┬──────────┬──────────┐
│ row_nr ┆ col1 ┆ min_freq │
│ --- ┆ --- ┆ --- │
│ u32 ┆ f64 ┆ u32 │
╞════════╪══════════╪══════════╡
│ 0 ┆ 46405.49 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 46407.36 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 46005.43 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 46174.0 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 30171.32 ┆ 6 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 30457.01 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 30397.12 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 30373.53 ┆ 2 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 47444.11 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ 46461.14 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ 46293.38 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 11 ┆ 46287.97 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 12 ┆ 30670.51 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 13 ┆ 30616.18 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 14 ┆ 30625.98 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 15 ┆ 30529.9 ┆ 4 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 16 ┆ 46248.84 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 17 ┆ 45744.77 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 18 ┆ 45834.39 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 19 ┆ 45787.0 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 20 ┆ 30101.07 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 21 ┆ 30281.89 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 22 ┆ 29967.07 ┆ 6 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 23 ┆ 30042.09 ┆ 0 │
└────────┴──────────┴──────────┘
I believe this is the result you were looking to obtain.
From here, I think you can expand the above code for rolling maximums.
Ties
One difference between this code and the argrelextrema code pertains to ties. If two values tie for the minimum in any window, argrelextrema considers neither to be the minimum for the window. The code above considers both to be minimum values.
I'm not sure how likely this will be for the size of windows you have, or for the type of data.
Please update Polars to 0.13.38
The latest release of Polars contains some major improvements to the performance of rolling functions. (The announcement is on this Twitter thread.) You'll want to take advantage of that by updating to the latest version.
Update - 2022/05/24
Merging all lists into one expression
first: is it possible to merge both min_expr_list and max_expr_list into one list? and if it is possible, in with_columns expression how can I add separate columns based on each element of list?
It is possible to generate all columns (min and max, for each order, for each variable) using a single list in a single with_columns context. The calculations are independent, and thus can be in the same with_columns context. Each column would have a name that could be used in a later calculation step.
But in that case, the accumulation steps would need to be in a separate with_columns expression. The with_columns context assumes all calculations are independent - and that they can be run in any order, without dependencies. But summarizing columns (by selecting them by name) is dependent on those columns first being created.
You can return multiple Series from a single function if you return them as a Series of struct (e.g., using map) .. but that's generally to be avoided. (And beyond the scope of our question here.)
And more specifically for this problem, we are dealing with issues of memory pressure. So, in this case, we'll need to move in the opposite direction - we'll need to break lists into smaller pieces and feed them to the with_columns expression in batches.
Batching the calculations by order
currently I have datasets with millions of records (some of them have more than 10 million records) and _orders range can be from 2 to 1500 so calculating needs lots of GB of ram.
is there any better way to do that?
We'll try a couple of techniques to reduce memory pressure ... and still achieve good performance.
One technique will be to use the fold method to accumulate values. This allows us to sum boolean values without having to cast every intermediate column to integers. This should reduce memory pressure during the intermediate calculations.
We'll also batch our calculations by breaking the expression lists into sub-lists, calculating intermediate results, and accumulating into an accumulator column using the fold method.
First, let's eliminate the cast to integer in your min_expr_list.
min_expr_list = [
(
pl.col("price").rolling_min(
window_size=_order * 2 + 1, min_periods=_order + 2, center=True
)
== pl.col("price")
)
for _order in range(1, 20)
]
Next we'll need to pick a batch_size and initialize an accumulator column. I would experiment with different batch_size numbers until you find one that seems to work well for your computing platform and size of dataset. Since we have limited data in this example, I'll pick a batch_size of 5 - just to demonstrate the algorithm.
batch_size = 5
df = df.with_column(pl.lit(0, dtype=pl.UInt32).alias("min_freq"))
Next, we'll iterate through the batches of sub-lists, and accumulate as we go.
while(min_expr_list):
next_batch, min_expr_list = min_expr_list[0: batch_size], min_expr_list[batch_size:]
df=(
df
.with_column(
pl.fold(
pl.col("min_freq"),
lambda acc, x: acc + x,
next_batch,
)
)
)
print(df)
shape: (24, 3)
┌────────┬──────────┬──────────┐
│ row_nr ┆ price ┆ min_freq │
│ --- ┆ --- ┆ --- │
│ u32 ┆ f64 ┆ u32 │
╞════════╪══════════╪══════════╡
│ 0 ┆ 46405.49 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 46407.36 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 46005.43 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 46174.0 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 30171.32 ┆ 15 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 30457.01 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 30397.12 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 30373.53 ┆ 2 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 47444.11 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ 46461.14 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ 46293.38 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 11 ┆ 46287.97 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 12 ┆ 30670.51 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 13 ┆ 30616.18 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 14 ┆ 30625.98 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 15 ┆ 30529.9 ┆ 4 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 16 ┆ 46248.84 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 17 ┆ 45744.77 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 18 ┆ 45834.39 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 19 ┆ 45787.0 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 20 ┆ 30101.07 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 21 ┆ 30281.89 ┆ 0 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 22 ┆ 29967.07 ┆ 19 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 23 ┆ 30042.09 ┆ 0 │
└────────┴──────────┴──────────┘
Problems with rolling_min when order is 1,000 or more
and one more side problem. when increasing _order to more than 1000 it seems it doesn't work. is there any limitation in source code?
I generated datasets of 50 million random numbers and tested the rolling_min for order sizes of 1,500 .. and found no problems. Indeed, I replicated the algorithm using a rolling slice and found no errors.
But I have a hunch about what might be happening. Let's start with this dataset of 10 records:
df = pl.DataFrame(
{
"col1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
}
)
df
shape: (10, 1)
┌──────┐
│ col1 │
│ --- │
│ i64 │
╞══════╡
│ 1 │
├╌╌╌╌╌╌┤
│ 2 │
├╌╌╌╌╌╌┤
│ 3 │
├╌╌╌╌╌╌┤
│ 4 │
├╌╌╌╌╌╌┤
│ 5 │
├╌╌╌╌╌╌┤
│ 6 │
├╌╌╌╌╌╌┤
│ 7 │
├╌╌╌╌╌╌┤
│ 8 │
├╌╌╌╌╌╌┤
│ 9 │
├╌╌╌╌╌╌┤
│ 10 │
└──────┘
If we set _order = 8 and run the algorithm, we get a result.
_order = 8
df = df.with_column(
pl.col("col1")
.rolling_min(window_size=(2 * _order) + 1, min_periods=(_order + 2), center=True)
.alias("rolling_min")
)
df
shape: (10, 2)
┌──────┬─────────────┐
│ col1 ┆ rolling_min │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪═════════════╡
│ 1 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ null │
└──────┴─────────────┘
However, if we set _order=9, we get all null values:
shape: (10, 2)
┌──────┬─────────────┐
│ col1 ┆ rolling_min │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞══════╪═════════════╡
│ 1 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 8 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 10 ┆ null │
└──────┴─────────────┘
Is this what you're seeing? The reason for the null values is the min_period value. We set min_period = _order + 2, which in this case is 11. However, there are only 10 values in the dataset. Thus, we get all null values.
Perhaps this is what is happening in your data?
You write very imperative code which is not really idiomatic polars. You typically should not even know where the index of a value is. Instead you assign by conditions, for instance with when(condition) -> then(value) -> otherwise(value) expressions.
Your condition in a when can still refer to an index. This snippet for instance is equal to assigning to a specific index, but then written more functional:
pl.DataFrame({
"letters": ["a", "b", "c", "d"]
}).with_column(
# use a condition to determine the index location
pl.when(pl.arange(0, pl.count()) == 2)
.then("idx_2")
.otherwise("letters").alias("letters")
)
shape: (4, 1)
┌─────────┐
│ letters │
│ --- │
│ str │
╞═════════╡
│ letters │
├╌╌╌╌╌╌╌╌╌┤
│ letters │
├╌╌╌╌╌╌╌╌╌┤
│ idx_2 │
├╌╌╌╌╌╌╌╌╌┤
│ letters │
└─────────┘
Your intent, counting local minima/maxima
To help you in your cause, I'd like to show how you can find your local minima in an idiomatic polars way.
The local minima/maxima can be found by:
taking the derivative dy/dx of a function x.
computing the sign of that derivative tells us where the functions slope is increasing and decreasing.
if we take the derivative of the sign of dy/dx we know where the signs are changing and thus where we have local minima/maxima.
Let's try this on a dummy DataFrame.
df = pl.DataFrame({
"x": [8, 4, 2, 7, 9, 6, 3, 0]
})
# find the local minima/maxima
df = df.with_columns([
(pl.col("x").diff().sign().diff().shift(-1) == -2).alias("local_maximum"),
(pl.col("x").diff().sign().diff().shift(-1) == 2).alias("local_minimum")
])
print(df)
shape: (8, 3)
┌─────┬───────────────┬───────────────┐
│ x ┆ local_maximum ┆ local_minimum │
│ --- ┆ --- ┆ --- │
│ i64 ┆ bool ┆ bool │
╞═════╪═══════════════╪═══════════════╡
│ 8 ┆ false ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ false ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ false ┆ true │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ false ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ true ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ false ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ false ┆ false │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0 ┆ false ┆ false │
└─────┴───────────────┴───────────────┘
Next we can take the cumulative sum to count the total seen local minima and maxima.
df.with_columns([
pl.col("local_maximum").cumsum().alias("local_max_count"),
pl.col("local_minimum").cumsum().alias("local_min_count")
])
shape: (8, 5)
┌─────┬───────────────┬───────────────┬─────────────────┬─────────────────┐
│ x ┆ local_maximum ┆ local_minimum ┆ local_max_count ┆ local_min_count │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ bool ┆ bool ┆ u32 ┆ u32 │
╞═════╪═══════════════╪═══════════════╪═════════════════╪═════════════════╡
│ 8 ┆ false ┆ false ┆ 0 ┆ 0 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ false ┆ false ┆ 0 ┆ 0 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ false ┆ true ┆ 0 ┆ 1 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ false ┆ false ┆ 0 ┆ 1 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 9 ┆ true ┆ false ┆ 1 ┆ 1 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ false ┆ false ┆ 1 ┆ 1 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ false ┆ false ┆ 1 ┆ 1 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 0 ┆ false ┆ false ┆ 1 ┆ 1 │
└─────┴───────────────┴───────────────┴─────────────────┴─────────────────┘
I hope this helps nudge you in the right direction.