How to use apply better in Polars? - python

I have a polars dataframe illustrated as follows.
import polars as pl
df = pl.DataFrame(
{
"a": [1, 4, 3, 2, 8, 4, 5, 6],
"b": [2, 3, 1, 3, 9, 7, 6, 8],
"c": [1, 1, 1, 1, 2, 2, 2, 2],
}
)
The task I have is
groupby column "c"
for each group, check whether every value in column "a" is less than the corresponding value in column "b".
If so, just return column "a" unchanged in the groupby context.
Otherwise, apply a third-party function called "convert", which takes two numpy arrays and returns a single numpy array of the same size. In that case I can convert columns "a" and "b" to numpy arrays, pass them to "convert", and return the resulting array (probably transformed into a polars Series first) in the groupby context.
So, for the example above, the output I want is as follows (exploded after groupby for better illustration).
shape: (8, 2)
┌─────┬─────┐
│ c ┆ a │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 1 │
│ 1 ┆ 3 │
│ 1 ┆ 1 │
│ 1 ┆ 2 │
│ 2 ┆ 8 │
│ 2 ┆ 4 │
│ 2 ┆ 5 │
│ 2 ┆ 6 │
└─────┴─────┘
With the assumption,
>>> import numpy as np
>>> convert(np.array([1, 4, 3, 2]), np.array([2, 3, 1, 3]))
np.array([1, 3, 1, 2])
# [1, 4, 3, 2] is from column a of df when column c is 1, and [2, 3, 1, 3] comes from column b of df when column c is 1.
# I have to apply my custom python function 'convert' for the c == 1 group, because not all values in a are smaller than those in b according to the task description above.
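For anyone who wants to run the snippets end to end, here is a hypothetical stand-in for convert (element-wise minimum happens to reproduce the example output above; the real third-party function may of course do something entirely different):
import numpy as np

def convert(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # hypothetical stand-in only: element-wise minimum matches the sample
    # output above, but it is NOT the actual third-party implementation
    return np.minimum(a, b)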
My question is: how am I supposed to implement this logic in a performant, polars-idiomatic way without sacrificing too much of the speed gained from the Rust engine and parallelization?
The reason I ask is that, from my understanding, using apply with a custom Python function will slow the program down, but in certain scenarios I will not need to resort to the third-party function at all. So, is there any way to get the best of both worlds: full polars performance where no third-party function is required, and a fallback to the third-party function only when necessary?

It sounds like you want to find matching groups:
(
df
.with_row_count()
.filter(
(pl.col("a") >= pl.col("b"))
.any()
.over("c"))
)
shape: (4, 4)
┌────────┬─────┬─────┬─────┐
│ row_nr | a | b | c │
│ --- | --- | --- | --- │
│ u32 | i64 | i64 | i64 │
╞════════╪═════╪═════╪═════╡
│ 0 | 1 | 2 | 1 │
│ 1 | 4 | 3 | 1 │
│ 2 | 3 | 1 | 1 │
│ 3 | 2 | 3 | 1 │
└────────┴─────┴─────┴─────┘
And apply your custom function over each group.
(
df
.with_row_count()
.filter(
(pl.col("a") >= pl.col("b"))
.any()
.over("c"))
.select(
pl.col("row_nr"),
pl.apply(
["a", "b"], # np.minimum is just for example purposes
lambda s: np.minimum(s[0], s[1]))
.over("c"))
)
shape: (4, 2)
┌────────┬─────┐
│ row_nr | a │
│ --- | --- │
│ u32 | i64 │
╞════════╪═════╡
│ 0 | 1 │
│ 1 | 3 │
│ 2 | 1 │
│ 3 | 2 │
└────────┴─────┘
(Note: there may be some useful information in How to Write Poisson CDF as Python Polars Expression regarding scipy/numpy ufuncs and potentially avoiding .apply())
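As a tiny illustration of that ufunc dispatch (a sketch; whether it helps here depends on whether your convert can be expressed as a ufunc): numpy ufuncs called on a polars expression return a new expression, so no Python-level .apply() is involved.
(
    df
    .with_columns(
        # hypothetical example: cap "a" at 5 via the np.minimum ufunc,
        # which dispatches to the expression engine rather than to .apply()
        np.minimum(pl.col("a"), 5).alias("a_capped"))
)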
You can then .join() the result back into the original data.
(
df
.with_row_count()
.join(
df
.with_row_count()
.filter(
(pl.col("a") >= pl.col("b"))
.any()
.over("c"))
.select(
pl.col("row_nr"),
pl.apply(
["a", "b"],
lambda s: np.minimum(s[0], s[1]))
.over("c")),
on="row_nr",
how="left")
)
shape: (8, 5)
┌────────┬─────┬─────┬─────┬─────────┐
│ row_nr | a | b | c | a_right │
│ --- | --- | --- | --- | --- │
│ u32 | i64 | i64 | i64 | i64 │
╞════════╪═════╪═════╪═════╪═════════╡
│ 0 | 1 | 2 | 1 | 1 │
│ 1 | 4 | 3 | 1 | 3 │
│ 2 | 3 | 1 | 1 | 1 │
│ 3 | 2 | 3 | 1 | 2 │
│ 4 | 8 | 9 | 2 | null │
│ 5 | 4 | 7 | 2 | null │
│ 6 | 5 | 6 | 2 | null │
│ 7 | 6 | 8 | 2 | null │
└────────┴─────┴─────┴─────┴─────────┘
You can then fill in the nulls.
.with_columns(
pl.col("a_right").fill_null(pl.col("a")))

Related

Replicate pandas ngroup behaviour in polars

I am currently trying to replicate ngroup behaviour in polars to get consecutive group indexes (the dataframe will be grouped over two columns). For the R crowd, this would be achieved in the dplyr world with dplyr::group_indices or the newer dplyr::cur_group_id.
As shown in the repro, I've tried a couple of avenues without much success; both approaches miss group sequentiality and merely return row counts by group.
Quick repro:
import polars as pl
import pandas as pd
df = pd.DataFrame(
{
"id": ["a", "a", "a", "a", "b", "b", "b", "b"],
"cat": [1, 1, 2, 2, 1, 1, 2, 2],
}
)
df_pl = pl.from_pandas(df)
print(df.groupby(["id", "cat"]).ngroup())
# This is the desired behaviour
# 0 0
# 1 0
# 2 1
# 3 1
# 4 2
# 5 2
# 6 3
# 7 3
print(df_pl.select(pl.count().over(["id", "cat"])))
# This is only counting observation by group
# ┌───────┐
# │ count │
# │ --- │
# │ u32 │
# ╞═══════╡
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# └───────┘
print(df_pl.groupby(["id", "cat"]).agg([pl.count().alias("test")]))
# shape: (4, 3)
# ┌─────┬─────┬──────┐
# │ id ┆ cat ┆ test │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ u32 │
# ╞═════╪═════╪══════╡
# │ a ┆ 1 ┆ 2 │
# │ a ┆ 2 ┆ 2 │
# │ b ┆ 1 ┆ 2 │
# │ b ┆ 2 ┆ 2 │
# └─────┴─────┴──────┘
Edit
As @jqurious points out, we can use rank for this:
(df.with_row_count("idx")
.select(
pl.first("idx").over(["id", "cat"]).rank("dense") - 1)
)
shape: (8, 1)
┌─────┐
│ idx │
│ --- │
│ u32 │
╞═════╡
│ 0 │
│ 0 │
│ 1 │
│ 1 │
│ 2 │
│ 2 │
│ 3 │
│ 3 │
└─────┘
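If you want to keep the original columns alongside the group index, the same expression can be used in with_columns (a small sketch, assuming df is the polars frame used in the snippet above):
(df.with_row_count("idx")
   .with_columns(
       # dense-rank the index of each group's first row, zero-based
       (pl.first("idx").over(["id", "cat"]).rank("dense") - 1).alias("ngroup"))
)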
The following might be more clear:
df = pl.DataFrame(
{
"id": ["a", "a", "a", "a", "b", "b", "b", "b"],
"cat": [1, 1, 2, 2, 1, 1, 2, 2],
}
)
(
# Add row count to each line to create an index.
df.with_row_count("idx")
# Group on id and cat column.
.groupby(
["id", "cat"],
maintain_order=True,
)
.agg(
# Create a list of all index positions per group.
pl.col("idx")
)
# Add a new row count for each group.
.with_row_count("ngroup")
# Expand idx list column to separate rows.
.explode("idx")
# Reorder columns.
.select(["idx", "ngroup", "id", "cat"])
# Optionally sort by original order.
.sort("idx")
)
┌─────┬────────┬─────┬─────┐
│ idx ┆ ngroup ┆ id ┆ cat │
│ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ u32 ┆ str ┆ i64 │
╞═════╪════════╪═════╪═════╡
│ 0 ┆ 0 ┆ a ┆ 1 │
│ 1 ┆ 0 ┆ a ┆ 1 │
│ 2 ┆ 1 ┆ a ┆ 2 │
│ 3 ┆ 1 ┆ a ┆ 2 │
│ 4 ┆ 2 ┆ b ┆ 1 │
│ 5 ┆ 2 ┆ b ┆ 1 │
│ 6 ┆ 3 ┆ b ┆ 2 │
│ 7 ┆ 3 ┆ b ┆ 2 │
└─────┴────────┴─────┴─────┘

Polars columns subtract order does not matter (apparently)

I would like to use polars, but I ran into a problem when trying to subtract a 1x3 numpy array from three columns of the DataFrame: it seems it does not matter in which order the subtraction is applied:
import numpy as np
import polars as pl
# create polars dataframe:
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df = pl.DataFrame(data, columns=['x', 'y', 'z']).with_columns(
pl.all().cast(pl.Float64)
)
# subtraction array:
arr = np.array([2, 5, 8], dtype=np.float64)
# subtract the array from the DataFrame
df.with_columns((
pl.col('x') - arr[0],
pl.col('y') - arr[1],
pl.col('z') - arr[2],
))
"""
This one is correct, the top row should be negative and the bottom row positive
| | x | y | z |
|---:|----:|----:|----:|
| 0 | -1 | -1 | -1 |
| 1 | 0 | 0 | 0 |
| 2 | 1 | 1 | 1 |
"""
df.with_columns((
arr[0] - pl.col('x'),
arr[1] - pl.col('y'),
arr[2] - pl.col('z'),
))
"""
This one is incorrect. The top row should be positive and the bottom row should
be negative.
| | x | y | z |
|---:|----:|----:|----:|
| 0 | -1 | -1 | -1 |
| 1 | 0 | 0 | 0 |
| 2 | 1 | 1 | 1 |
"""
Can't reproduce this, looks fine to me as of 0.16.5:
In [57]: df.with_columns((
...: pl.col('x') - arr[0],
...: pl.col('y') - arr[1],
...: pl.col('z') - arr[2],
...: ))
...:
Out[57]:
shape: (3, 3)
┌──────┬──────┬──────┐
│ x ┆ y ┆ z │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╡
│ -1.0 ┆ -1.0 ┆ -1.0 │
│ 0.0 ┆ 0.0 ┆ 0.0 │
│ 1.0 ┆ 1.0 ┆ 1.0 │
└──────┴──────┴──────┘
In [58]: df.with_columns((
...: arr[0] - pl.col('x'),
...: arr[1] - pl.col('y'),
...: arr[2] - pl.col('z'),
...: ))
Out[58]:
shape: (3, 3)
┌──────┬──────┬──────┐
│ x ┆ y ┆ z │
│ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 │
╞══════╪══════╪══════╡
│ 1.0 ┆ 1.0 ┆ 1.0 │
│ 0.0 ┆ 0.0 ┆ 0.0 │
│ -1.0 ┆ -1.0 ┆ -1.0 │
└──────┴──────┴──────┘
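As an aside, when there is one array entry per column, the per-column expressions can also be generated in a loop rather than indexed by hand (a sketch, assuming the column order matches the array order):
df.with_columns([
    # build one subtraction expression per (column, value) pair
    (pl.col(name) - value).alias(name)
    for name, value in zip(["x", "y", "z"], arr)
])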

Find value of column based on another column condition (max) in polars for many columns

If I have this dataframe:
pl.DataFrame(dict(x=[0, 1, 2, 3], y=[5, 2, 3, 3],z=[4,7,8,2]))
shape: (4, 3)
┌─────┬─────┬─────┐
│ x ┆ y ┆ z │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 0 ┆ 5 ┆ 4 │
│ 1 ┆ 2 ┆ 7 │
│ 2 ┆ 3 ┆ 8 │
│ 3 ┆ 3 ┆ 2 │
└─────┴─────┴─────┘
and I want to find the value in x where y is max, then again find the value in x where z is max, and repeat for hundreds more columns so that I end up with something like:
shape: (2, 2)
┌────────┬─────────┐
│ column ┆ x_value │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════╪═════════╡
│ y ┆ 0 │
│ z ┆ 2 │
└────────┴─────────┘
or
shape: (1, 2)
┌─────┬─────┐
│ y ┆ z │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 0 ┆ 2 │
└─────┴─────┘
What is the best polars way to do that?
You could:
pl.exclude("x") to select all columns minus x
.arg_max() to get the max index of each selected column.
pass the indexes to pl.col("x").take() to get the x value at each index.
pl.concat_list() to create a list of all the values.
>>> df.select(pl.concat_list(pl.col("x").take(pl.exclude("x").arg_max())))
shape: (1, 1)
┌───────────┐
│ x │
│ --- │
│ list[i64] │
╞═══════════╡
│ [0, 2] │
└───────────┘
To add in the column names you could:
# Feels like this could be simplified?
columns = df.columns
columns.remove("x")
columns = pl.Series(columns).alias("column")
df.select(
pl.concat_list(
pl.col("x").take(pl.exclude("x").arg_max())
).flatten()
).with_columns(columns)
shape: (2, 2)
┌─────┬────────┐
│ x | column │
│ --- | --- │
│ i64 | str │
╞═════╪════════╡
│ 0 | y │
├─────┼────────┤
│ 2 | z │
└─────┴────────┘
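A possibly more compact route to the column/x_value layout (a sketch, assuming melt and sort_by are available as in current releases): reshape the value columns to long form, then per column take the x that sorts last by value, i.e. the x at the maximum:
(
    df.melt(id_vars="x", variable_name="column")
    .groupby("column", maintain_order=True)
    .agg(
        # x at the maximum of each melted column; note that ties resolve
        # to the last maximum here, whereas arg_max picks the first.
        pl.col("x").sort_by("value").last().alias("x_value"))
)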
Possible approach for the other result:
(df.with_columns(pl.exclude("x").arg_max())
.select([
pl.col("x").take(col).first().alias(col)
for col in df.columns if col != "x"
])
)
shape: (1, 2)
┌─────┬─────┐
│ y | z │
│ --- | --- │
│ i64 | i64 │
╞═════╪═════╡
│ 0 | 2 │
└─────┴─────┘

Split value between polars DataFrame rows

I would like to find a way to distribute the values of a DataFrame among the rows of another DataFrame using polars (without iterating through the rows).
I have a dataframe with the amounts to be distributed:
| Name | Amount |
|------|--------|
| A    | 100    |
| B    | 300    |
| C    | 250    |
And a target DataFrame to which I want to append the distributed values (in a new column) using the common "Name" column.
| Name | Item | Price |
|------|------|-------|
| A    | x1   | 40    |
| A    | x2   | 60    |
| B    | y1   | 50    |
| B    | y2   | 150   |
| B    | y3   | 200   |
| C    | z1   | 400   |
The rows in the target are sorted and the assigned amount should match the price in each row (as long as there is enough amount remaining).
So the result in this case should look like this:
| Name | Item | Price | Assigned amount |
|------|------|-------|-----------------|
| A    | x1   | 40    | 40              |
| A    | x2   | 60    | 60              |
| B    | y1   | 50    | 50              |
| B    | y2   | 150   | 150             |
| B    | y3   | 200   | 100             |
| C    | z1   | 400   | 250             |
In this example, we can distribute the amounts for A, so that they are the same as the price. However, for the last item of B and for C we write the remaining amounts as the prices are too high.
Is there an efficient way to do this?
My initial solution was to calculate the cumulative sum of the Price in a new column in the target dataframe, then left join the source DataFrame and subtract the values of the cumulative sum. This would work if the amount is high enough, but for the last item of B and C I would get negative values and not the remaining amount.
Edit
Example dataframes:
import polars as pl
df1 = pl.DataFrame({"Name": ["A", "B", "C"], "Amount": [100, 300, 250]})
df2 = pl.DataFrame({"Name": ["A", "A", "B", "B", "B", "C"], "Item": ["x1", "x2", "y1", "y2", "y3", "z"], "Price": [40, 60, 50, 150, 200, 400]})
@jqurious, good answer. This might be slightly more succinct:
(
df2.join(df1, on="Name")
.with_columns(
pl.min([
pl.col('Price'),
pl.col('Amount') -
pl.col('Price').cumsum().shift_and_fill(1, 0).over('Name')
])
.clip_min(0)
.alias('assigned')
)
)
shape: (6, 5)
┌──────┬──────┬───────┬────────┬──────────┐
│ Name ┆ Item ┆ Price ┆ Amount ┆ assigned │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪═══════╪════════╪══════════╡
│ A ┆ x1 ┆ 40 ┆ 100 ┆ 40 │
│ A ┆ x2 ┆ 60 ┆ 100 ┆ 60 │
│ B ┆ y1 ┆ 50 ┆ 300 ┆ 50 │
│ B ┆ y2 ┆ 150 ┆ 300 ┆ 150 │
│ B ┆ y3 ┆ 200 ┆ 300 ┆ 100 │
│ C ┆ z ┆ 400 ┆ 250 ┆ 250 │
└──────┴──────┴───────┴────────┴──────────┘
You can take the minimum value of the Price or the Difference.
.clip_min(0) can be used to replace the negatives.
[Edit: See @ΩΠΟΚΕΚΡΥΜΜΕΝΟΣ's answer for a neater way to write this.]
(
df2
.join(df1, on="Name")
.with_columns(
cumsum = pl.col("Price").cumsum().over("Name"))
.with_columns(
assigned = pl.col("Amount") - (pl.col("cumsum") - pl.col("Price")))
.with_columns(
assigned = pl.min(["Price", "assigned"]).clip_min(0))
)
shape: (6, 6)
┌──────┬──────┬───────┬────────┬────────┬──────────┐
│ Name | Item | Price | Amount | cumsum | assigned │
│ --- | --- | --- | --- | --- | --- │
│ str | str | i64 | i64 | i64 | i64 │
╞══════╪══════╪═══════╪════════╪════════╪══════════╡
│ A | x1 | 40 | 100 | 40 | 40 │
│ A | x2 | 60 | 100 | 100 | 60 │
│ B | y1 | 50 | 300 | 50 | 50 │
│ B | y2 | 150 | 300 | 200 | 150 │
│ B | y3 | 200 | 300 | 400 | 100 │
│ C | z | 400 | 250 | 400 | 250 │
└──────┴──────┴───────┴────────┴────────┴──────────┘
This assumes the order of the df is the order of priority; if not, sort it first.
You first want to join your two dfs, then make a helper column that is the cumsum of Price less Price. I call that spent. It's more like a potential spend, because there's no guarantee it doesn't go over Amount.
Add two more helper columns: one for the difference between Amount and spent, which we'll call have1, as that's the amount we have. In the sample data this didn't come up, but we need to make sure this isn't less than 0, so we add another column that is just literally zero; we'll call it z.
Add another helper column, which will be the greater value between z and have1, and we'll call it have2.
Lastly, we'll determine the Assigned amount as the smaller value between have2 and Price.
df1.join(df2, on='Name') \
.with_columns((pl.col("Price").cumsum()-pl.col("Price")).over("Name").alias("spent")) \
.with_columns([(pl.col("Amount")-pl.col("spent")).alias("have1"), pl.lit(0).alias('z')]) \
.with_columns(pl.concat_list([pl.col('z'), pl.col('have1')]).arr.max().alias('have2')) \
.with_columns(pl.concat_list([pl.col('have2'), pl.col("Price")]).arr.min().alias("Assigned amount")) \
.select(["Name", "Item","Price","Assigned amount"])
You can reduce this to a single nested expression like this...
df1.join(df2, on='Name') \
.select(["Name", "Item","Price",
pl.concat_list([
pl.concat_list([
pl.repeat(0, pl.count()),
pl.col("Amount")-(pl.col("Price").cumsum()-pl.col("Price")).over("Name")
]).arr.max(),
pl.col("Price")
]).arr.min().alias("Assigned amount")
])
shape: (6, 4)
┌──────┬──────┬───────┬─────────────────┐
│ Name ┆ Item ┆ Price ┆ Assigned amount │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 ┆ i64 │
╞══════╪══════╪═══════╪═════════════════╡
│ A ┆ x1 ┆ 40 ┆ 40 │
│ A ┆ x2 ┆ 60 ┆ 60 │
│ B ┆ y1 ┆ 50 ┆ 50 │
│ B ┆ y2 ┆ 150 ┆ 150 │
│ B ┆ y3 ┆ 200 ┆ 100 │
│ C ┆ z ┆ 400 ┆ 250 │
└──────┴──────┴───────┴─────────────────┘
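As a quick sanity check (a sketch, assuming the final frame above is bound to out), the assigned totals per Name should never exceed the available Amount:
check = (
    out.groupby("Name")
    .agg(pl.col("Assigned amount").sum().alias("assigned_total"))
    .join(df1, on="Name")
)
# every per-Name total must be within the available Amount
assert (check["assigned_total"] <= check["Amount"]).all()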

How to cumulatively sum the first elements when using the .over() function for a specific column in Python Polars

I was wondering if someone could please enlighten me.
I am trying to cumulatively sum pty_nber over/groupby a specific column (Declaration).
My original idea was to use something along:
dataset.filter(pl.col("pty_nber").first().over("Declaration").cumsum() < 30 )
But unfortunately, it does not take the .over() into account and just cumulatively sums all the rows. So rather than summing 4 + 7 + 8 etc., it sums 4 + 4 + 4 + 4 + 7 ...
The goal is to show at least a few complete declarations and not cut in the middle.
Thanks in advance :)
As an example please see below:
--> and filter out rows where CUMSUM is over a certain threshold such as 30, so that I make sure no single declaration is left incomplete (i.e. missing some of the pty_nber rows for that specific declaration)
So I am not sure whether you want just the yellow-marked entries, or to keep the whole dataframe and only remove the rows where CUMSUM is greater than 30, so I implemented it in two ways:
Data
I simplified your example data a little bit:
import polars as pl
df = pl.DataFrame(
{
"declaration": [2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4],
"item": [1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4],
"pty_nber": [12, 12, 12, 9, 9, 9, 9, 16, 16, 16, 16],
}
)
First:
df.groupby("declaration", maintain_order=True).first().filter(
pl.col("pty_nber").cumsum() < 30
)
shape: (2, 3)
┌─────────────┬──────┬──────────┐
│ declaration ┆ item ┆ pty_nber │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════════════╪══════╪══════════╡
│ 2 ┆ 1 ┆ 12 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 1 ┆ 9 │
└─────────────┴──────┴──────────┘
Second:
For the second, I use the help of the item column, since I assume the first element of each group is the one where item equals 1.
df.filter(
pl.when(pl.col("item") == 1).then(pl.col("pty_nber")).otherwise(0).cumsum() < 30
)
shape: (7, 3)
┌─────────────┬──────┬──────────┐
│ declaration ┆ item ┆ pty_nber │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════════════╪══════╪══════════╡
│ 2 ┆ 1 ┆ 12 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2 ┆ 12 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 3 ┆ 12 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 1 ┆ 9 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 2 ┆ 9 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 3 ┆ 9 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ 9 │
└─────────────┴──────┴──────────┘
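A variant of the second approach that does not rely on the item column (a sketch, assuming the frame is already sorted by declaration): flag the first row of each declaration by comparing it with the previous row, cumulatively sum pty_nber only on those rows, and filter on that running total.
# True on the first row of every declaration block (the very first row
# compares against null, so fill_null(True) marks it as a block start).
is_first = (pl.col("declaration") != pl.col("declaration").shift()).fill_null(True)
df.filter(
    pl.when(is_first).then(pl.col("pty_nber")).otherwise(0).cumsum() < 30
)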
