I have a large Polars dataframe that I'd like to split into n number of dataframes given the size. Like take dataframe and split it into 2 or 3 or 5 dataframes.
There are several observations that will show up for each column and would like to choose splitting into a chosen number of dataframes. A simple example is like the following where I am splitting on a specific id, but would like to have similar behave, but more like split into 2 approximately even dataframes since the full example has a large number of identifiers.
df = pl.DataFrame({'Identifier': [1234,1234, 2345,2345],
'DateColumn': ['2022-02-13','2022-02-14', '2022-02-13',
'2022-02-14']
})
df2 = df.with_columns(
[
pl.col('DateColumn').str.strptime(pl.Date).cast(pl.Date)
]
)
print(df)
┌────────────┬────────────┐
│ Identifier ┆ DateColumn │
│ --- ┆ --- │
│ i64 ┆ str │
╞════════════╪════════════╡
│ 1234 ┆ 2022-02-13 │
│ 1234 ┆ 2022-02-14 │
│ 2345 ┆ 2022-02-13 │
│ 2345 ┆ 2022-02-14 │
└────────────┴────────────┘
df1 = df.filter(
pl.col('Identifier')==1234
)
df2 = df.filter(
pl.col('Identifier')==2345
)
print(df1)
shape: (2, 2)
┌────────────┬────────────┐
│ Identifier ┆ DateColumn │
│ --- ┆ --- │
│ i64 ┆ str │
╞════════════╪════════════╡
│ 1234 ┆ 2022-02-13 │
│ 1234 ┆ 2022-02-14 │
└────────────┴────────────┘
print(df2)
┌────────────┬────────────┐
│ Identifier ┆ DateColumn │
│ --- ┆ --- │
│ i64 ┆ str │
╞════════════╪════════════╡
│ 2345 ┆ 2022-02-13 │
│ 2345 ┆ 2022-02-14 │
└────────────┴────────────┘
If you want to divide your DataFrame by let's say your identifier, the best way to do so is use the partition_by method.
df = pl.DataFrame({
"foo": ["A", "A", "B", "B", "C"],
"N": [1, 2, 2, 4, 2],
"bar": ["k", "l", "m", "m", "l"],
})
df.partition_by(groups="foo", maintain_order=True)
[shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ N ┆ bar │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ A ┆ 1 ┆ k │
│ A ┆ 2 ┆ l │
└─────┴─────┴─────┘,
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ N ┆ bar │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ B ┆ 2 ┆ m │
│ B ┆ 4 ┆ m │
└─────┴─────┴─────┘,
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ N ┆ bar │
│ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ C ┆ 2 ┆ l │
└─────┴─────┴─────┘]
https://pola-rs.github.io/polars/py-polars/html/reference/dataframe/api/polars.DataFrame.partition_by.html
This automatically divides the DataFrame by values in a column.
Related
I am currently trying to replicate ngroup behaviour in polars to get consecutive group indexes (the dataframe will be grouped over two columns). For the R crowd, this would be achieved in the dplyr world with dplyr::group_indices or the newer dplyr::cur_group_id.
As shown in the repro, I've tried couple avenues without much succcess, both approaches miss group sequentiality and merely return row counts by group.
Quick repro:
import polars as pl
import pandas as pd
df = pd.DataFrame(
{
"id": ["a", "a", "a", "a", "b", "b", "b", "b"],
"cat": [1, 1, 2, 2, 1, 1, 2, 2],
}
)
df_pl = pl.from_pandas(df)
print(df.groupby(["id", "cat"]).ngroup())
# This is the desired behaviour
# 0 0
# 1 0
# 2 1
# 3 1
# 4 2
# 5 2
# 6 3
# 7 3
print(df_pl.select(pl.count().over(["id", "cat"])))
# This is only counting observation by group
# ┌───────┐
# │ count │
# │ --- │
# │ u32 │
# ╞═══════╡
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# │ 2 │
# └───────┘
# shape: (4, 3)
print(df_pl.groupby(["id", "cat"]).agg([pl.count().alias("test")]))
# shape: (4, 3)
# ┌─────┬─────┬──────┐
# │ id ┆ cat ┆ test │
# │ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ u32 │
# ╞═════╪═════╪══════╡
# │ a ┆ 1 ┆ 2 │
# │ a ┆ 2 ┆ 2 │
# │ b ┆ 1 ┆ 2 │
# │ b ┆ 2 ┆ 2 │
# └─────┴─────┴──────┘
Edit
As #jqurious points out we can use rank for this:
(df.with_row_count("idx")
.select(
pl.first("idx").over(["id", "cat"]).rank("dense") - 1)
)
shape: (8, 1)
┌─────┐
│ idx │
│ --- │
│ u32 │
╞═════╡
│ 0 │
│ 0 │
│ 1 │
│ 1 │
│ 2 │
│ 2 │
│ 3 │
│ 3 │
└─────┘
The following might be more clear:
df = pl.DataFrame(
{
"id": ["a", "a", "a", "a", "b", "b", "b", "b"],
"cat": [1, 1, 2, 2, 1, 1, 2, 2],
}
)
(
# Add row count to each line to create an index.
df.with_row_count("idx")
# Group on id and cat column.
.groupby(
["id", "cat"],
maintain_order=True,
)
.agg(
# Create a list of all index positions per group.
pl.col("idx")
)
# Add a new row count for each group.
.with_row_count("ngroup")
# Expand idx list column to separate rows.
.explode("idx")
# Reorder columns.
.select(["idx", "ngroup", "id", "cat"])
# Optionally sort by original order.
.sort("idx")
)
┌─────┬────────┬─────┬─────┐
│ idx ┆ ngroup ┆ id ┆ cat │
│ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ u32 ┆ str ┆ i64 │
╞═════╪════════╪═════╪═════╡
│ 0 ┆ 0 ┆ a ┆ 1 │
│ 1 ┆ 0 ┆ a ┆ 1 │
│ 2 ┆ 1 ┆ a ┆ 2 │
│ 3 ┆ 1 ┆ a ┆ 2 │
│ 4 ┆ 2 ┆ b ┆ 1 │
│ 5 ┆ 2 ┆ b ┆ 1 │
│ 6 ┆ 3 ┆ b ┆ 2 │
│ 7 ┆ 3 ┆ b ┆ 2 │
└─────┴────────┴─────┴─────┘
Currently, from my expermentation, join_asof does not will cause error if there are any None(null) in either of the "by" column. Is there any way I can still use join_asof while keeping any None(null) in the left dataframe?
For example, I have the following dataframes:
df = pl.DataFrame(
{"a": [1, 2, 3, 4, 5, None, 8], "b": [2, 3, 4, 5, 6, 7, None], "time": [1, 2, 3, 4, 5, 6, 7]}
)
df2 = pl.DataFrame({"a": [1, 3, 4, None], "b": [2, 4, 5, 8], "c": [2, 3, 4, 5], "time": [0, 2, 4, 6]})
If I just run the code below, there will be an error:
df.join_asof(df2, on="time", by=["a", "b"])
thread '<unnamed>' panicked at 'called `Result::unwrap()` on an `Err` value: ComputeError(Borrowed("cannot take slice"))', /home/runner/work/polars/polars/polars/polars-core/src/frame/asof_join/groups.rs:253:35
But, the following code works well:
df.drop_nulls(["a", "b"]).join_asof(df2.drop_nulls(["a", "b"]), on="time", by=["a", "b"])
shape: (5, 4)
┌─────┬─────┬──────┬──────┐
│ a ┆ b ┆ time ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪══════╪══════╡
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 2 ┆ 3 ┆ 2 ┆ null │
│ 3 ┆ 4 ┆ 3 ┆ 3 │
│ 4 ┆ 5 ┆ 4 ┆ 4 │
│ 5 ┆ 6 ┆ 5 ┆ null │
└─────┴─────┴──────┴──────┘
My question is how can get the following result, basically the result above with rows (where a is null in the left dataframe - df in this case) appended?
┌─────┬─────┬──────┬──────┐
│ a ┆ b ┆ time ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪══════╪══════╡
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 2 ┆ 3 ┆ 2 ┆ null │
│ 3 ┆ 4 ┆ 3 ┆ 3 │
│ 4 ┆ 5 ┆ 4 ┆ 4 │
│ 5 ┆ 6 ┆ 5 ┆ null │
│ null┆ 7 ┆ 6 ┆ null │
│ 8 ┆ null┆ 7 ┆ null │
└─────┴─────┴──────┴──────┘
Thanks!
One easy solution is to use concat with how='diagonal'. For example:
pl.concat(
[
df.drop_nulls(["a", "b"]).join_asof(df2.drop_nulls(["a", "b"]), on="time", by=["a", "b"]),
df.filter(pl.col('a').is_null() | pl.col('b').is_null()),
],
how='diagonal'
)
shape: (7, 4)
┌──────┬──────┬──────┬──────┐
│ a ┆ b ┆ time ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪══════╡
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 2 ┆ 3 ┆ 2 ┆ null │
│ 3 ┆ 4 ┆ 3 ┆ 3 │
│ 4 ┆ 5 ┆ 4 ┆ 4 │
│ 5 ┆ 6 ┆ 5 ┆ null │
│ null ┆ 7 ┆ 6 ┆ null │
│ 8 ┆ null ┆ 7 ┆ null │
└──────┴──────┴──────┴──────┘
Edit:
diagonal pl.concat seems to be pretty slow if the dataframes are large?
Is it?
import time
import polars as pl
mult = 100_000_000
df = pl.DataFrame(
{
"a": [1, 2, 3, 4, 5, None, 8] * mult,
"b": [2, 3, 4, 5, 6, 7, None] * mult,
"time": [1, 2, 3, 4, 5, 6, 7] * mult,
}
).sort("time")
df2 = pl.DataFrame(
{
"a": [1, 3, 4, None] * mult,
"b": [2, 4, 5, 8] * mult,
"c": [2, 3, 4, 5] * mult,
"time": [0, 2, 4, 6] * mult,
}
).sort("time")
not_null_df = df.drop_nulls(["a", "b"]).join_asof(
df2.drop_nulls(["a", "b"]), on="time", by=["a", "b"]
)
is_null_df = df.filter(pl.col("a").is_null() | pl.col("b").is_null())
not_null_df
is_null_df
start = time.perf_counter()
pl.concat([not_null_df, is_null_df], how="diagonal")
print(time.perf_counter() - start)
>>> not_null_df
shape: (500000000, 4)
┌─────┬─────┬──────┬──────┐
│ a ┆ b ┆ time ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪══════╪══════╡
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ ... ┆ ... ┆ ... ┆ ... │
│ 5 ┆ 6 ┆ 5 ┆ null │
│ 5 ┆ 6 ┆ 5 ┆ null │
│ 5 ┆ 6 ┆ 5 ┆ null │
│ 5 ┆ 6 ┆ 5 ┆ null │
└─────┴─────┴──────┴──────┘
>>> is_null_df
shape: (200000000, 3)
┌──────┬──────┬──────┐
│ a ┆ b ┆ time │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╡
│ null ┆ 7 ┆ 6 │
│ null ┆ 7 ┆ 6 │
│ null ┆ 7 ┆ 6 │
│ null ┆ 7 ┆ 6 │
│ ... ┆ ... ┆ ... │
│ 8 ┆ null ┆ 7 │
│ 8 ┆ null ┆ 7 │
│ 8 ┆ null ┆ 7 │
│ 8 ┆ null ┆ 7 │
└──────┴──────┴──────┘
>>> pl.concat([not_null_df, is_null_df], how="diagonal")
shape: (700000000, 4)
┌─────┬──────┬──────┬──────┐
│ a ┆ b ┆ time ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪══════╪══════╪══════╡
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ 1 ┆ 2 ┆ 1 ┆ 2 │
│ ... ┆ ... ┆ ... ┆ ... │
│ 8 ┆ null ┆ 7 ┆ null │
│ 8 ┆ null ┆ 7 ┆ null │
│ 8 ┆ null ┆ 7 ┆ null │
│ 8 ┆ null ┆ 7 ┆ null │
└─────┴──────┴──────┴──────┘
>>> print(time.perf_counter() - start)
6.087414998997701
6 seconds to concatenate datasets of 500,000,000 records and 200,000,000 records
I have a polars dataframe as follows:
df = pl.DataFrame(
dict(
day=[1, 1, 1, 3, 3, 3, 5, 5, 8, 8, 9, 9, 9],
value=[1, 2, 2, 3, 5, 2, 1, 2, 7, 3, 5, 3, 4],
)
)
I want to incrementally rotate the values in column 'day'? By incremental rotation, I mean for each value, change it to its next larger value exists in the column, and if the value is the largest, then change it to null/None.
Basically, the result I expect should be the following:
pl.DataFrame(
dict(
day=[3, 3, 3, 5, 5, 5, 8, 8, 9, 9, None, None, None],
value=[1, 2, 2, 3, 5, 2, 1, 2, 7, 3, 5, 3, 4],
)
)
Is there some particular polars-python idiomatic way to achieve this?
If day is sorted - you could group together - shift - then explode back?
(df.groupby("day", maintain_order=True)
.agg_list()
.with_columns(pl.col("day").shift(-1))
.explode(pl.exclude("day")))
shape: (13, 2)
┌──────┬───────┐
│ day | value │
│ --- | --- │
│ i64 | i64 │
╞══════╪═══════╡
│ 3 | 1 │
│ 3 | 2 │
│ 3 | 2 │
│ 5 | 3 │
│ 5 | 5 │
│ 5 | 2 │
│ 8 | 1 │
│ 8 | 2 │
│ 9 | 7 │
│ 9 | 3 │
│ null | 5 │
│ null | 3 │
│ null | 4 │
└──────┴───────┘
Perhaps another approach is to .rank() the column.
.search_sorted() for rank + 1 could find the positions of the next "group".
The max values could be nulled out then passed to .take() to get the new values.
(df.with_columns(
pl.col("day").rank("dense")
.cast(pl.Int64)
.alias("rank"))
.with_columns(
pl.col("rank")
.search_sorted(pl.col("rank") + 1)
.alias("idx"))
.with_columns(
pl.when(pl.col("idx") != pl.col("idx").max())
.then(pl.col("idx"))
.alias("idx"))
.with_columns(
pl.col("day").take(pl.col("idx"))
.alias("new"))
)
shape: (13, 5)
┌─────┬───────┬──────┬──────┬──────┐
│ day | value | rank | idx | new │
│ --- | --- | --- | --- | --- │
│ i64 | i64 | i64 | u32 | i64 │
╞═════╪═══════╪══════╪══════╪══════╡
│ 1 | 1 | 1 | 3 | 3 │
│ 1 | 2 | 1 | 3 | 3 │
│ 1 | 2 | 1 | 3 | 3 │
│ 3 | 3 | 2 | 6 | 5 │
│ 3 | 5 | 2 | 6 | 5 │
│ 3 | 2 | 2 | 6 | 5 │
│ 5 | 1 | 3 | 8 | 8 │
│ 5 | 2 | 3 | 8 | 8 │
│ 8 | 7 | 4 | 10 | 9 │
│ 8 | 3 | 4 | 10 | 9 │
│ 9 | 5 | 5 | null | null │
│ 9 | 3 | 5 | null | null │
│ 9 | 4 | 5 | null | null │
└─────┴───────┴──────┴──────┴──────┘
Feels like I'm missing an obvious simpler approach here..
#jqurious, what I'd recommend for remapping values is a join. Joins are heavily optimized and scale very well, especially on machines with a good number of cores.
As an example, let's benchmark some solutions.
First, some data
Let's use enough data to avoid spurious results from "microbenchmarking" using tiny datasets. (I see this all too often - tiny datasets with benchmark results down to a few microseconds or milliseconds.)
On my 32-core system with 512 GB of RAM, that means expanding the dataset to one billion records. (Choose a different value below as appropriate for your computing platform.)
import polars as pl
import numpy as np
import time
rng = np.random.default_rng(1)
nbr_rows = 1_000_000_000
df = pl.DataFrame(
dict(
day=rng.integers(1, 1_000_000, nbr_rows),
value=rng.integers(1, 1_000_000, nbr_rows),
)
).with_row_count()
df
shape: (1000000000, 3)
┌───────────┬────────┬────────┐
│ row_nr ┆ day ┆ value │
│ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 │
╞═══════════╪════════╪════════╡
│ 0 ┆ 473189 ┆ 747152 │
│ 1 ┆ 511822 ┆ 298575 │
│ 2 ┆ 755167 ┆ 868027 │
│ 3 ┆ 950463 ┆ 289295 │
│ ... ┆ ... ┆ ... │
│ 999999996 ┆ 828237 ┆ 503917 │
│ 999999997 ┆ 909996 ┆ 447681 │
│ 999999998 ┆ 309104 ┆ 588174 │
│ 999999999 ┆ 485525 ┆ 198567 │
└───────────┴────────┴────────┘
Assumption: Not sorted
Let's suppose that we cannot assume that the data is sorted by day. (We'll have to adapt the solutions somewhat.)
Join
Here's the results using a join. If you watch your CPU usage, for example using top in Linux, you'll see that the algorithm is heavily multi-threaded. It spends the majority of its time spread across all cores of your system.
start = time.perf_counter()
(
df
.join(
df
.select(pl.col('day').unique().sort())
.with_columns(
pl.col('day').shift(-1).alias('new_day')
),
how='inner',
on='day',
)
)
print(time.perf_counter() - start)
shape: (1000000000, 4)
┌───────────┬────────┬────────┬─────────┐
│ row_nr ┆ day ┆ value ┆ new_day │
│ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 ┆ i64 │
╞═══════════╪════════╪════════╪═════════╡
│ 0 ┆ 473189 ┆ 747152 ┆ 473190 │
│ 1 ┆ 511822 ┆ 298575 ┆ 511823 │
│ 2 ┆ 755167 ┆ 868027 ┆ 755168 │
│ 3 ┆ 950463 ┆ 289295 ┆ 950464 │
│ ... ┆ ... ┆ ... ┆ ... │
│ 999999996 ┆ 828237 ┆ 503917 ┆ 828238 │
│ 999999997 ┆ 909996 ┆ 447681 ┆ 909997 │
│ 999999998 ┆ 309104 ┆ 588174 ┆ 309105 │
│ 999999999 ┆ 485525 ┆ 198567 ┆ 485526 │
└───────────┴────────┴────────┴─────────┘
>>> print(time.perf_counter() - start)
20.85321443199973
groupby-explode
Now let's try the groupby-explode solution. This algorithm will spend a good share of time in single-threaded mode.
I've had to add a sort after the grouping step because the algorithm assumes sorted data in the steps after it.
start = time.perf_counter()
(
df
.groupby("day", maintain_order=False)
.agg_list()
.sort(['day'])
.with_columns(pl.col("day").shift(-1))
.explode(pl.exclude("day"))
)
print(time.perf_counter() - start)
shape: (1000000000, 3)
┌──────┬───────────┬────────┐
│ day ┆ row_nr ┆ value │
│ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ i64 │
╞══════╪═══════════╪════════╡
│ 2 ┆ 197731 ┆ 4093 │
│ 2 ┆ 3154732 ┆ 433246 │
│ 2 ┆ 4825468 ┆ 436316 │
│ 2 ┆ 4927362 ┆ 83493 │
│ ... ┆ ... ┆ ... │
│ null ┆ 993596728 ┆ 25604 │
│ null ┆ 995160321 ┆ 575415 │
│ null ┆ 996690852 ┆ 490825 │
│ null ┆ 999391650 ┆ 92113 │
└──────┴───────────┴────────┘
>>> print(time.perf_counter() - start)
54.04602192300081
rank
Now, the rank method. This algorithm will spend nearly all its time in single-threaded mode.
I've also had to add a sort here, as the ranks are assumed to be sorted in the search_sorted step.
start = time.perf_counter()
(
df
.sort(['day'])
.with_columns(
pl.col("day").rank("dense").cast(pl.Int64).alias("rank")
)
.with_columns(
pl.col("rank").search_sorted(pl.col("rank") + 1).alias("idx")
)
.with_columns(
pl.when(pl.col("idx") != pl.col("idx").max())
.then(pl.col("idx"))
.alias("idx")
)
.with_columns(
pl.col("day").take(pl.col("idx")).alias("new")
)
)
print(time.perf_counter() - start)
shape: (1000000000, 6)
┌───────────┬────────┬────────┬────────┬──────┬──────┐
│ row_nr ┆ day ┆ value ┆ rank ┆ idx ┆ new │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 ┆ i64 ┆ u32 ┆ i64 │
╞═══════════╪════════╪════════╪════════╪══════╪══════╡
│ 197731 ┆ 1 ┆ 4093 ┆ 1 ┆ 1907 ┆ 2 │
│ 3154732 ┆ 1 ┆ 433246 ┆ 1 ┆ 1907 ┆ 2 │
│ 4825468 ┆ 1 ┆ 436316 ┆ 1 ┆ 1907 ┆ 2 │
│ 4927362 ┆ 1 ┆ 83493 ┆ 1 ┆ 1907 ┆ 2 │
│ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... │
│ 993596728 ┆ 999999 ┆ 25604 ┆ 999999 ┆ null ┆ null │
│ 995160321 ┆ 999999 ┆ 575415 ┆ 999999 ┆ null ┆ null │
│ 996690852 ┆ 999999 ┆ 490825 ┆ 999999 ┆ null ┆ null │
│ 999391650 ┆ 999999 ┆ 92113 ┆ 999999 ┆ null ┆ null │
└───────────┴────────┴────────┴────────┴──────┴──────┘
>>> print(time.perf_counter() - start)
98.63108555600047
Assumption: Sorted by day
If we can assume that our data is already sorted by day, we can cut out unnecessary steps in our algorithms - as well as see some decent increases in speed.
We'll sort the data first and re-run our algorithms. Note that sorting sets the sorted flag on the day column, which allows algorithms to take shortcuts to increase speed. (If not sorting manually, then the set_sorted method can be used tell Polars that the column is pre-sorted.)
df = df.sort(['day'])
df
shape: (1000000000, 3)
┌───────────┬────────┬────────┐
│ row_nr ┆ day ┆ value │
│ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 │
╞═══════════╪════════╪════════╡
│ 197731 ┆ 1 ┆ 4093 │
│ 3154732 ┆ 1 ┆ 433246 │
│ 4825468 ┆ 1 ┆ 436316 │
│ 4927362 ┆ 1 ┆ 83493 │
│ ... ┆ ... ┆ ... │
│ 993596728 ┆ 999999 ┆ 25604 │
│ 995160321 ┆ 999999 ┆ 575415 │
│ 996690852 ┆ 999999 ┆ 490825 │
│ 999391650 ┆ 999999 ┆ 92113 │
└───────────┴────────┴────────┘
Join
The code employing a join needs no changes; however, it does see an incredible speedup.
start = time.perf_counter()
(
df
.join(
df
.select(pl.col('day').unique().sort())
.with_columns(
pl.col('day').shift(-1).alias('new_day')
),
how='inner',
on='day',
)
)
print(time.perf_counter() - start)
shape: (1000000000, 4)
┌───────────┬────────┬────────┬─────────┐
│ row_nr ┆ day ┆ value ┆ new_day │
│ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 ┆ i64 │
╞═══════════╪════════╪════════╪═════════╡
│ 197731 ┆ 1 ┆ 4093 ┆ 2 │
│ 3154732 ┆ 1 ┆ 433246 ┆ 2 │
│ 4825468 ┆ 1 ┆ 436316 ┆ 2 │
│ 4927362 ┆ 1 ┆ 83493 ┆ 2 │
│ ... ┆ ... ┆ ... ┆ ... │
│ 993596728 ┆ 999999 ┆ 25604 ┆ null │
│ 995160321 ┆ 999999 ┆ 575415 ┆ null │
│ 996690852 ┆ 999999 ┆ 490825 ┆ null │
│ 999391650 ┆ 999999 ┆ 92113 ┆ null │
└───────────┴────────┴────────┴─────────┘
>>> print(time.perf_counter() - start)
8.71159654099938
Note the same exact join algorithm now finishes in only 8.7 seconds rather than 20.9 seconds, largely due to the data being pre-sorted, and the sorted flag being set on day.
groupby-explode
We'll eliminate the superfluous sort within the algorithm, and re-run it.
start = time.perf_counter()
(
df
.groupby("day", maintain_order=True)
.agg_list()
.with_columns(pl.col("day").shift(-1))
.explode(pl.exclude("day"))
)
print(time.perf_counter() - start)
shape: (1000000000, 3)
┌──────┬───────────┬────────┐
│ day ┆ row_nr ┆ value │
│ --- ┆ --- ┆ --- │
│ i64 ┆ u32 ┆ i64 │
╞══════╪═══════════╪════════╡
│ 2 ┆ 197731 ┆ 4093 │
│ 2 ┆ 3154732 ┆ 433246 │
│ 2 ┆ 4825468 ┆ 436316 │
│ 2 ┆ 4927362 ┆ 83493 │
│ ... ┆ ... ┆ ... │
│ null ┆ 993596728 ┆ 25604 │
│ null ┆ 995160321 ┆ 575415 │
│ null ┆ 996690852 ┆ 490825 │
│ null ┆ 999391650 ┆ 92113 │
└──────┴───────────┴────────┘
>>> print(time.perf_counter() - start)
8.249637401000655
Note how this algorithm takes slightly less time than the join algorithm, all due to the assumption of day being pre-sorted.
rank
Again, we'll now eliminated the superfluous sort and re-run the algorithm.
start = time.perf_counter()
(
df
.with_columns(
pl.col("day").rank("dense").cast(pl.Int64).alias("rank")
)
.with_columns(
pl.col("rank").search_sorted(pl.col("rank") + 1).alias("idx")
)
.with_columns(
pl.when(pl.col("idx") != pl.col("idx").max())
.then(pl.col("idx"))
.alias("idx")
)
.with_columns(
pl.col("day").take(pl.col("idx")).alias("new")
)
)
print(time.perf_counter() - start)
shape: (1000000000, 6)
┌───────────┬────────┬────────┬────────┬──────┬──────┐
│ row_nr ┆ day ┆ value ┆ rank ┆ idx ┆ new │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ i64 ┆ i64 ┆ u32 ┆ i64 │
╞═══════════╪════════╪════════╪════════╪══════╪══════╡
│ 197731 ┆ 1 ┆ 4093 ┆ 1 ┆ 1907 ┆ 2 │
│ 3154732 ┆ 1 ┆ 433246 ┆ 1 ┆ 1907 ┆ 2 │
│ 4825468 ┆ 1 ┆ 436316 ┆ 1 ┆ 1907 ┆ 2 │
│ 4927362 ┆ 1 ┆ 83493 ┆ 1 ┆ 1907 ┆ 2 │
│ ... ┆ ... ┆ ... ┆ ... ┆ ... ┆ ... │
│ 993596728 ┆ 999999 ┆ 25604 ┆ 999999 ┆ null ┆ null │
│ 995160321 ┆ 999999 ┆ 575415 ┆ 999999 ┆ null ┆ null │
│ 996690852 ┆ 999999 ┆ 490825 ┆ 999999 ┆ null ┆ null │
│ 999391650 ┆ 999999 ┆ 92113 ┆ 999999 ┆ null ┆ null │
└───────────┴────────┴────────┴────────┴──────┴──────┘
>>> print(time.perf_counter() - start)
48.90440067800046
Although this algorithm now takes roughly half the time, it's not quite as fast as the join or groupby-explode algorithms.
Of course, wall-clock performance is not the end-all-be-all. But when problems scale up, joins are particularly good tools, even when we cannot make assumptions regarding the sorted-ness of our data.
If I have this dataframe:
pl.DataFrame(dict(x=[0, 1, 2, 3], y=[5, 2, 3, 3],z=[4,7,8,2]))
shape: (4, 3)
┌─────┬─────┬─────┐
│ x ┆ y ┆ z │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 0 ┆ 5 ┆ 4 │
│ 1 ┆ 2 ┆ 7 │
│ 2 ┆ 3 ┆ 8 │
│ 3 ┆ 3 ┆ 2 │
└─────┴─────┴─────┘
and I want to find the value in x where y is max, then again find the value in x where z is max, and repeat for hundreds more columns so that I end up with something like:
shape: (2, 2)
┌────────┬─────────┐
│ column ┆ x_value │
│ --- ┆ --- │
│ str ┆ i64 │
╞════════╪═════════╡
│ y ┆ 0 │
│ z ┆ 2 │
└────────┴─────────┘
or
shape: (1, 2)
┌─────┬─────┐
│ y ┆ z │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 0 ┆ 2 │
└─────┴─────┘
What is the best polars way to do that?
You could:
pl.exclude("x") to select all columns minus x
.arg_max() to get the max index of each selected column.
pass the indexes to pl.col("x").take() to get the x value at each index.
pl.concat_list() to create a list of all the values.
>>> df.select(pl.concat_list(pl.col("x").take(pl.exclude("x").arg_max())))
shape: (1, 1)
┌───────────┐
│ x │
│ --- │
│ list[i64] │
╞═══════════╡
│ [0, 2] │
└───────────┘
To add in the column names you could:
# Feels like this could be simplified?
columns = df.columns
columns.remove("x")
columns = pl.Series(columns).alias("column")
df.select(
pl.concat_list(
pl.col("x").take(pl.exclude("x").arg_max())
).flatten()
).with_columns(columns)
shape: (2, 2)
┌─────┬────────┐
│ x | column │
│ --- | --- │
│ i64 | str │
╞═════╪════════╡
│ 0 | y │
├─────┼────────┤
│ 2 | z │
└─────┴────────┘
Possible approach for the other result:
(df.with_columns(pl.exclude("x").arg_max())
.select([
pl.col("x").take(col).first().alias(col)
for col in df.columns if col != "x"
])
)
shape: (1, 2)
┌─────┬─────┐
│ y | z │
│ --- | --- │
│ i64 | i64 │
╞═════╪═════╡
│ 0 | 2 │
└─────┴─────┘
E.g. if I have
import polars as pl
df = pl.DataFrame({'a': [1,2,3], 'b': [4,5,6]})
how would I find the cumulative sum of each row?
Expected output:
a b
0 1 5
1 2 7
2 3 9
Here's the equivalent in pandas:
>>> import pandas as pd
>>> pd.DataFrame({'a': [1,2,3], 'b': [4,5,6]}).cumsum(axis=1)
a b
0 1 5
1 2 7
2 3 9
but I can't figure out how to do it in polars
Edit: Polars 0.14.18 and later
As of Polars 0.14.18, we can use the new polars.cumsum function to simplify this. (Note: this is slightly different than the polars.Expr.cumsum Expression, in that it acts as a root Expression.)
Using the same DataFrame as below:
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.select([
pl.exclude(my_cols),
pl.cumsum(my_cols).alias('result')
])
.unnest('result')
)
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 5 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 7 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 9 ┆ 18 │
└─────┴─────┴─────┴─────┘
Before Polars 0.14.18
Polars is column-oriented, and as such does not have the concept of a axis. Still, we can use the list evaluation context to solve this.
First, let's expand you data slightly:
df = pl.DataFrame({
"id": ['a', 'b', 'c'],
"a": [1, 2, 3],
"b": [4, 5, 6],
"c": [7, 8, 9],
})
df
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 4 ┆ 7 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 │
└─────┴─────┴─────┴─────┘
The Algorithm
Here's a general-purpose performant algorithm that will solve this. We'll walk through it below.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.arr.to_struct(name_generator=lambda idx: my_cols[idx])
.alias('result')
)
.drop(my_cols)
.unnest('result')
)
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 5 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 7 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 9 ┆ 18 │
└─────┴─────┴─────┴─────┘
How it works
First, we'll select the names of the numeric columns. You can name these explicitly if you like, e.g., my_cols=['a','b','c'].
Next, we'll gather up the column values into a list using polars.concat_list.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.alias('result')
)
)
shape: (3, 5)
┌─────┬─────┬─────┬─────┬───────────┐
│ id ┆ a ┆ b ┆ c ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ list[i64] │
╞═════╪═════╪═════╪═════╪═══════════╡
│ a ┆ 1 ┆ 4 ┆ 7 ┆ [1, 4, 7] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 ┆ [2, 5, 8] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 ┆ [3, 6, 9] │
└─────┴─────┴─────┴─────┴───────────┘
From here, we'll use the arr.eval context to run our cumsum on the list.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.alias('result')
)
)
shape: (3, 5)
┌─────┬─────┬─────┬─────┬────────────┐
│ id ┆ a ┆ b ┆ c ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ list[i64] │
╞═════╪═════╪═════╪═════╪════════════╡
│ a ┆ 1 ┆ 4 ┆ 7 ┆ [1, 5, 12] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 ┆ [2, 7, 15] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 ┆ [3, 9, 18] │
└─────┴─────┴─────┴─────┴────────────┘
Next, we'll break the list into a struct using arr.to_struct, and name the fields the corresponding names from our selected numeric columns.
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.arr.to_struct(name_generator=lambda idx: my_cols[idx])
.alias('result')
)
)
shape: (3, 5)
┌─────┬─────┬─────┬─────┬───────────┐
│ id ┆ a ┆ b ┆ c ┆ result │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 ┆ struct[3] │
╞═════╪═════╪═════╪═════╪═══════════╡
│ a ┆ 1 ┆ 4 ┆ 7 ┆ {1,5,12} │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 2 ┆ 5 ┆ 8 ┆ {2,7,15} │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ c ┆ 3 ┆ 6 ┆ 9 ┆ {3,9,18} │
└─────┴─────┴─────┴─────┴───────────┘
And finally, we'll use unnest to break the struct into columns. (But first we must drop the original columns or else we'll get two columns with the same name.)
my_cols = [s.name for s in df if s.is_numeric()]
(
df
.with_column(
pl.concat_list(my_cols)
.arr.eval(pl.element().cumsum())
.arr.to_struct(name_generator=lambda idx: my_cols[idx])
.alias('result')
)
.drop(my_cols)
.unnest('result')
)
shape: (3, 4)
┌─────┬─────┬─────┬─────┐
│ id ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 5 ┆ 12 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 2 ┆ 7 ┆ 15 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ c ┆ 3 ┆ 9 ┆ 18 │
└─────┴─────┴─────┴─────┘
There may be a simpler and faster way, but here is the programmatic solution.
Concatenate the values along the columns into a list
Calculate the cumulative sum over the list (the result is still a list)
Get values for each column in the result
import polars as pl
df = pl.DataFrame({'a': [1,2,3], 'b': [4,5,6]})
df.select([
pl.concat_list(pl.all())
.arr.eval(pl.element().cumsum())
.alias('cs')
]).select([
pl.col('cs').arr.get(i).alias(name)
for i, name in enumerate(df.columns)
])
shape: (3, 2)
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═════╪═════╡
│ 1 ┆ 5 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 9 │
└─────┴─────┘