Can we/how to conditionally select columns? - python-polars

Let's say I have a list of dataframes like this:
Ldfs = [
    pl.DataFrame({'a': [1.0, 2.0, 3.1], 'b': [2, 3, 4]}),
    pl.DataFrame({'b': [1, 2, 3], 'c': [2, 3, 4]}),
    pl.DataFrame({'a': [1, 2, 3], 'c': [2, 3, 4]}),
]
I can't do pl.concat(Ldfs) because the frames don't all have the same columns, and even the ones that share the column a don't use the same data type for it.
What I'd like to do is concat them, adding a column of nulls wherever a column is missing and casting columns to a fixed datatype.
For instance, just taking the first element of the list I'd like to have something like this work:
Ldfs[0].select(
    pl.when(pl.col('c'))
    .then(pl.col('c').cast(pl.Float64()))
    .otherwise(pl.lit(None).cast(pl.Float64()))
    .alias('c')
)
Of course, this results in NotFoundError: c.

Would an approach like this work for you? (I'll convert your DataFrames to LazyFrames for added fun.)
Ldfs = [
    pl.DataFrame({"a": [1.0, 2.0, 3.1], "b": [2, 3, 4]}).lazy(),
    pl.DataFrame({"b": [1, 2, 3], "c": [2, 3, 4]}).lazy(),
    pl.DataFrame({"a": [1, 2, 3], "c": [2, 3, 4]}).lazy(),
]
my_schema = {
    "a": pl.Float64,
    "b": pl.Int64,
    "c": pl.UInt32,
}

def fix_schema(ldf: pl.LazyFrame) -> pl.LazyFrame:
    return (
        ldf.with_columns(
            [
                pl.col(col_nm).cast(col_type)
                for col_nm, col_type in my_schema.items()
                if col_nm in ldf.columns
            ]
        )
        .with_columns(
            [
                pl.lit(None, dtype=col_type).alias(col_nm)
                for col_nm, col_type in my_schema.items()
                if col_nm not in ldf.columns
            ]
        )
        .select(my_schema.keys())
    )

pl.concat(
    [fix_schema(next_frame) for next_frame in Ldfs],
    how="vertical",
).collect()
shape: (9, 3)
┌──────┬──────┬──────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ f64 ┆ i64 ┆ u32 │
╞══════╪══════╪══════╡
│ 1.0 ┆ 2 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2.0 ┆ 3 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3.1 ┆ 4 ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ 1 ┆ 2 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ 2 ┆ 3 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ 3 ┆ 4 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 1.0 ┆ null ┆ 2 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2.0 ┆ null ┆ 3 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3.0 ┆ null ┆ 4 │
└──────┴──────┴──────┘
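
As an aside, depending on your polars version, a diagonal concat may get you most of the way there: it fills in missing columns with nulls, and newer releases also offer a relaxed variant that supercasts mismatched dtypes. A minimal sketch, assuming a release that supports it:

# "diagonal" fills missing columns with nulls; "diagonal_relaxed"
# (newer releases) additionally casts shared columns with mismatched
# dtypes (here 'a': Float64 vs Int64) to a common supertype
pl.concat(Ldfs, how="diagonal_relaxed").collect()

This gives less control over the final schema than fix_schema, so an explicit cast afterwards may still be needed.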

.from_dicts() can infer the types and column names:
>>> df = pl.from_dicts([frame.to_dict() for frame in Ldfs])
>>> df
shape: (3, 3)
┌─────────────────┬───────────┬───────────┐
│ a | b | c │
│ --- | --- | --- │
│ list[f64] | list[i64] | list[i64] │
╞═════════════════╪═══════════╪═══════════╡
│ [1.0, 2.0, 3.1] | [2, 3, 4] | null │
├─────────────────┼───────────┼───────────┤
│ null | [1, 2, 3] | [2, 3, 4] │
├─────────────────┼───────────┼───────────┤
│ [1.0, 2.0, 3.0] | null | [2, 3, 4] │
└─────────────────┴───────────┴───────────┘
With right-sized [null, ...] lists, you could .explode() all columns.
>>> nulls = pl.Series([[None] * len(frame) for frame in Ldfs])
... (
... pl.from_dicts([
... frame.to_dict() for frame in Ldfs
... ])
... .with_columns(
... pl.all().fill_null(nulls))
... .explode(pl.all())
... )
shape: (9, 3)
┌──────┬──────┬──────┐
│ a | b | c │
│ --- | --- | --- │
│ f64 | i64 | i64 │
╞══════╪══════╪══════╡
│ 1.0 | 2 | null │
├──────┼──────┼──────┤
│ 2.0 | 3 | null │
├──────┼──────┼──────┤
│ 3.1 | 4 | null │
├──────┼──────┼──────┤
│ null | 1 | 2 │
├──────┼──────┼──────┤
│ null | 2 | 3 │
├──────┼──────┼──────┤
│ null | 3 | 4 │
├──────┼──────┼──────┤
│ 1.0 | null | 2 │
├──────┼──────┼──────┤
│ 2.0 | null | 3 │
├──────┼──────┼──────┤
│ 3.0 | null | 4 │
└──────┴──────┴──────┘

Related

How can I use when, then and otherwise with multiple conditions in polars?

I have a data set with three columns. Column A is to be checked for strings. If the string matches foo or spam, the values in the same row for the other two columns L and G should be changed to XX. For this I have tried the following.
df = pl.DataFrame(
    {
        "A": ["foo", "ham", "spam", "egg"],
        "L": ["A54", "A12", "B84", "C12"],
        "G": ["X34", "C84", "G96", "L6"],
    }
)
print(df)
shape: (4, 3)
┌──────┬─────┬─────┐
│ A ┆ L ┆ G │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞══════╪═════╪═════╡
│ foo ┆ A54 ┆ X34 │
│ ham ┆ A12 ┆ C84 │
│ spam ┆ B84 ┆ G96 │
│ egg ┆ C12 ┆ L6 │
└──────┴─────┴─────┘
expected outcome
shape: (4, 3)
┌──────┬─────┬─────┐
│ A ┆ L ┆ G │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str │
╞══════╪═════╪═════╡
│ foo ┆ XX ┆ XX │
│ ham ┆ A12 ┆ C84 │
│ spam ┆ XX ┆ XX │
│ egg ┆ C12 ┆ L6 │
└──────┴─────┴─────┘
I tried this
df = df.with_column(
    pl.when((pl.col("A") == "foo") | (pl.col("A") == "spam"))
    .then((pl.col("L")= "XX") & (pl.col("G")= "XX"))
    .otherwise((pl.col("L")) & (pl.col("G")))
)
However, this does not work. Can someone help me with this?
For setting multiple columns to the same value you could use:
df.with_columns(
    pl.when(pl.col("A").is_in(["foo", "spam"]))
    .then("XX")
    .otherwise(pl.col(["L", "G"]))
    .keep_name()
)
shape: (4, 3)
┌──────┬─────┬─────┐
│ A | L | G │
│ --- | --- | --- │
│ str | str | str │
╞══════╪═════╪═════╡
│ foo | XX | XX │
├──────┼─────┼─────┤
│ ham | A12 | C84 │
├──────┼─────┼─────┤
│ spam | XX | XX │
├──────┼─────┼─────┤
│ egg | C12 | L6 │
└──────┴─────┴─────┘
.is_in() can be used instead of multiple == x | == y chains.
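For example, these two filters select the same rows (a small illustrative sketch):

# chained equality checks
df.filter((pl.col("A") == "foo") | (pl.col("A") == "spam"))

# the same selection with .is_in()
df.filter(pl.col("A").is_in(["foo", "spam"]))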
To update multiple columns at once with different values you could use .map() and a dictionary:
df.with_columns(
    pl.when(pl.col("A").is_in(["foo", "spam"]))
    .then(
        pl.col(["L", "G"]).map(
            lambda column: {"L": "XX", "G": "YY"}.get(column.name)
        )
    )
    .otherwise(pl.col(["L", "G"]))
)
shape: (4, 3)
┌──────┬─────┬─────┐
│ A | L | G │
│ --- | --- | --- │
│ str | str | str │
╞══════╪═════╪═════╡
│ foo | XX | YY │
├──────┼─────┼─────┤
│ ham | A12 | C84 │
├──────┼─────┼─────┤
│ spam | XX | YY │
├──────┼─────┼─────┤
│ egg | C12 | L6 │
└──────┴─────┴─────┘

Copying row values based on Config file from polars dataframe

I have a dataframe consisting of an ID, Local, Entity, Field and Global column.
# Creating a dictionary with the data
data = {
    'ID': [4, 4, 4, 4, 4],
    'Local': ['A', 'B', 'C', 'D', 'E'],
    'Field': ['P', 'Q', 'R', 'S', 'T'],
    'Entity': ['K', 'L', 'M', 'N', 'O'],
    'Global': ['F', 'G', 'H', 'I', 'J'],
}
# Creating the dataframe
table = pl.DataFrame(data)
print(table)
shape: (5, 5)
┌─────┬───────┬───────┬──────────┬────────┐
│ ID ┆ Local ┆ Field ┆ Entity ┆ Global │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str ┆ str ┆ str │
╞═════╪═══════╪═══════╪══════════╪════════╡
│ 4 ┆ A ┆ P ┆ K ┆ F │
│ 4 ┆ B ┆ Q ┆ L ┆ G │
│ 4 ┆ C ┆ R ┆ M ┆ H │
│ 4 ┆ D ┆ S ┆ N ┆ I │
│ 4 ┆ E ┆ T ┆ O ┆ J │
└─────┴───────┴───────┴──────────┴────────┘
Within the dataset certain rows need to be copied. For this purpose the following config file is provided with the following information:
copying:
  - column_name: P
    source_table: K
    destination_table: X
  - column_name: S
    source_table: N
    destination_table: W
In the config file, column_name refers to the Field column, source_table refers to the given Entity, and destination_table should become the new entry in the Entity column. The goal is to enrich the data based on existing rows (just with other tables).
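For reference, a config like this can be loaded with OmegaConf, as mentioned further down; a minimal sketch (the filename config.yaml is hypothetical):

from omegaconf import OmegaConf

conf = OmegaConf.load("config.yaml")  # hypothetical path to the YAML above
for rule in conf.copying:
    print(rule.column_name, rule.source_table, rule.destination_table)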
The solution should look like this:
shape: (7, 5)
┌─────┬───────┬───────┬──────────┬────────┐
│ ID ┆ Local ┆ Field ┆ Entity ┆ Global │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ str ┆ str ┆ str │
╞═════╪═══════╪═══════╪══════════╪════════╡
│ 4 ┆ A ┆ P ┆ K ┆ F │
│ 4 ┆ B ┆ Q ┆ L ┆ G │
│ 4 ┆ C ┆ R ┆ M ┆ H │
│ 4 ┆ D ┆ S ┆ N ┆ I │
│ 4 ┆ E ┆ T ┆ O ┆ J │
│ 4 ┆ A ┆ P ┆ X ┆ F │
│ 4 ┆ D ┆ S ┆ W ┆ I │
└─────┴───────┴───────┴──────────┴────────┘
The dataset is a polars DataFrame and the config file is loaded as omegaconf. I tried it with this code:
conf.copying = [
    {"column_name": "P", "source_table": "K", "destination_table": "X"},
    {"column_name": "S", "source_table": "N", "destination_table": "W"},
]
# Iterate through the config file
for i in range(len(conf.copying)):
    # Select the rows from the table dataframe that match the
    # column_name and source_table fields in the config
    match_rows = table.filter(
        (pl.col("Field") == conf.copying[i]["column_name"])
        & (pl.col("Entity") == conf.copying[i]["source_table"])
    )
    # Drop the Entity column (it will be replaced)
    match_rows = match_rows.select(["ID", "Local", "Field", "Global"])
    # Add the Entity column with the destination_table value
    match_rows = match_rows.with_columns(
        pl.lit(conf.copying[i]["destination_table"]).alias("Entity")
    )
    match_rows = match_rows[["ID", "Local", "Field", "Entity", "Global"]]
    # Append the new rows to the original table dataframe
    df_copy = match_rows.vstack(table)
However, the data is not copied as expected and added to the existing dataset. What am I doing wrong?
It's probably better if you extract all of the rows you want in one go.
One possible approach is to build a list of when/then expressions:
rules = [
    pl.when(
        (pl.col("Field") == rule["column_name"])
        & (pl.col("Entity") == rule["source_table"])
    )
    .then(rule["destination_table"])
    .alias("Entity")
    for rule in conf.copying
]
You can use pl.coalesce and .drop_nulls() to remove non-matches.
>>> table.with_columns(pl.coalesce(rules)).drop_nulls()
shape: (2, 5)
┌─────┬───────┬───────┬────────┬────────┐
│ ID | Local | Field | Entity | Global │
│ --- | --- | --- | --- | --- │
│ i64 | str | str | str | str │
╞═════╪═══════╪═══════╪════════╪════════╡
│ 4 | A | P | X | F │
├─────┼───────┼───────┼────────┼────────┤
│ 4 | D | S | W | I │
└─────┴───────┴───────┴────────┴────────┘
This can then be added to the original dataframe.
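A minimal sketch of that final step, reusing the rules list from above:

# non-matching rows come back as nulls from pl.coalesce and are removed,
# leaving only the rewritten rows to append to the original frame
matches = table.with_columns(pl.coalesce(rules)).drop_nulls()
result = pl.concat([table, matches], how="vertical")  # 7 rows, as in the expected output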
Another style I've seen is to start with an "empty" when/then to build a .when().then() chain e.g.
rules = pl.when(False).then(None)
for rule in conf.copying:
    rules = (
        rules.when(
            (pl.col("Field") == rule["column_name"])
            & (pl.col("Entity") == rule["source_table"])
        )
        .then(rule["destination_table"])
    )
>>> table.with_columns(rules.alias("Entity")).drop_nulls()
shape: (2, 5)
┌─────┬───────┬───────┬────────┬────────┐
│ ID | Local | Field | Entity | Global │
│ --- | --- | --- | --- | --- │
│ i64 | str | str | str | str │
╞═════╪═══════╪═══════╪════════╪════════╡
│ 4 | A | P | X | F │
├─────┼───────┼───────┼────────┼────────┤
│ 4 | D | S | W | I │
└─────┴───────┴───────┴────────┴────────┘
I'm not sure whether it would make much difference to filter the rows first and then do the Entity modification - pl.any() could be used if you were to do that:
table.filter(
    pl.any([
        (pl.col("Field") == rule["column_name"])
        & (pl.col("Entity") == rule["source_table"])
        for rule in conf.copying
    ])
)
shape: (2, 5)
┌─────┬───────┬───────┬────────┬────────┐
│ ID | Local | Field | Entity | Global │
│ --- | --- | --- | --- | --- │
│ i64 | str | str | str | str │
╞═════╪═══════╪═══════╪════════╪════════╡
│ 4 | A | P | K | F │
├─────┼───────┼───────┼────────┼────────┤
│ 4 | D | S | N | I │
└─────┴───────┴───────┴────────┴────────┘

Join between Polars dataframes with inequality conditions

I would like to do a join between two dataframes, using as join condition an inequality condition, i.e. greater than.
Given two dataframes, I would like to get the result equivalent to the SQL written below.
stock_market_value = pl.DataFrame(
    {
        "date": [date(2022, 1, 1), date(2022, 2, 1), date(2022, 3, 1)],
        "price": [10.00, 12.00, 14.00],
    }
)
my_stock_orders = pl.DataFrame(
    {
        "date": [date(2022, 1, 15), date(2022, 2, 15)],
        "quantity": [2, 5],
    }
)
I have read that Polars supports join of type asof, but I don't think it applies to my case (maybe putting tolerance equal to infinity?).
For the sake of clarity, I wrote the join in the form of an SQL statement.
SELECT m.date, m.price * o.quantity AS portfolio_value
FROM stock_market_value m LEFT JOIN my_stock_orders o
ON m.date >= o.date
Example query/output:
duckdb.sql("""
    SELECT
        m.date AS market_date,
        o.date AS order_date,
        price,
        quantity,
        price * quantity AS portfolio_value
    FROM stock_market_value m
    LEFT JOIN my_stock_orders o
        ON m.date >= o.date
""").pl()
shape: (4, 5)
┌─────────────┬────────────┬───────┬──────────┬─────────────────┐
│ market_date | order_date | price | quantity | portfolio_value │
│ --- | --- | --- | --- | --- │
│ date | date | f64 | i64 | f64 │
╞═════════════╪════════════╪═══════╪══════════╪═════════════════╡
│ 2022-01-01 | null | 10.0 | null | null │
│ 2022-02-01 | 2022-01-15 | 12.0 | 2 | 24.0 │
│ 2022-03-01 | 2022-01-15 | 14.0 | 2 | 28.0 │
│ 2022-03-01 | 2022-02-15 | 14.0 | 5 | 70.0 │
└─────────────┴────────────┴───────┴──────────┴─────────────────┘
Why asof() is not the solution
Comments suggested using asof, but it does not actually work the way I expect.
Forward asof
result_fwd = stock_market_value.join_asof(
    my_stock_orders, left_on="date", right_on="date", strategy="forward"
)
print(result_fwd)
shape: (3, 3)
┌────────────┬───────┬──────────┐
│ date ┆ price ┆ quantity │
│ --- ┆ --- ┆ --- │
│ date ┆ f64 ┆ i64 │
╞════════════╪═══════╪══════════╡
│ 2022-01-01 ┆ 10.0 ┆ 2 │
│ 2022-02-01 ┆ 12.0 ┆ 5 │
│ 2022-03-01 ┆ 14.0 ┆ null │
└────────────┴───────┴──────────┘
Backward asof
result_bwd = stock_market_value.join_asof(
    my_stock_orders, left_on="date", right_on="date", strategy="backward"
)
print(result_bwd)
shape: (3, 3)
┌────────────┬───────┬──────────┐
│ date ┆ price ┆ quantity │
│ --- ┆ --- ┆ --- │
│ date ┆ f64 ┆ i64 │
╞════════════╪═══════╪══════════╡
│ 2022-01-01 ┆ 10.0 ┆ null │
│ 2022-02-01 ┆ 12.0 ┆ 2 │
│ 2022-03-01 ┆ 14.0 ┆ 5 │
└────────────┴───────┴──────────┘
Thanks!
You can do a join_asof. If you want to look forward, you should use the forward strategy:
stock_market_value.join_asof(
    my_stock_orders,
    on='date',
    strategy='forward',
).with_columns((pl.col("price") * pl.col("quantity")).alias("value"))
┌────────────┬───────┬──────────┬───────┐
│ date ┆ price ┆ quantity ┆ value │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ f64 ┆ i64 ┆ f64 │
╞════════════╪═══════╪══════════╪═══════╡
│ 2022-01-01 ┆ 10.0 ┆ 2 ┆ 20.0 │
│ 2022-02-01 ┆ 12.0 ┆ 5 ┆ 60.0 │
│ 2022-03-01 ┆ 14.0 ┆ null ┆ null │
└────────────┴───────┴──────────┴───────┘
You can use join_asof to determine which records to exclude from the date logic, then perform a cartesian product + filter yourself on the remainder, then merge everything back together. The following implements what you want, although it's a little bit hacky.
Update: using polars' native cross join instead of a self-defined cartesian-product function.
import polars as pl
from polars import col
from datetime import date

stock_market_value = pl.DataFrame({
    "market_date": [date(2022, 1, 1), date(2022, 2, 1), date(2022, 3, 1)],
    "price": [10.00, 12.00, 14.00],
})
stock_market_orders = pl.DataFrame({
    "order_date": [date(2022, 1, 15), date(2022, 2, 15)],
    "quantity": [2, 5],
})

# Use a backwards join_asof to flag rows in market_value that have at
# least one order with order_date < market_date
stock_market_value = stock_market_value.with_columns(
    stock_market_value.join_asof(
        stock_market_orders,
        left_on="market_date",
        right_on="order_date",
    )["order_date"].is_not_null().alias("has_match")
)
nonmatched_rows = stock_market_value.filter(col("has_match") == False).drop("has_match")
# Keep all other rows and perform a cartesian product
matched_rows = stock_market_value.filter(col("has_match") == True).drop("has_match")
df = matched_rows.join(stock_market_orders, how="cross")
# Filter based on our join condition
df = df.filter(col("market_date") > col("order_date"))
# Concatenate the unmatched rows with the filtered result for our final answer
df = pl.concat((nonmatched_rows, df), how="diagonal")
print(df)
Output:
shape: (4, 4)
┌─────────────┬───────┬────────────┬──────────┐
│ market_date ┆ price ┆ order_date ┆ quantity │
│ --- ┆ --- ┆ --- ┆ --- │
│ date ┆ f64 ┆ date ┆ i64 │
╞═════════════╪═══════╪════════════╪══════════╡
│ 2022-01-01 ┆ 10.0 ┆ null ┆ null │
│ 2022-02-01 ┆ 12.0 ┆ 2022-01-15 ┆ 2 │
│ 2022-03-01 ┆ 14.0 ┆ 2022-01-15 ┆ 2 │
│ 2022-03-01 ┆ 14.0 ┆ 2022-02-15 ┆ 5 │
└─────────────┴───────┴────────────┴──────────┘
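As a further note: recent polars releases (1.7+) added join_where for exactly this kind of inequality join, which avoids the manual cross product. A sketch, assuming a version that supports it, applied to the renamed frames as first defined in this answer (before the has_match column is added):

# join_where accepts arbitrary predicates; note it currently has
# inner-join semantics, so the unmatched 2022-01-01 row is dropped
stock_market_value.join_where(
    stock_market_orders,
    pl.col("market_date") >= pl.col("order_date"),
)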

Complex asof joins with option to select between duplicates + strictly less/greater than + use utf8

Is there a way to perform something similar to an asof join, but with:
- the option to select which element to join with (e.g. first, last) if there are duplicates
- the option to join with only strictly less/greater than
- the ability to use utf-8
Here's a code example:
import polars as pl

df1 = pl.DataFrame({
    'by_1': ['X', 'X', 'Y', 'Y'] * 8,
    'by_2': ['X', 'Y', 'X', 'Y'] * 8,
    'on_1': ['A'] * 16 + ['C'] * 16,
    'on_2': (['A'] * 8 + ['C'] * 8) * 2,
    '__index__': list(range(32)),
})
df2 = pl.DataFrame([
    {'by_1': 'Y', 'by_2': 'Y', 'on_1': 'B', 'on_2': 'A'},
    {'by_1': 'Y', 'by_2': 'Y', 'on_1': 'C', 'on_2': 'A'},
    {'by_1': 'Y', 'by_2': 'Z', 'on_1': 'A', 'on_2': 'A'},
])
df1:
┌──────┬──────┬──────┬──────┬───────────┐
│ by_1 ┆ by_2 ┆ on_1 ┆ on_2 ┆ __index__ │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ i64 │
╞══════╪══════╪══════╪══════╪═══════════╡
│ X ┆ X ┆ A ┆ A ┆ 0 │
│ X ┆ Y ┆ A ┆ A ┆ 1 │
│ Y ┆ X ┆ A ┆ A ┆ 2 │
│ Y ┆ Y ┆ A ┆ A ┆ 3 │
│ X ┆ X ┆ A ┆ A ┆ 4 │
│ X ┆ Y ┆ A ┆ A ┆ 5 │
│ Y ┆ X ┆ A ┆ A ┆ 6 │
│ Y ┆ Y ┆ A ┆ A ┆ 7 │
│ X ┆ X ┆ A ┆ C ┆ 8 │
│ X ┆ Y ┆ A ┆ C ┆ 9 │
│ Y ┆ X ┆ A ┆ C ┆ 10 │
│ Y ┆ Y ┆ A ┆ C ┆ 11 │
│ X ┆ X ┆ A ┆ C ┆ 12 │
│ X ┆ Y ┆ A ┆ C ┆ 13 │
│ Y ┆ X ┆ A ┆ C ┆ 14 │
│ Y ┆ Y ┆ A ┆ C ┆ 15 │
│ X ┆ X ┆ C ┆ A ┆ 16 │
│ X ┆ Y ┆ C ┆ A ┆ 17 │
│ Y ┆ X ┆ C ┆ A ┆ 18 │
│ Y ┆ Y ┆ C ┆ A ┆ 19 │
│ X ┆ X ┆ C ┆ A ┆ 20 │
│ X ┆ Y ┆ C ┆ A ┆ 21 │
│ Y ┆ X ┆ C ┆ A ┆ 22 │
│ Y ┆ Y ┆ C ┆ A ┆ 23 │
│ X ┆ X ┆ C ┆ C ┆ 24 │
│ X ┆ Y ┆ C ┆ C ┆ 25 │
│ Y ┆ X ┆ C ┆ C ┆ 26 │
│ Y ┆ Y ┆ C ┆ C ┆ 27 │
│ X ┆ X ┆ C ┆ C ┆ 28 │
│ X ┆ Y ┆ C ┆ C ┆ 29 │
│ Y ┆ X ┆ C ┆ C ┆ 30 │
│ Y ┆ Y ┆ C ┆ C ┆ 31 │
└──────┴──────┴──────┴──────┴───────────┘
df2:
┌──────┬──────┬──────┬──────┐
│ by_1 ┆ by_2 ┆ on_1 ┆ on_2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str │
╞══════╪══════╪══════╪══════╡
│ Y ┆ Y ┆ B ┆ A │
│ Y ┆ Y ┆ C ┆ A │
│ Y ┆ Z ┆ A ┆ A │
└──────┴──────┴──────┴──────┘
# Case 1 - Less Than (lt)
df2.join_asof_lt(
    df1,
    by=['by_1', 'by_2'],
    on=['on_1', 'on_2'],
    lt_select_eq='first',
)
┌──────┬──────┬──────┬──────┬───────────┐
│ by_1 ┆ by_2 ┆ on_1 ┆ on_2 ┆ __index__ │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ i64 │
╞══════╪══════╪══════╪══════╪═══════════╡
│ Y ┆ Y ┆ B ┆ A ┆ 11 │ # First strictly less than is ('Y', 'Y'), ('A', 'C'), which exists at index 11 and 15. Since lt_select_eq is 'first', choose 11
│ Y ┆ Y ┆ C ┆ A ┆ 11 │ # First strictly less than is ('Y', 'Y'), ('A', 'C'), which exists at index 11 and 15. Since lt_select_eq is 'first', choose 11
│ Y ┆ Z ┆ A ┆ A ┆ null │ # Group (Z, Y) does not exist, so return None
└──────┴──────┴──────┴──────┴───────────┘
df2.join_asof_lt(
    df1,
    by=['by_1', 'by_2'],
    on=['on_1', 'on_2'],
    lt_select_eq='last',
)
┌──────┬──────┬──────┬──────┬───────────┐
│ by_1 ┆ by_2 ┆ on_1 ┆ on_2 ┆ __index__ │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ i64 │
╞══════╪══════╪══════╪══════╪═══════════╡
│ Y ┆ Y ┆ B ┆ A ┆ 15 │ # First strictly less than is ('Y', 'Y'), ('A', 'C'), which exists at index 11 and 15. Since lt_select_eq is 'last', choose 15
│ Y ┆ Y ┆ C ┆ A ┆ 15 │ # First strictly less than is ('Y', 'Y'), ('A', 'C'), which exists at index 11 and 15. Since lt_select_eq is 'last', choose 15
│ Y ┆ Z ┆ A ┆ A ┆ null │ # Group (Z, Y) does not exist, so return None
└──────┴──────┴──────┴──────┴───────────┘
# Case 2 - Less Than or Equal To (leq)
df2.join_asof_leq(
    df1,
    by=['by_1', 'by_2'],
    on=['on_1', 'on_2'],
    lt_select_eq='first',
    eq_select_eq='last',
)
┌──────┬──────┬──────┬──────┬───────────┐
│ by_1 ┆ by_2 ┆ on_1 ┆ on_2 ┆ __index__ │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ i64 │
╞══════╪══════╪══════╪══════╪═══════════╡
│ Y ┆ Y ┆ B ┆ A ┆ 11 │ # First less than or equal to is ('Y', 'Y'), ('A', 'C'), which exists at index 11 and 15. Since lt_select_eq is 'first', choose 11
│ Y ┆ Y ┆ C ┆ A ┆ 23 │ # First less than or equal to is ('Y', 'Y'), ('C', 'A'), which exists at index 19 and 23. Since eq_select_eq is 'last', choose 23
│ Y ┆ Z ┆ A ┆ A ┆ null │ # Group (Z, Y) does not exist, so return None
└──────┴──────┴──────┴──────┴───────────┘
df2.join_asof_leq(
    df1,
    by=['by_1', 'by_2'],
    on=['on_1', 'on_2'],
    lt_select_eq='last',
    eq_select_eq='first',
)
┌──────┬──────┬──────┬──────┬───────────┐
│ by_1 ┆ by_2 ┆ on_1 ┆ on_2 ┆ __index__ │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ i64 │
╞══════╪══════╪══════╪══════╪═══════════╡
│ Y ┆ Y ┆ B ┆ A ┆ 15 │ # First less than or equal to is ('Y', 'Y'), ('A', 'C'), which exists at index 11 and 15. Since lt_select_eq is 'last', choose 15
│ Y ┆ Y ┆ C ┆ A ┆ 19 │ # First less than or equal to is ('Y', 'Y'), ('C', 'A'), which exists at index 19 and 23. Since eq_select_eq is 'first', choose 19
│ Y ┆ Z ┆ A ┆ A ┆ null │ # Group (Z, Y) does not exist, so return None
└──────┴──────┴──────┴──────┴───────────┘
These examples are for lt / leq, but it could also be gt / geq. Thanks!
I don't follow why indexes 3 and 7 are not to be classed as less than, but as an example:
df3 = df2.join(df1, on=["by_1", "by_2"], how="left")

df3.filter(
    pl.col("__index__").is_null()
    | (pl.col("on_1_right") < pl.col("on_1"))
)
shape: (9, 7)
┌──────┬──────┬──────┬──────┬────────────┬────────────┬───────────┐
│ by_1 | by_2 | on_1 | on_2 | on_1_right | on_2_right | __index__ │
│ --- | --- | --- | --- | --- | --- | --- │
│ str | str | str | str | str | str | i64 │
╞══════╪══════╪══════╪══════╪════════════╪════════════╪═══════════╡
│ Y | Y | B | A | A | A | 3 │
│ Y | Y | B | A | A | A | 7 │
│ Y | Y | B | A | A | C | 11 │
│ Y | Y | B | A | A | C | 15 │
│ Y | Y | C | A | A | A | 3 │
│ Y | Y | C | A | A | A | 7 │
│ Y | Y | C | A | A | C | 11 │
│ Y | Y | C | A | A | C | 15 │
│ Y | Z | A | A | null | null | null │
└──────┴──────┴──────┴──────┴────────────┴────────────┴───────────┘
Get "closest" match per group:
group_keys = ["by_1", "by_2", "on_1", "on_2"]

df3 = df2.join(df1, on=["by_1", "by_2"], how="left")
(
    df3
    .filter(
        pl.col("__index__").is_null()
        | (pl.col("on_1") > pl.col("on_1_right")))
    .filter(
        pl.col(["on_1_right", "on_2_right"])
        == pl.col(["on_1_right", "on_2_right"])
        .last()
        .over(group_keys))
)
shape: (5, 7)
┌──────┬──────┬──────┬──────┬────────────┬────────────┬───────────┐
│ by_1 | by_2 | on_1 | on_2 | on_1_right | on_2_right | __index__ │
│ --- | --- | --- | --- | --- | --- | --- │
│ str | str | str | str | str | str | i64 │
╞══════╪══════╪══════╪══════╪════════════╪════════════╪═══════════╡
│ Y | Y | B | A | A | C | 11 │
│ Y | Y | B | A | A | C | 15 │
│ Y | Y | C | A | A | C | 11 │
│ Y | Y | C | A | A | C | 15 │
│ Y | Z | A | A | null | null | null │
└──────┴──────┴──────┴──────┴────────────┴────────────┴───────────┘
If you .groupby(group_keys) that result, you can use .first() / .last().
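Here groups stands for the grouped version of the filtered frame above; a sketch of how it could be built (the intermediate name closest is just illustrative):

closest = df3.filter(
    pl.col("__index__").is_null()
    | (pl.col("on_1") > pl.col("on_1_right"))
).filter(
    pl.col(["on_1_right", "on_2_right"])
    == pl.col(["on_1_right", "on_2_right"]).last().over(group_keys)
)
groups = closest.groupby(group_keys)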
>>> groups.agg(pl.col("__index__").first())
shape: (3, 5)
┌──────┬──────┬──────┬──────┬───────────┐
│ by_1 | by_2 | on_1 | on_2 | __index__ │
│ --- | --- | --- | --- | --- │
│ str | str | str | str | i64 │
╞══════╪══════╪══════╪══════╪═══════════╡
│ Y | Y | C | A | 11 │
│ Y | Z | A | A | null │
│ Y | Y | B | A | 11 │
└──────┴──────┴──────┴──────┴───────────┘
>>> groups.agg(pl.col("__index__").last())
shape: (3, 5)
┌──────┬──────┬──────┬──────┬───────────┐
│ by_1 | by_2 | on_1 | on_2 | __index__ │
│ --- | --- | --- | --- | --- │
│ str | str | str | str | i64 │
╞══════╪══════╪══════╪══════╪═══════════╡
│ Y | Y | B | A | 15 │
│ Y | Z | A | A | null │
│ Y | Y | C | A | 15 │
└──────┴──────┴──────┴──────┴───────────┘

How to use join in expression context?

Suppose I have a mapping dataframe that I would like to join to an original dataframe:
df = pl.DataFrame({
    'A': [1, 2, 3, 2, 1],
})
mapper = pl.DataFrame({
    'key': [1, 2, 3, 4, 5],
    'value': ['a', 'b', 'c', 'd', 'e'],
})
I can map A to value directly via df.join(mapper, ...), but is there a way to do this in an expression context, i.e. while building columns? As in:
df.with_columns([
    (pl.col('A') + 1).join(mapper, left_on='A', right_on='key')
])
Which would furnish:
shape: (5, 2)
┌─────┬───────┐
│ A ┆ value │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═══════╡
│ 1 ┆ b │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1 ┆ b │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ c │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ c │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ d │
└─────┴───────┘
Probably, yes. I just put df.select(col('A') + 1) inside:
from polars import col

df = df.with_columns([
    col('A'),
    df.select(col('A') + 1).join(mapper, left_on='A', right_on='key')['value'],
])
print(df)
┌─────┬───────┐
│ A ┆ value │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═══════╡
│ 1 ┆ b │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ b │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ c │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ c │
├╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1 ┆ d │
└─────┴───────┘
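
Depending on your polars version, a plain dictionary lookup can also stay entirely in expression context and avoid the join. A sketch, assuming a version that provides Expr.map_dict (renamed to Expr.replace in later releases):

mapping = dict(zip(mapper['key'], mapper['value']))  # {1: 'a', 2: 'b', ...}
df.with_columns(
    (pl.col('A') + 1).map_dict(mapping).alias('value')
)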