I have a variable driver_age and some levels 16_to_25, 25_to_34, etc.
I would like the dummy encoded columns to have names like driver_age#16_to_25.
I have the following workaround, but it is incompatible with LazyFrames.
prefix_sep = "#"
for col in features_categorical:
    # Dummy-encode this column, rename the dummies to use "#" as the
    # separator, and splice them back into the frame.
    ddf = df.get_column(col).to_dummies()
    new_names = [f"{col}{prefix_sep}{x[len(col)+1:]}" for x in ddf.columns]
    mapper = dict(zip(ddf.columns, new_names))
    ddf = ddf.rename(mapper)
    df = df.drop(col).hstack(ddf)
Is there a more efficient way to do this? Would it be reasonable to request this as a feature?
One (somewhat) easier way to accomplish this is to add the # as a suffix to your Categorical columns, and then target the #_ with a simple list comprehension.
Let's start with this data.
import polars as pl
df = (
pl.DataFrame([
pl.Series(
name='driver_age',
values=['16_to_25', '25_to_34', '35_to_45', '45_to_55'],
dtype=pl.Categorical),
pl.Series(
name='marital_status',
values=['S', 'M'] * 2,
dtype=pl.Categorical
),
pl.Series(
name='col1',
values=[1, 2, 3, 4],
),
pl.Series(
name='col2',
values=[10, 20, 30, 40],
),
])
)
df
shape: (4, 4)
┌────────────┬────────────────┬──────┬──────┐
│ driver_age ┆ marital_status ┆ col1 ┆ col2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ cat ┆ cat ┆ i64 ┆ i64 │
╞════════════╪════════════════╪══════╪══════╡
│ 16_to_25 ┆ S ┆ 1 ┆ 10 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 25_to_34 ┆ M ┆ 2 ┆ 20 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 35_to_45 ┆ S ┆ 3 ┆ 30 │
├╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 45_to_55 ┆ M ┆ 4 ┆ 40 │
└────────────┴────────────────┴──────┴──────┘
We use the suffix expression to append a # to the names of the Categorical columns, then create our dummy variables.
df = (
pl.get_dummies(
df
.select([
pl.exclude(pl.Categorical),
pl.col(pl.Categorical).suffix('#'),
]),
columns=[s.name + '#' for s in df.select(pl.col(pl.Categorical))]
)
)
df
shape: (4, 8)
┌──────┬──────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬───────────────────┬───────────────────┐
│ col1 ┆ col2 ┆ driver_age#_16_to_25 ┆ driver_age#_25_to_34 ┆ driver_age#_35_to_45 ┆ driver_age#_45_to_55 ┆ marital_status#_M ┆ marital_status#_S │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
╞══════╪══════╪══════════════════════╪══════════════════════╪══════════════════════╪══════════════════════╪═══════════════════╪═══════════════════╡
│ 1 ┆ 10 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 20 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 30 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 40 ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
└──────┴──────┴──────────────────────┴──────────────────────┴──────────────────────┴──────────────────────┴───────────────────┴───────────────────┘
From here, it's a one-liner to change the column names:
df.columns = [col_nm.replace('#_', '#') for col_nm in df.columns]
df
shape: (4, 8)
┌──────┬──────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────────────────┬──────────────────┐
│ col1 ┆ col2 ┆ driver_age#16_to_25 ┆ driver_age#25_to_34 ┆ driver_age#35_to_45 ┆ driver_age#45_to_55 ┆ marital_status#M ┆ marital_status#S │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 ┆ u8 │
╞══════╪══════╪═════════════════════╪═════════════════════╪═════════════════════╪═════════════════════╪══════════════════╪══════════════════╡
│ 1 ┆ 10 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 20 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 30 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 40 ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │
└──────┴──────┴─────────────────────┴─────────────────────┴─────────────────────┴─────────────────────┴──────────────────┴──────────────────┘
This isn't done in lazy mode, but then again, get_dummies isn't available in lazy mode either.
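If a LazyFrame-compatible approach is really needed, one possible workaround (a sketch only, not a built-in feature) is to build the dummy columns yourself as expressions, starting again from the original df with its Categorical columns. This assumes you can enumerate the levels of each categorical column up front; they are hard-coded here for illustration.
import polars as pl

# Levels are assumed known in advance (or collected once from an eager sample).
levels = {
    "driver_age": ["16_to_25", "25_to_34", "35_to_45", "45_to_55"],
    "marital_status": ["S", "M"],
}

out = (
    df.lazy()
    .with_columns([
        # Cast to Utf8 before comparing to avoid categorical/string comparison quirks.
        (pl.col(col).cast(pl.Utf8) == level).cast(pl.UInt8).alias(f"{col}#{level}")
        for col, lvls in levels.items()
        for level in lvls
    ])
    .drop(["driver_age", "marital_status"])
    .collect()
)
print(out)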
I have a dataframe like this:
import polars as pl
orig_df = pl.DataFrame({
"primary_key": [1, 1, 2, 2, 3, 3],
"simple_foreign_keys": [1, 1, 2, 4, 4, 4],
"fancy_foreign_keys": [[1], [1], [2], [1], [3, 4], [3, 4]],
})
I have some logic that computes when the secondary column changes, then additional logic to rank those changes per primary column. It works fine on the simple data type:
foreign_key_changed = pl.col('simple_foreign_keys') != pl.col('simple_foreign_keys').shift_and_fill(1, 0)
df = orig_df.sort('primary_key').with_column(foreign_key_changed.cumsum().alias('raw_changes'))
print(df)
shape: (6, 4)
┌─────────────┬─────────────────────┬────────────────────┬─────────────┐
│ primary_key ┆ simple_foreign_keys ┆ fancy_foreign_keys ┆ raw_changes │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ list[i64] ┆ u32 │
╞═════════════╪═════════════════════╪════════════════════╪═════════════╡
│ 1 ┆ 1 ┆ [1] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 1 ┆ [1] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2 ┆ [2] ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 4 ┆ [1] ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ [3, 4] ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ [3, 4] ┆ 3 │
└─────────────┴─────────────────────┴────────────────────┴─────────────┘
df = orig_df.sort('primary_key').with_column(foreign_key_changed.cumsum().rank('dense').over('primary_key').alias('ranked_changes'))
print(df)
shape: (6, 4)
┌─────────────┬─────────────────────┬────────────────────┬────────────────┐
│ primary_key ┆ simple_foreign_keys ┆ fancy_foreign_keys ┆ ranked_changes │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ list[i64] ┆ u32 │
╞═════════════╪═════════════════════╪════════════════════╪════════════════╡
│ 1 ┆ 1 ┆ [1] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 1 ┆ [1] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2 ┆ [2] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 4 ┆ [1] ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ [3, 4] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ [3, 4] ┆ 1 │
└─────────────┴─────────────────────┴────────────────────┴────────────────┘
But if I try the exact same logic on the List column, it blows up on me. Note that the intermediate columns (the cumsum, the rank) are still plain integers:
foreign_key_changed = pl.col('fancy_foreign_keys') != pl.col('fancy_foreign_keys').shift_and_fill(1, [])
df = orig_df.sort('primary_key').with_column(foreign_key_changed.cumsum().alias('raw_changes'))
print(df)
shape: (6, 4)
┌─────────────┬─────────────────────┬────────────────────┬─────────────┐
│ primary_key ┆ simple_foreign_keys ┆ fancy_foreign_keys ┆ raw_changes │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ list[i64] ┆ u32 │
╞═════════════╪═════════════════════╪════════════════════╪═════════════╡
│ 1 ┆ 1 ┆ [1] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 1 ┆ [1] ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 2 ┆ [2] ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 4 ┆ [1] ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ [3, 4] ┆ 4 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ [3, 4] ┆ 4 │
└─────────────┴─────────────────────┴────────────────────┴─────────────┘
df = orig_df.sort('primary_key').with_column(foreign_key_changed.cumsum().rank('dense').over('primary_key').alias('ranked_changes'))
thread '<unnamed>' panicked at 'implementation error, cannot get ref List(Null) from Int64', /Users/runner/work/polars/polars/polars/polars-core/src/series/mod.rs:945:13
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
Traceback (most recent call last):
File "polars_example.py", line 18, in <module>
df = orig_df.sort('primary_key').with_column(foreign_key_changed.cumsum().rank('dense').over('primary_key').alias('ranked_changes'))
File "venv/lib/python3.9/site-packages/polars/internals/frame.py", line 4144, in with_column
return self.with_columns([column])
File "venv/lib/python3.9/site-packages/polars/internals/frame.py", line 5308, in with_columns
self.lazy()
File "venv/lib/python3.9/site-packages/polars/internals/lazy_frame.py", line 652, in collect
return self._dataframe_class._from_pydf(ldf.collect())
pyo3_runtime.PanicException: implementation error, cannot get ref List(Null) from Int64
Am I doing this wrong? Is there some massaging that has to happen with the types?
This is with polars 0.13.62.
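One workaround that might be worth trying (a sketch only, not verified against polars 0.13.62) is to materialize the cumulative sum into a plain integer column first and apply the window ranking in a second step, so the over() expression never references the list column:
df = (
    orig_df
    .sort('primary_key')
    # Materialize the change counter as a plain integer column first...
    .with_column(foreign_key_changed.cumsum().alias('raw_changes'))
    # ...then rank that integer column per primary_key in a separate step.
    .with_column(
        pl.col('raw_changes').rank('dense').over('primary_key').alias('ranked_changes')
    )
)
print(df)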
I have multiple columns whose names start with "ts", like ts_1, ts_2, ts_3, etc. I want to sum these f64 values row by row, but I don't know the exact column names in advance. I can select them with a regex like pl.col('^ts.*$'), but how do I sum the values?
Update: Polars >= 0.13.60
As of Polars 0.13.60, you can now use polars.sum with a regex argument to polars.col.
df.with_column(
pl.sum(pl.col("^ts_.*$")).alias('ts_sum')
)
shape: (4, 6)
┌──────┬──────┬──────┬───────┬──────┬────────┐
│ ts_1 ┆ a ┆ ts_2 ┆ b ┆ ts_3 ┆ ts_sum │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╪══════╪════════╡
│ 1 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 111 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 112 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 113 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 114 │
└──────┴──────┴──────┴───────┴──────┴────────┘
Other methods
There are two ways to accomplish this. Starting with this data:
import polars as pl
df = pl.DataFrame({
'ts_1': [1, 2, 3, 4],
'a': [-100] * 4,
'ts_2': [10] * 4,
'b': [-1000] * 4,
'ts_3': [100] * 4,
})
df
shape: (4, 5)
┌──────┬──────┬──────┬───────┬──────┐
│ ts_1 ┆ a ┆ ts_2 ┆ b ┆ ts_3 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╪══════╡
│ 1 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 4 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 │
└──────┴──────┴──────┴───────┴──────┘
As of Polars 0.13.59, you can use a regex expression in fold.
df.with_column(
pl.fold(acc=pl.lit(0),
f=lambda acc, x: acc + x,
exprs=pl.col("^ts_.*$"))
.alias("ts_sum")
)
shape: (4, 6)
┌──────┬──────┬──────┬───────┬──────┬────────┐
│ ts_1 ┆ a ┆ ts_2 ┆ b ┆ ts_3 ┆ ts_sum │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╪══════╪════════╡
│ 1 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 111 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 112 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 113 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 114 │
└──────┴──────┴──────┴───────┴──────┴────────┘
The other option is to use the fold-like properties of polars.sum, which sums horizontally when passed a list of expressions or column names. For example:
df.with_column(
pl.sum(
[col_nm for col_nm in df.columns
if col_nm.startswith(r"ts_")]
).alias("ts_sum")
)
shape: (4, 6)
┌──────┬──────┬──────┬───────┬──────┬────────┐
│ ts_1 ┆ a ┆ ts_2 ┆ b ┆ ts_3 ┆ ts_sum │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╪══════╪════════╡
│ 1 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 111 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 112 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 113 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 114 │
└──────┴──────┴──────┴───────┴──────┴────────┘
If needed, you can filter the column names with a regex, using Python's re module.
import re
df.with_column(
pl.sum(
[col_nm for col_nm in df.columns
if re.search(r"^ts_.*$", col_nm)]
).alias("ts_sum")
)
shape: (4, 6)
┌──────┬──────┬──────┬───────┬──────┬────────┐
│ ts_1 ┆ a ┆ ts_2 ┆ b ┆ ts_3 ┆ ts_sum │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════╪══════╪══════╪═══════╪══════╪════════╡
│ 1 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 111 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 112 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 113 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4 ┆ -100 ┆ 10 ┆ -1000 ┆ 100 ┆ 114 │
└──────┴──────┴──────┴───────┴──────┴────────┘
I have a polars dataframe with many columns. I want to look at all the data from a single row aligned vertically so that I can see the values in many different columns without it going off the edge of the screen. How can I do this?
E.g. define a dataframe
df = pl.DataFrame({'a':[0,1],'b':[2,3]})
If I print df[0] in IPython/Jupyter, I get the row rendered as a regular one-row table (laid out horizontally).
But if I convert df to pandas and print df.iloc[0], I get the values listed vertically, one per line.
The latter is very handy when you've got many columns.
I've tried things like df[0].to_series(), but it only prints the first element, not the first row.
My suspicion is that there isn't a direct replacement because the pandas method relies on the series having an index. I think the polars solution will be more like making a two column dataframe where one column is the column names and the other is a value. I'm not sure if there's a method to do that though.
Thanks for any help you can offer!
import polars as pl
import numpy as np
# Create dataframe with lots of columns.
df = pl.DataFrame(np.random.randint(0, 1000, (5, 100)))
df
shape: (5, 100)
┌──────────┬──────────┬──────────┬──────────┬─────┬───────────┬───────────┬───────────┬───────────┐
│ column_0 ┆ column_1 ┆ column_2 ┆ column_3 ┆ ... ┆ column_96 ┆ column_97 ┆ column_98 ┆ column_99 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞══════════╪══════════╪══════════╪══════════╪═════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 285 ┆ 366 ┆ 886 ┆ 981 ┆ ... ┆ 63 ┆ 326 ┆ 882 ┆ 564 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 735 ┆ 269 ┆ 381 ┆ 78 ┆ ... ┆ 556 ┆ 737 ┆ 741 ┆ 768 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 543 ┆ 729 ┆ 915 ┆ 901 ┆ ... ┆ 48 ┆ 21 ┆ 277 ┆ 818 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 264 ┆ 424 ┆ 285 ┆ 540 ┆ ... ┆ 602 ┆ 584 ┆ 888 ┆ 836 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 269 ┆ 701 ┆ 483 ┆ 817 ┆ ... ┆ 579 ┆ 873 ┆ 192 ┆ 734 │
└──────────┴──────────┴──────────┴──────────┴─────┴───────────┴───────────┴───────────┴───────────┘
# Display the third row (index 2) by pairing each column name with its value.
tuple(zip(df.columns, df.row(2)))
(('column_0', 543),
('column_1', 729),
('column_2', 915),
('column_3', 901),
('column_4', 332),
('column_5', 156),
('column_6', 624),
('column_7', 37),
('column_8', 341),
('column_9', 503),
('column_10', 135),
('column_11', 183),
('column_12', 651),
('column_13', 910),
('column_14', 625),
('column_15', 129),
('column_16', 604),
('column_17', 671),
('column_18', 976),
('column_19', 558),
('column_20', 159),
('column_21', 314),
('column_22', 460),
('column_23', 49),
('column_24', 944),
('column_25', 6),
('column_26', 470),
('column_27', 228),
('column_28', 615),
('column_29', 230),
('column_30', 217),
('column_31', 66),
('column_32', 999),
('column_33', 440),
('column_34', 519),
('column_35', 851),
('column_36', 37),
('column_37', 859),
('column_38', 560),
('column_39', 870),
('column_40', 892),
('column_41', 192),
('column_42', 541),
('column_43', 136),
('column_44', 631),
('column_45', 22),
('column_46', 522),
('column_47', 225),
('column_48', 610),
('column_49', 191),
('column_50', 886),
('column_51', 454),
('column_52', 312),
('column_53', 956),
('column_54', 473),
('column_55', 851),
('column_56', 760),
('column_57', 224),
('column_58', 859),
('column_59', 442),
('column_60', 234),
('column_61', 788),
('column_62', 53),
('column_63', 999),
('column_64', 473),
('column_65', 237),
('column_66', 247),
('column_67', 307),
('column_68', 916),
('column_69', 94),
('column_70', 714),
('column_71', 233),
('column_72', 995),
('column_73', 335),
('column_74', 454),
('column_75', 801),
('column_76', 742),
('column_77', 386),
('column_78', 196),
('column_79', 239),
('column_80', 723),
('column_81', 59),
('column_82', 929),
('column_83', 852),
('column_84', 722),
('column_85', 328),
('column_86', 59),
('column_87', 710),
('column_88', 238),
('column_89', 823),
('column_90', 75),
('column_91', 307),
('column_92', 472),
('column_93', 822),
('column_94', 582),
('column_95', 802),
('column_96', 48),
('column_97', 21),
('column_98', 277),
('column_99', 818))
Pandas does not display all values either if you have many columns.
In [121]: df.to_pandas().iloc[0]
Out[121]:
column_0 285
column_1 366
column_2 886
column_3 981
column_4 464
...
column_95 862
column_96 63
column_97 326
column_98 882
column_99 564
Name: 0, Length: 100, dtype: int64
You can try using melt. For example:
df = pl.DataFrame(
[
pl.Series(name="col_str", values=["string1", "string2"]),
pl.Series(name="col_bool", values=[False, True]),
pl.Series(name="col_int", values=[1, 2]),
pl.Series(name="col_float", values=[10.0, 20.0]),
*[pl.Series(name=f"col_other_{idx}", values=[idx] * 2)
for idx in range(1, 25)],
]
)
print(df)
shape: (2, 28)
┌─────────┬──────────┬─────────┬───────────┬─────┬──────────────┬──────────────┬──────────────┬──────────────┐
│ col_str ┆ col_bool ┆ col_int ┆ col_float ┆ ... ┆ col_other_21 ┆ col_other_22 ┆ col_other_23 ┆ col_other_24 │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ bool ┆ i64 ┆ f64 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════════╪══════════╪═════════╪═══════════╪═════╪══════════════╪══════════════╪══════════════╪══════════════╡
│ string1 ┆ false ┆ 1 ┆ 10.0 ┆ ... ┆ 21 ┆ 22 ┆ 23 ┆ 24 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ string2 ┆ true ┆ 2 ┆ 20.0 ┆ ... ┆ 21 ┆ 22 ┆ 23 ┆ 24 │
└─────────┴──────────┴─────────┴───────────┴─────┴──────────────┴──────────────┴──────────────┴──────────────┘
To print the first row:
pl.Config.set_tbl_rows(100)
df[0,].melt()
shape: (28, 2)
┌──────────────┬─────────┐
│ variable ┆ value │
│ --- ┆ --- │
│ str ┆ str │
╞══════════════╪═════════╡
│ col_str ┆ string1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_bool ┆ false │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_int ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_float ┆ 10.0 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_1 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_2 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_3 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_4 ┆ 4 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_5 ┆ 5 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_6 ┆ 6 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_7 ┆ 7 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_8 ┆ 8 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_9 ┆ 9 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_10 ┆ 10 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_11 ┆ 11 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_12 ┆ 12 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_13 ┆ 13 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_14 ┆ 14 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_15 ┆ 15 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_16 ┆ 16 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_17 ┆ 17 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_18 ┆ 18 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_19 ┆ 19 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_20 ┆ 20 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_21 ┆ 21 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_22 ┆ 22 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_23 ┆ 23 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤
│ col_other_24 ┆ 24 │
└──────────────┴─────────┘
If needed, set the polars.Config.set_tbl_rows option to the number of rows you find acceptable. (This only needs to be done once per session, not every time you print.)
Notice that all values have been cast to super-type str. (One caution: this approach won't work if any of your columns are of dtype list.)
You may want to check the Polars Cookbook section on indexing here.
It states that:
|            | pandas     | polars   |
|------------|------------|----------|
| select row | df.iloc[2] | df[2, :] |
Cheers!