In pandas, I can set several named columns as an index and then divide one DataFrame by another, like this:
import pandas as pd

df_1 = pd.DataFrame({
    'Name': ['a', 'a', 'a', 'b', 'b', 'c'],
    'Name_2': ['first', 'second', 'third', 'first', 'second', 'first'],
    'Value': [20, 40, 50, 100, 150, 400]
})
df_2 = pd.DataFrame({
    'Name': ['a', 'a', 'a', 'b', 'b', 'c'],
    'Name_2': ['first', 'second', 'third', 'first', 'second', 'first'],
    'Value': [10, 20, 25, 50, 75, 200]
})

df_1 = df_1.set_index(['Name', 'Name_2'])
df_2 = df_2.set_index(['Name', 'Name_2'])
df_1 / df_2
How can something like this be implemented in python-polars?
I can't find an answer to this question in the documentation.
You just use a join, then do the math on the appropriate column(s).
import polars as pl

df_1 = pl.DataFrame({
    'Name': ['a', 'a', 'a', 'b', 'b', 'c'],
    'Name_2': ['first', 'second', 'third', 'first', 'second', 'first'],
    'Value': [20, 40, 50, 100, 150, 400]
})
df_2 = pl.DataFrame({
    'Name': ['a', 'a', 'a', 'b', 'b', 'c'],
    'Name_2': ['first', 'second', 'third', 'first', 'second', 'first'],
    'Value': [10, 20, 25, 50, 75, 200]
})
That's the setup; the solution is:
df_1.join(df_2, on=['Name', 'Name_2']) \
    .select(['Name', 'Name_2', pl.col('Value') / pl.col('Value_right')])
If you have a bunch of "Value" columns and different index columns, you can do something like:
myindxcols = ['Name', 'Name_2']
myvalcols = [x for x in df_1.columns if x in df_2.columns and x not in myindxcols]

df_1.join(df_2, on=myindxcols) \
    .select(myindxcols + [pl.col(x) / pl.col(f"{x}_right") for x in myvalcols])
@Dean MacGregor beat me to it. Please accept his answer.
df_1 = pl.DataFrame({
    "Name": ["a", "a", "a", "b", "b", "c"],
    "Name_2": ["first", "second", "third", "first", "second", "first"],
    "Value": [20, 40, 50, 100, 150, 400]
})
df_2 = pl.DataFrame({
    "Name": ["a", "a", "a", "b", "b", "c"],
    "Name_2": ["first", "second", "third", "first", "second", "first"],
    "Value": [10, 20, 25, 50, 75, 200]
})
keys = ["Name", "Name_2"]
(df_1
    .join(df_2, on=keys, suffix="_right")
    .select([
        *keys,
        pl.col("Value") / pl.col("Value_right")
    ])
)
shape: (6, 3)
┌──────┬────────┬───────┐
│ Name ┆ Name_2 ┆ Value │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 │
╞══════╪════════╪═══════╡
│ a ┆ first ┆ 2.0 │
│ a ┆ second ┆ 2.0 │
│ a ┆ third ┆ 2.0 │
│ b ┆ first ┆ 2.0 │
│ b ┆ second ┆ 2.0 │
│ c ┆ first ┆ 2.0 │
└──────┴────────┴───────┘
I am working with multiple parquet datasets that were written with nested structs (sometimes multiple levels deep). I need to output a flattened (no-struct) schema. Right now the only way I can think to do that is to use for loops to iterate through the columns. Here is a simplified example of the looping approach:
while len([x.name for x in df if x.dtype == pl.Struct]) > 0:
    for col in df:
        if col.dtype == pl.Struct:
            df = df.unnest(col.name)
This works, and maybe it is the only way to do it; if so, it would be helpful to know that. But Polars is pretty neat and I'm wondering if there is a more functional way to do this without all the looping and reassigning the df to itself.
If you have a df like this:
df = pl.DataFrame(
    {'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [3, 4, 5], 'd': [4, 5, 6], 'e': [5, 6, 7]}
).select([pl.struct(['a', 'b']).alias('ab'), pl.struct(['c', 'd']).alias('cd'), 'e'])
You can unnest ab and cd at the same time by just doing:
df.unnest(['ab','cd'])
If you don't know your column names and types in advance, you can use a list comprehension like this:
[col_name for col_name,dtype in zip(df.columns, df.dtypes) if dtype==pl.Struct]
We can now just put that list comprehension in the unnest method.
df=df.unnest([col_name for col_name,dtype in zip(df.columns, df.dtypes) if dtype==pl.Struct])
If you have structs inside structs like:
df = pl.DataFrame(
    {'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [3, 4, 5], 'd': [4, 5, 6], 'e': [5, 6, 7]}
).select(
    [pl.struct(['a', 'b']).alias('ab'), pl.struct(['c', 'd']).alias('cd'), 'e']
).select(
    [pl.struct(['ab', 'cd']).alias('abcd'), 'e']
)
then I don't think you can get away from some kind of while loop, but this might be more concise:
while any([x == pl.Struct for x in df.dtypes]):
    df = df.unnest([col_name for col_name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Struct])
This is a minor addition. If you're concerned about constantly re-looping through a large number of columns, you can create a recursive function that addresses only structs (and nested structs).
def unnest_all(self: pl.DataFrame):
    cols = []
    for next_col in self:
        if next_col.dtype != pl.Struct:
            cols.append(next_col)
        else:
            # Recurse into the struct: turn it into its own frame,
            # flatten that, and splice the resulting columns back in.
            cols.extend(next_col.struct.to_frame().unnest_all().get_columns())
    return pl.DataFrame(cols)
pl.DataFrame.unnest_all = unnest_all
So, using the second example by @Dean MacGregor above:
df = (
    pl.DataFrame(
        {"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5], "d": [4, 5, 6], "e": [5, 6, 7]}
    )
    .select([pl.struct(["a", "b"]).alias("ab"), pl.struct(["c", "d"]).alias("cd"), "e"])
    .select([pl.struct(["ab", "cd"]).alias("abcd"), "e"])
)
df
df.unnest_all()
>>> df
shape: (3, 2)
┌───────────────┬─────┐
│ abcd ┆ e │
│ --- ┆ --- │
│ struct[2] ┆ i64 │
╞═══════════════╪═════╡
│ {{1,2},{3,4}} ┆ 5 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ {{2,3},{4,5}} ┆ 6 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ {{3,4},{5,6}} ┆ 7 │
└───────────────┴─────┘
>>> df.unnest_all()
shape: (3, 5)
┌─────┬─────┬─────┬─────┬─────┐
│ a ┆ b ┆ c ┆ d ┆ e │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╪═════╡
│ 1 ┆ 2 ┆ 3 ┆ 4 ┆ 5 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 3 ┆ 4 ┆ 5 ┆ 6 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 4 ┆ 5 ┆ 6 ┆ 7 │
└─────┴─────┴─────┴─────┴─────┘
And using the first example:
df = pl.DataFrame(
    {"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5], "d": [4, 5, 6], "e": [5, 6, 7]}
).select([pl.struct(["a", "b"]).alias("ab"), pl.struct(["c", "d"]).alias("cd"), "e"])
df
df.unnest_all()
>>> df
shape: (3, 3)
┌───────────┬───────────┬─────┐
│ ab ┆ cd ┆ e │
│ --- ┆ --- ┆ --- │
│ struct[2] ┆ struct[2] ┆ i64 │
╞═══════════╪═══════════╪═════╡
│ {1,2} ┆ {3,4} ┆ 5 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ {2,3} ┆ {4,5} ┆ 6 │
├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ {3,4} ┆ {5,6} ┆ 7 │
└───────────┴───────────┴─────┘
>>> df.unnest_all()
shape: (3, 5)
┌─────┬─────┬─────┬─────┬─────┐
│ a ┆ b ┆ c ┆ d ┆ e │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╪═════╡
│ 1 ┆ 2 ┆ 3 ┆ 4 ┆ 5 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 3 ┆ 4 ┆ 5 ┆ 6 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 4 ┆ 5 ┆ 6 ┆ 7 │
└─────┴─────┴─────┴─────┴─────┘
In the end, I'm not sure that this saves you much wall-clock time (or RAM).
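If you want to measure it, a rough timing sketch (assuming a df with nested structs, with unnest_all patched onto pl.DataFrame as above and the while-loop version in scope) could be:
import timeit

# Rough comparison; `unnest_loop` restates the while-loop approach
# from the earlier answer for a side-by-side timing.
def unnest_loop(frame: pl.DataFrame) -> pl.DataFrame:
    while any(dtype == pl.Struct for dtype in frame.dtypes):
        frame = frame.unnest([c for c, d in zip(frame.columns, frame.dtypes) if d == pl.Struct])
    return frame

print(timeit.timeit(lambda: df.unnest_all(), number=1000))
print(timeit.timeit(lambda: unnest_loop(df), number=1000))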
The other answers taught me a lot. I encountered a new situation where I wanted to be able to easily get each column labeled with all the structs it came from, i.e. for
pl.col("my").struct.field("test").struct.field("thing")
I wanted to recover
my.test.thing
as a string which I could easily use when reading a subset of columns with pyarrow via
pq.ParquetDataset(path).read(columns = ["my.test.thing"])
Since there are many hundreds of columns and the nesting can go quite deep, I wrote functions to do a depth-first search on the schema and extract the columns in that pyarrow-friendly format; then I can use those to select each column, unnested, all in one go.
First, I worked with the pyarrow schema because I couldn't figure out how to drill into the structs in the polars schema:
schema = df.to_arrow().schema
Navigating structs in that schema is quirky; at the top level the structure behaves differently than it does deeper in. I ended up writing two functions: the first navigates the top-level structure, and the second continues the search below it:
import pyarrow as pa

def schema_top_level_DFS(pa_schema):
    top_level_stack = list(range(len(pa_schema)))
    while top_level_stack:
        working_top_level_index = top_level_stack.pop()
        working_element_name = pa_schema.names[working_top_level_index]
        if type(pa_schema.types[working_top_level_index]) == pa.lib.StructType:
            second_level_stack = list(range(len(pa_schema.types[working_top_level_index])))
            while second_level_stack:
                working_second_level_index = second_level_stack.pop()
                schema_DFS(pa_schema.types[working_top_level_index][working_second_level_index], working_element_name)
        else:
            column_paths.append(working_element_name)

def schema_DFS(incoming_element, upstream_names):
    current_name = incoming_element.name
    combined_names = ".".join([upstream_names, current_name])
    if type(incoming_element.type) == pa.lib.StructType:
        stack = list(range(len(incoming_element.type)))
        while stack:
            working_index = stack.pop()
            working_element = incoming_element.type[working_index]
            schema_DFS(working_element, combined_names)
    else:
        column_paths.append(combined_names)
So that running
column_paths = []
schema_top_level_DFS(schema)
gives me column paths like
['struct_name_1.inner_struct_name_2.thing1', 'struct_name_1.inner_struct_name_2.thing2']
To actually do the unnesting, I wasn't sure how to do better than a function with a match statement:
def return_pl_formatting(col_string):
    col_list = col_string.split(".")
    match len(col_list):
        case 1:
            return pl.col(col_list[0]).alias(col_string)
        case 2:
            return pl.col(col_list[0]).struct.field(col_list[1]).alias(col_string)
        case 3:
            return pl.col(col_list[0]).struct.field(col_list[1]).struct.field(col_list[2]).alias(col_string)
        case 4:
            return pl.col(col_list[0]).struct.field(col_list[1]).struct.field(col_list[2]).struct.field(col_list[3]).alias(col_string)
        case 5:
            return pl.col(col_list[0]).struct.field(col_list[1]).struct.field(col_list[2]).struct.field(col_list[3]).struct.field(col_list[4]).alias(col_string)
        case 6:
            return pl.col(col_list[0]).struct.field(col_list[1]).struct.field(col_list[2]).struct.field(col_list[3]).struct.field(col_list[4]).struct.field(col_list[5]).alias(col_string)
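As an aside, here is a sketch of how the hard-coded depth limit could be avoided by folding .struct.field() over the path segments with functools.reduce (return_pl_formatting_reduce is a hypothetical name, not part of the original code):
from functools import reduce

def return_pl_formatting_reduce(col_string):
    # Fold .struct.field() over the remaining path segments so any
    # nesting depth works, not just the six cases handled above.
    head, *rest = col_string.split(".")
    expr = reduce(lambda e, name: e.struct.field(name), rest, pl.col(head))
    return expr.alias(col_string)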
Then get my unnested and nicely named df with:
df.select([return_pl_formatting(x) for x in column_paths])
To show the output on the example from @Dean MacGregor:
test = (
    pl.DataFrame(
        {"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5], "d": [4, 5, 6], "e": [5, 6, 7]}
    )
    .select([pl.struct(["a", "b"]).alias("ab"), pl.struct(["c", "d"]).alias("cd"), "e"])
    .select([pl.struct(["ab", "cd"]).alias("abcd"), "e"])
)
column_paths = []
schema_top_level_DFS(test.to_arrow().schema)
print(test.select([return_pl_formatting(x) for x in column_paths]))
shape: (3, 5)
┌─────┬───────────┬───────────┬───────────┬───────────┐
│ e ┆ abcd.cd.d ┆ abcd.cd.c ┆ abcd.ab.b ┆ abcd.ab.a │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 5 ┆ 4 ┆ 3 ┆ 2 ┆ 1 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 6 ┆ 5 ┆ 4 ┆ 3 ┆ 2 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 7 ┆ 6 ┆ 5 ┆ 4 ┆ 3 │
└─────┴───────────┴───────────┴───────────┴───────────┘
I have a Polars DataFrame with a list column. I want to control how many elements of a pl.List column are printed.
I've tried pl.Config.set_fmt_str_lengths() but this only restricts the number of elements when set to a small value; it doesn't show more elements for a large value.
I'm working in Jupyterlab but I think it's a general issue.
import polars as pl
N = 5
df = (
    pl.DataFrame({'id': range(N)})
    .with_row_count("value")
    .groupby_rolling("id", period=f"{N}i")
    .agg(pl.col("value"))
)
df
shape: (5, 2)
┌─────┬───────────────┐
│ id ┆ value │
│ --- ┆ --- │
│ i64 ┆ list[u32] │
╞═════╪═══════════════╡
│ 0 ┆ [0] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ [0, 1] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ [0, 1, 2] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ [0, 1, ... 3] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ [0, 1, ... 4] │
└─────┴───────────────┘
pl.Config.set_tbl_rows(100)
And more generally, I would try looking at dir(pl.Config).
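For instance, a quick sketch to list the setters it exposes:
# Print the names of all pl.Config methods that set display options:
print([m for m in dir(pl.Config) if m.startswith("set_")])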
You can use the following config parameter from the Polars documentation to set the length of the output, e.g. 100:
import polars as pl
pl.Config.set_fmt_str_lengths(100)
Currently I do not think you can, directly; the documentation for Config does not list any such method, and for me (in VSCode at least) set_fmt_str_lengths does not affect lists.
However, if your goal is simply to be able to see what you're working on and you don't mind a slightly hacky workaround, you can add a column next to it containing a string representation of the list, at which point pl.Config.set_fmt_str_lengths(<some large n>) will display however much of it you like. For example:
import polars as pl
pl.Config.set_fmt_str_lengths(100)
N = 5
df = (
    pl.DataFrame({'id': range(N)})
    .with_row_count("value")
    .groupby_rolling("id", period=f"{N}i")
    .agg(pl.col("value"))
    .with_column(
        pl.col("value")
        .apply(lambda x: "[" + ", ".join(f"{i}" for i in x) + "]")
        .alias("string_repr")
    )
)
df
shape: (5, 3)
┌─────┬───────────────┬─────────────────┐
│ id ┆ value ┆ string_repr │
│ --- ┆ --- ┆ --- │
│ i64 ┆ list[u32] ┆ str │
╞═════╪═══════════════╪═════════════════╡
│ 0 ┆ [0] ┆ [0] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ [0, 1] ┆ [0, 1] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ [0, 1, 2] ┆ [0, 1, 2] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ [0, 1, ... 3] ┆ [0, 1, 2, 3] │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ [0, 1, ... 4] ┆ [0, 1, 2, 3, 4] │
└─────┴───────────────┴─────────────────┘
I frequently need to calculate the percentage counts of a variable. For example, for the dataframe below
df = pl.DataFrame({"person": ["a", "a", "b"],
"value": [1, 2, 3]})
I want to return a dataframe like this:
person  percent
a       0.667
b       0.333
What I have been doing is the following, but I can't help thinking there must be a more efficient / more idiomatic Polars way to do this:
n_rows = len(df)
(
    df
    .with_column(pl.lit(1).alias('percent'))
    .groupby('person')
    .agg([pl.sum('percent') / n_rows])
)
polars.count will help here. When called without arguments, polars.count returns the number of rows in a particular context.
(
    df
    .groupby("person")
    .agg([pl.count().alias("count")])
    .with_column((pl.col("count") / pl.sum("count")).alias("percent_count"))
)
shape: (2, 3)
┌────────┬───────┬───────────────┐
│ person ┆ count ┆ percent_count │
│ --- ┆ --- ┆ --- │
│ str ┆ u32 ┆ f64 │
╞════════╪═══════╪═══════════════╡
│ a ┆ 2 ┆ 0.666667 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 1 ┆ 0.333333 │
└────────┴───────┴───────────────┘
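As an aside, since pl.count() gives the group size inside the aggregation, you could fold the division into a single step; a minimal sketch, using df.height for the total row count:
# Compute each group's share of rows in one aggregation;
# df.height is the total number of rows in the DataFrame.
(
    df
    .groupby("person")
    .agg((pl.count() / df.height).alias("percent"))
)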
I have a dataframe like:
pl.DataFrame({'a': [['a', 'b'], None, ['c', 'd', 'e'], None], 't': ['x', 'y', None, None]})
shape: (4, 2)
┌─────────────────┬──────┐
│ a ┆ t │
│ --- ┆ --- │
│ list[str] ┆ str │
╞═════════════════╪══════╡
│ ["a", "b"] ┆ x │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ y │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ ["c", "d", "e"] ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ null │
└─────────────────┴──────┘
I'd like to have a transformation that results in:
┌─────────────────┐
│ a │
│ --- │
│ list[str] │
╞═════════════════╡
│ ["a", "b", "x"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["y"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["c", "d", "e"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null │
└─────────────────┘
However, the obvious solutions which come to mind don't seem to work.
from polars import col

df.with_column(
    col('a').arr.concat(col('t'))
)
results in
┌──────────────────────┬──────┐
│ a ┆ t │
│ --- ┆ --- │
│ list[str] ┆ str │
╞══════════════════════╪══════╡
│ ["a", "b", "x"] ┆ x │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ y │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ ["c", "d", ... null] ┆ null │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ null │
└──────────────────────┴──────┘
Strangely, somehow
df.with_column(
    col('t').apply(lambda s: [s]).arr.concat(col('a'))
)
results in an error saying that the DataFrame length has changed:
ShapeError: Could not add column. The Series length 5 differs from the DataFrame height: 4
I don't understand why concatenating the two Series together should produce a new series of a different length. Is this a bug?
I have tried a number of ways to produce a solution but continue to run into errors. For example, using a list comprehension works to add the arrays together, but .append does not.
def combine(d):
    x, y = d['a'], d['t']
    if x and y:
        # return x.append(y)  # produces an error
        return [a for a in x] + [b for b in y]
    if x and not y:
        return [a for a in x]
    if y and not x:
        return [b for b in y]
    else:
        # return None  # produces an error
        return ['None']
df.with_column(
    pl.struct([col('a'), col('t')]).apply(combine).alias('combined')
)
gives
┌─────────────────┬──────┬─────────────────┐
│ a ┆ t ┆ combined │
│ --- ┆ --- ┆ --- │
│ list[str] ┆ str ┆ list[str] │
╞═════════════════╪══════╪═════════════════╡
│ ["a", "b"] ┆ x ┆ ["a", "b", "x"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ y ┆ ["y"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ ["c", "d", "e"] ┆ null ┆ ["c", "d", "e"] │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ null ┆ null ┆ ["None"] │
└─────────────────┴──────┴─────────────────┘
This gets part of the way there but now we have to deal with ["None"] at some point.
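One possible cleanup (a sketch, building on the combine function above) is to overwrite the ["None"] sentinel with a true null afterwards using a when/then expression:
# Replace the ["None"] sentinel with a real null wherever both
# inputs were null; otherwise keep the combined list.
df.with_column(
    pl.struct([col('a'), col('t')]).apply(combine).alias('combined')
).with_column(
    pl.when(col('a').is_null() & col('t').is_null())
    .then(None)
    .otherwise(col('combined'))
    .alias('combined')
)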