PySpark: Count nested objects in array/list given condition

How can I count elements contained in an array when a condition is met?
Consider this example:
df = spark.createDataFrame(sc.parallelize([
    [
        ['pass', 'pass', 'fail', 'fail', 'fail'],
        ['Clear BMC SEL', 'Check SEL', 'Clear BMC SEL', 'CPU', 'Check SEL']
    ]
]), ['Status', 'Description'])
+--------------------+--------------------+
| Status| Description|
+--------------------+--------------------+
|[pass, pass, fail...|[Clear BMC SEL, C...|
+--------------------+--------------------+
The method below counts matching values within a single column, but I only want to count the values that meet the condition when the corresponding element of Status, a separate array column, is fail.
df = df.selectExpr('*', 'filter(Description, x -> x = "Clear BMC SEL" or x = "Check SEL") as pass_array')
df = df.selectExpr('*', 'size(pass_array) as testFailCount').drop('pass_array')
# the expected result is 2, but this returns 4 because Status is ignored
+--------------------+--------------------+-------------+
| Status| Description|testFailCount|
+--------------------+--------------------+-------------+
|[pass, pass, fail...|[Clear BMC SEL, C...| 4|
+--------------------+--------------------+-------------+

You can zip the two array columns using arrays_zip, which creates an array of structs where the Nth struct holds the Nth elements of the two array fields. The resulting array can then be filtered on both fields.
Here's an example.
from pyspark.sql import functions as func

data_sdf. \
    withColumn('desc_status_struct_arr', func.arrays_zip('description', 'status')). \
    withColumn('pass_array',
               func.expr('filter(desc_status_struct_arr, x -> x.description in ("Clear BMC SEL", "Check SEL") and x.status = "fail")')
               ). \
    withColumn('test_fail_count', func.size('pass_array')). \
    show(truncate=False)
# +------------------------------+---------------------------------------------------------+-------------------------------------------------------------------------------------------------+------------------------------------------+---------------+
# |status |description |desc_status_struct_arr |pass_array |test_fail_count|
# +------------------------------+---------------------------------------------------------+-------------------------------------------------------------------------------------------------+------------------------------------------+---------------+
# |[pass, pass, fail, fail, fail]|[Clear BMC SEL, Check SEL, Clear BMC SEL, CPU, Check SEL]|[{Clear BMC SEL, pass}, {Check SEL, pass}, {Clear BMC SEL, fail}, {CPU, fail}, {Check SEL, fail}]|[{Clear BMC SEL, fail}, {Check SEL, fail}]|2 |
# +------------------------------+---------------------------------------------------------+-------------------------------------------------------------------------------------------------+------------------------------------------+---------------+
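If the intermediate columns aren't needed, the same count can be computed in a single expression; a sketch assuming Spark 2.4+ (where zip_with and the other higher-order functions are available), applied to the question's df:
from pyspark.sql import functions as func

# pair Status/Description element-wise into booleans, keep the true ones, and count them
df = df.withColumn(
    'testFailCount',
    func.expr("""
        size(filter(
            zip_with(Status, Description,
                     (s, d) -> s = 'fail' and d in ('Clear BMC SEL', 'Check SEL')),
            x -> x
        ))
    """)
)
# testFailCount = 2 for the sample row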

Related

How to combine UDFs when creating a new column in Pyspark 1.6

I am trying to aggregate a table around one key value (id here) so that I can have one row per id, and then perform some checks on the rows that belong to each id in order to identify the 'result' (a type of transaction of sorts). Let's say that after aggregating, I have something like this:
from pyspark import SparkContext
from pyspark.sql import functions as F, types as T

sc = SparkContext()
cols = ['id', 'list1', 'list2']
data = [('zero', ['cd1', 'cd7', 'cd5', 'cd2'], ['', '', '', 'debit']),
        ('one', ['cd2', 'cd3', 'cd9', 'cd6'], ['credit', '', '', '']),
        ('two', ['cd4', 'cd3', 'cd5', 'cd1'], ['', '', '', ''])]
rdd = sc.parallelize(data)
df = rdd.toDF(cols)
>>> df.show()
+----+--------------------+--------------+
| id| list1| list2|
+----+--------------------+--------------+
|zero|[cd1, cd7, cd5, cd2]| [, , , debit]|
| one|[cd2, cd3, cd9, cd6]|[credit, , , ]|
| two|[cd4, cd3, cd5, cd1]| [, , , ]|
+----+--------------------+--------------+
The question I have to answer here is: does list1 have cd9 in it? If so, what is the corresponding value in list2 of list1's cd2?
What I did to solve it was define a couple of UDFs, since the array functions available in PySpark 1.6 are limited:
# enum: positions in x where the element equals y
enum = F.udf(lambda x, y: [i for i, e in enumerate(x) if e == y], T.ArrayType(T.IntegerType()))
# elat: elements of x sitting at the positions listed in y
elat = F.udf(lambda x, y: [e for i, e in enumerate(x) if i in y], T.ArrayType(T.StringType()))
# nulls: constant empty list
nulls = F.udf(lambda: [], T.ArrayType(T.IntegerType()))
Then creating a new 'lookup' column with the indexes of the elements I want to grab from the other column of lists:
df = df.withColumn('lookup',
                   F.when(F.array_contains(F.col('list1'), 'cd7') | F.array_contains(F.col('list1'), 'cd9'),
                          enum(F.col('list1'), F.lit('cd2')))
                    .otherwise(nulls()))
And finally using this column to reach my endgoal:
df = df.withColumn('result',
    F.when(F.array_contains(F.col('list1'), 'cd7') & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'debit'), 'CD 7 - DEBIT')
     .otherwise(F.when(F.array_contains(F.col('list1'), 'cd7') & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'credit'), 'CD 7 - CREDIT')
     .otherwise(F.when(F.array_contains(F.col('list1'), 'cd9') & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'debit'), 'CD 9 - DEBIT')
     .otherwise(F.when(F.array_contains(F.col('list1'), 'cd9') & F.array_contains(elat(F.col('list2'), F.col('lookup')), 'credit'), 'CD 9 - CREDIT')
     .otherwise('etc')))))
>>> df.show()
+----+--------------------+--------------+------+-------------+
| id| list1| list2|lookup| result|
+----+--------------------+--------------+------+-------------+
|zero|[cd1, cd7, cd5, cd2]| [, , , debit]| [3]| CD 7 - DEBIT|
| one|[cd2, cd3, cd9, cd6]|[credit, , , ]| [0]|CD 9 - CREDIT|
| two|[cd4, cd3, cd5, cd1]| [, , , ]| []| etc|
+----+--------------------+--------------+------+-------------+
But I would much prefer a way to achieve the same result without creating an extra column, because the actual dataframe has more columns and the lookup list may need to reference different columns depending on the rule I need to check. When I tried to combine the elat and enum UDFs in one go, Spark was unable to compute one or the other.
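One way to avoid the intermediate lookup column could be to fold the enum and elat logic into a single UDF that takes both lists plus the code to search for and returns the matching list2 values directly; a minimal sketch (untested on 1.6) assuming the same semantics as the UDFs above:
# returns the list2 entries sitting at the positions where list1 equals `code`
lookup_vals = F.udf(
    lambda l1, l2, code: [l2[i] for i, e in enumerate(l1) if e == code],
    T.ArrayType(T.StringType()))

df = df.withColumn(
    'result',
    F.when(F.array_contains(F.col('list1'), 'cd7')
           & F.array_contains(lookup_vals(F.col('list1'), F.col('list2'), F.lit('cd2')), 'debit'),
           'CD 7 - DEBIT')
     .otherwise('etc'))  # the remaining branches follow the same pattern as above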

How can I group by a column and use it to group the other column?

I am classifying a column into different parts based on its first letters: if two values share the same first 4 characters, they are in the same class. I use the following code to do that:
from pyspark.sql.functions import collect_set

# extract the first 4 characters of each title
df1_us2 = df1_us2.withColumn("first_2_char", df1_us2.clean_company_name.substr(1, 4))
# group the titles sharing a prefix into a list
group_user = df1_us2.groupBy('first_2_char').agg(collect_set('col1').alias('cal11'))
Each title has a description, and I want this classification to happen for the descriptions as well:
Example:
col1             description
summer           a season
summary          it is a brief
common           having similar
communication    null
house            living place
Output:
col11                        description1
['summer', 'summary']        ['a season', 'it is a brief']
['common', 'communication']  ['having similar', null]
['house']                    ['living place']
How can I modify the above code to get description1?
Note: if a description is null, the null should still be in the list, because I am going to use the index of the elements in col1 to get their descriptions, so both columns should have lists of the same size in each row.
collect_list should work as the aggregation function:
from pyspark.sql import functions as F
df = ...
df.withColumn('f2c', df.col1.substr(1, 2)) \
  .fillna('null') \
  .groupby('f2c') \
  .agg(F.collect_list('col1').alias('col11'),
       F.collect_list('description').alias('description1')) \
  .drop('f2c') \
  .show(truncate=False)
To include the null values in the arrays, they are replaced with the string 'null' first.
Output:
+-----------------------+-------------------------+
|col11 |description1 |
+-----------------------+-------------------------+
|[house] |[living place] |
|[common, communication]|[having similar, null] |
|[summer, summary] |[a season, it is a brief]|
+-----------------------+-------------------------+
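If real nulls are needed back afterwards, the placeholder strings can be mapped back to nulls inside the arrays; a small sketch assuming Spark 2.4+ SQL transform, where grouped is a hypothetical name for the aggregated DataFrame above:
from pyspark.sql import functions as F

# turn the 'null' placeholder strings back into real nulls inside each array
restored = grouped.withColumn(
    'description1',
    F.expr("transform(description1, x -> IF(x = 'null', NULL, x))")
)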
For further processing the two arrays can be combined into a map using map_from_arrays:
[...]
.withColumn('map', F.map_from_arrays('col11', 'description1')) \
.show(truncate=False)
Output:
+-----------------------+-------------------------+-------------------------------------------------+
|col11 |description1 |map |
+-----------------------+-------------------------+-------------------------------------------------+
|[house] |[living place] |{house -> living place} |
|[common, communication]|[having similar, null] |{common -> having similar, communication -> null}|
|[summer, summary] |[a season, it is a brief]|{summer -> a season, summary -> it is a brief} |
+-----------------------+-------------------------+-------------------------------------------------+
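With the map column in place, an individual description can be looked up by its col1 key; a usage sketch, again using grouped as a hypothetical name for the DataFrame carrying the map column:
from pyspark.sql import functions as F

# getItem returns null when the key is absent from the map
grouped.withColumn('summer_desc', F.col('map').getItem('summer')).show(truncate=False)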

Logic with dates pyspark

I am using PySpark and I have data like this in the dataframe (table 1), and I want the output like this.
The logic goes like this: from table 1 above, the first date of category B for id=1 is 08/06/2022 and the first date of category A is 13/06/2022, so any date on or after 13/06/2022 should have both categories A and B.
So 08/06/2022 has category B only, while 13/06/2022 has both A and B. For 24/06/2022 there is just category A in table 1, but the output should have category B too, since category B's first date (08/06/2022) falls before it; likewise for 26/07/2022 there is just category B in table 1, but the output should have both category A and category B.
How do I achieve this in PySpark?
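The snippet below references func, wd, and a data_ls list that aren't shown in the post; a minimal setup consistent with the output further down might look like this (the exact rows are an assumption):
from pyspark.sql import functions as func
from pyspark.sql.window import Window as wd

# hypothetical subset of the posted data, reconstructed from the output shown below
data_ls = [
    ('1', 'B', '2022-06-08'),
    ('1', 'A', '2022-06-13'),
    ('1', 'A', '2022-06-24'),
]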
# input dataframe creation
data_sdf = spark.sparkContext.parallelize(data_ls).toDF(['id', 'cat', 'dt']). \
    withColumn('dt', func.col('dt').cast('date'))
# required solution
data_sdf. \
    withColumn('min_dt', func.min('dt').over(wd.partitionBy('id'))). \
    withColumn('all_cats', func.collect_set('cat').over(wd.partitionBy('id'))). \
    withColumn('cat_arr',
               func.when(func.col('min_dt') == func.col('dt'), func.array(func.col('cat'))).
               otherwise(func.col('all_cats'))
               ). \
    drop('cat', 'min_dt', 'all_cats'). \
    dropDuplicates(). \
    withColumn('cat', func.explode('cat_arr')). \
    drop('cat_arr'). \
    orderBy('id', 'dt', 'cat'). \
    show()
# +---+----------+---+
# |id |dt |cat|
# +---+----------+---+
# |1 |2022-06-08|B |
# |1 |2022-06-13|A |
# |1 |2022-06-13|B |
# |1 |2022-06-24|A |
# |1 |2022-06-24|B |
# +---+----------+---+
I've used a subset of the posted data. The idea of the approach is that you create an array of distinct categories and apply that to all dates except the minimum date. The minimum date will only have that row's category (not all categories). The array can then be exploded to get the desired result for all dates.

Remove null values and shift values from the next column in pyspark

I need to translate a Python script to PySpark and it's proving to be a tough task for me.
I'm trying to remove null values from a dataframe (without removing the entire column or row) and shift the next non-null value into the prior column. Example:
       CLIENT | ANIMAL_1 | ANIMAL_2 | ANIMAL_3 | ANIMAL_4
ROW_1  1      | cow      | frog     | null     | dog
ROW_2  2      | pig      | null     | cat      | null
My goal is to have:
       CLIENT | ANIMAL_1 | ANIMAL_2 | ANIMAL_3 | ANIMAL_4
ROW_1  1      | cow      | frog     | dog      | null
ROW_2  2      | pig      | cat      | null     | null
The code I'm using in Python (which I got here on Stack Overflow) is:
df_out = df.apply(lambda x: pd.Series(x.dropna().to_numpy()), axis=1)
Then I rename the columns. But I have no idea how to do this in PySpark.
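For reference, the example frame above could be built like this; a sketch, with the exact types and construction assumed:
df = spark.createDataFrame(
    [(1, 'cow', 'frog', None, 'dog'),
     (2, 'pig', None, 'cat', None)],
    ['CLIENT', 'ANIMAL_1', 'ANIMAL_2', 'ANIMAL_3', 'ANIMAL_4']
)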
Here's a way to do this for Spark version 2.4+:
Create an array of the columns you want and sort it by your conditions, which are: (1) non-null values first, and (2) values in the order they appear across the columns. We can do the sorting by using array_sort. To achieve the multiple conditions, use arrays_zip. To make it easy to extract the value you want (i.e. the animal in this example), zip the column value as well.
from pyspark.sql.functions import array, array_sort, arrays_zip, col, lit
animal_cols = df.columns[1:]
N = len(animal_cols)
df_out = df.select(
    df.columns[0],
    array_sort(
        arrays_zip(
            array([col(c).isNull() for c in animal_cols]),
            array([lit(i) for i in range(N)]),
            array([col(c) for c in animal_cols])
        )
    ).alias('sorted')
)
df_out.show(truncate=False)
#+------+----------------------------------------------------------------+
#|CLIENT|sorted |
#+------+----------------------------------------------------------------+
#|1 |[[false, 0, cow], [false, 1, frog], [false, 3, dog], [true, 2,]]|
#|2 |[[false, 0, pig], [false, 2, cat], [true, 1,], [true, 3,]] |
#+------+----------------------------------------------------------------+
Now that things are in the right order, you just need to extract the values. In this case, the animal is the field named '2' of the struct at position i of the sorted column.
df_out = df_out.select(
    df.columns[0],
    *[col("sorted")[i]['2'].alias(animal_cols[i]) for i in range(N)]
)
df_out.show(truncate=False)
#+------+--------+--------+--------+--------+
#|CLIENT|ANIMAL_1|ANIMAL_2|ANIMAL_3|ANIMAL_4|
#+------+--------+--------+--------+--------+
#|1 |cow |frog |dog |null |
#|2 |pig |cat |null |null |
#+------+--------+--------+--------+--------+
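As an aside, on Spark 2.4+ the same shift can also be done without the explicit sort, by filtering out the nulls and reading the compacted array back by position (indexing past the end of an array returns null); this is an alternative sketch, not the answer's method:
from pyspark.sql.functions import col, expr

animal_cols = df.columns[1:]
df_alt = df.select(
    df.columns[0],
    # drop nulls while keeping the left-to-right order of the columns
    expr("filter(array({}), x -> x is not null)".format(', '.join(animal_cols))).alias('animals')
).select(
    df.columns[0],
    # reading past the end of the array yields null, which right-pads the result
    *[col('animals')[i].alias(animal_cols[i]) for i in range(len(animal_cols))]
)
df_alt.show(truncate=False)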

Alternative to GroupBy for Pyspark Dataframe?

I have a dataset like this:
timestamp   vars
2           [1, 2]
2           [1, 2]
3           [1, 2, 3]
3           [1, 2]
And I want a dataframe like this. Basically, each value in vars above is an index, and the output array holds the frequency of that value at that index; this computation is done for every unique timestamp.
timestamp   vars
2           [0, 2, 2]
3           [0, 2, 2, 1]
Right now, I'm grouping by timestamp and aggregating/flattening vars (to get something like 1,2,1,2 for timestamp 2 or 1,2,3,1,2 for timestamp 3), and then I have a UDF that uses collections.Counter to get a key->value dict. I then turn this dict into the format I want.
The groupBy/agg result can get arbitrarily large (array sizes can be in the millions), and this seems like a good use case for a Window function, but I'm not sure how to put it all together.
I think it's also worth mentioning that I've tried repartitioning, as well as converting to an RDD and using groupByKey. Both are arbitrarily slow (>24 hours) on large datasets.
Edit: As discussed in the comments, the issue with the original methods could be that counting via the filter or aggregate functions triggers unnecessary data scans. Below, we explode the arrays and do the aggregation (count) before creating the final array column:
from pyspark.sql.functions import collect_list, struct
df = spark.createDataFrame([(2,[1,2]), (2,[1,2]), (3,[1,2,3]), (3,[1,2])],['timestamp', 'vars'])
df.selectExpr("timestamp", "explode(vars) as var") \
.groupby('timestamp','var') \
.count() \
.groupby("timestamp") \
.agg(collect_list(struct("var","count")).alias("data")) \
.selectExpr(
"timestamp",
"transform(data, x -> x.var) as indices",
"transform(data, x -> x.count) as values"
).selectExpr(
"timestamp",
"transform(sequence(0, array_max(indices)), i -> IFNULL(values[array_position(indices,i)-1],0)) as new_vars"
).show(truncate=False)
+---------+------------+
|timestamp|new_vars |
+---------+------------+
|3 |[0, 2, 2, 1]|
|2 |[0, 2, 2] |
+---------+------------+
Where:
(1) we explode the array and do count() for each timestamp + var
(2) groupby timestamp and create an array of structs containing two fields var and count
(3) convert the array of structs into two arrays, indices and values (similar to how a SparseVector is defined)
(4) transform the sequence sequence(0, array_max(indices)): for each i in the sequence, use array_position to find the position of i in the indices array and then retrieve the value from the values array at the same position, see below:
IFNULL(values[array_position(indices,i)-1],0)
Notice that array_position uses 1-based indexing while array element access is 0-based, hence the -1 in the above expression.
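As a plain-Python illustration of step (4), using the indices and values implied by the output above for timestamp 3:
# after steps (1)-(3), timestamp 3 has vars 1, 2, 3 with counts 2, 2, 1
indices = [1, 2, 3]
values = [2, 2, 1]

# sequence(0, array_max(indices)) -> [0, 1, 2, 3]; the IFNULL supplies 0 for missing indices
new_vars = [values[indices.index(i)] if i in indices else 0
            for i in range(max(indices) + 1)]
# new_vars == [0, 2, 2, 1]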
Old methods:
(1) Use transform + filter/size
from pyspark.sql.functions import flatten, collect_list
df.groupby('timestamp').agg(flatten(collect_list('vars')).alias('data')) \
  .selectExpr(
      "timestamp",
      "transform(sequence(0, array_max(data)), x -> size(filter(data, y -> y = x))) as vars"
  ).show(truncate=False)
+---------+------------+
|timestamp|vars |
+---------+------------+
|3 |[0, 2, 2, 1]|
|2 |[0, 2, 2] |
+---------+------------+
(2) Use aggregate function:
df.groupby('timestamp').agg(flatten(collect_list('vars')).alias('data')) \
.selectExpr("timestamp", """
aggregate(
data,
/* use an array as zero_value, size = array_max(data))+1 and all values are zero */
array_repeat(0, int(array_max(data))+1),
/* increment the ith value of the Array by 1 if i == y */
(acc, y) -> transform(acc, (x,i) -> IF(i=y, x+1, x))
) as vars
""").show(truncate=False)