Group rows with common id in pyspark

I am trying to analyze the number of combinations with the same label but in different periods.
from pyspark.sql.functions import *
from pyspark.sql.functions import last
from pyspark.sql.functions import arrays_zip
from pyspark.sql.types import *
from pyspark.sql import *
data = [["1", "2022-06-01 00:00:04.437000+02:00", "c", "A", "8", "6"],
["2", "2022-06-01 00:00:04.625000+02:00", "e", "A", "4", "2"],
["3", "2022-06-01 00:00:04.640000+02:00", "b", "A", "5", "3"],
["4", "2022-06-01 00:00:04.640000+02:00", "a", "A", "1", "6"],
["1", "2022-06-01 00:00:04.669000+02:00", "c", "B", "2", "4"],
["5", "2022-06-01 00:00:05.223000+02:00", "b", "B", "6", "8"],
["6", "2022-06-01 00:00:05.599886+02:00", "c", "A", None, "9"],
["1", "2022-06-01 00:00:05.740886+02:00", "b", "A", "8", "6"],
["2", "2022-06-01 00:00:05.937000+02:00", "a", "A", "4", "2"],
["3", "2022-06-01 00:00:05.937000+02:00", "e", "A", "5", "3"],
["4", "2022-06-01 00:00:30.746501-05:00", "b", "C", "1", "6"],
["1", "2022-06-01 00:00:30.747498-05:00", "d", "C", "2", "4"],
["5", "2022-06-01 00:00:30.789820+02:00", "b", "D", "6", "8"],
["6", "2022-06-01 00:00:31.062000+02:00", "e", "E", None, "9"],
["1", "2022-06-01 00:00:31.078000+02:00", "b", "E", "8", "6"],
["2", "2022-06-01 00:00:31.078000+02:00", "a", "F", "4", "2"],
["3", "2022-06-01 00:00:31.861017+02:00", "c", "G", "5", "3"],
["4", "2022-06-01 00:00:32.205639+00:00", "b", "H", "1", "6"],
["1", "2022-06-01 00:00:34.656000+02:00", "b", "I", "2", "4"],
["5", "2022-06-01 00:00:34.656000+02:00", "a", "I", "6", "8"],
["6", "2022-06-01 00:00:34.656000+02:00", "e", "I", None, "9"]]
columns = ['ID', 'source_timestamp', 'node_id', 'cd_equipment_no', 'x', 'y']
dataframe = spark.createDataFrame(data, columns)
dataframe = dataframe.withColumn("source_timestamp", to_timestamp(col("source_timestamp")))
This is what the data looks like:
+---+--------------------+-------+---------------+----+---+
| ID| source_timestamp|node_id|cd_equipment_no| x| y|
+---+--------------------+-------+---------------+----+---+
| 1|2022-05-31 22:00:...| c| A| 8| 6|
| 2|2022-05-31 22:00:...| e| A| 4| 2|
| 3|2022-05-31 22:00:...| b| A| 5| 3|
| 4|2022-05-31 22:00:...| a| A| 1| 6|
| 1|2022-05-31 22:00:...| c| B| 2| 4|
| 5|2022-05-31 22:00:...| b| B| 6| 8|
| 6|2022-05-31 22:00:...| c| A|null| 9|
| 1|2022-05-31 22:00:...| b| A| 8| 6|
| 2|2022-05-31 22:00:...| a| A| 4| 2|
| 3|2022-05-31 22:00:...| e| A| 5| 3|
| 4|2022-06-01 05:00:...| b| C| 1| 6|
| 1|2022-06-01 05:00:...| d| C| 2| 4|
| 5|2022-05-31 22:00:...| b| D| 6| 8|
| 6|2022-05-31 22:00:...| e| E|null| 9|
| 1|2022-05-31 22:00:...| b| E| 8| 6|
| 2|2022-05-31 22:00:...| a| F| 4| 2|
| 3|2022-05-31 22:00:...| c| G| 5| 3|
| 4|2022-06-01 00:00:...| b| H| 1| 6|
| 1|2022-05-31 22:00:...| b| I| 2| 4|
| 5|2022-05-31 22:00:...| a| I| 6| 8|
+---+--------------------+-------+---------------+----+---+
My intention is to create an identifier that restarts every time cd_equipment_no changes, with the rows sorted ascending by source_timestamp.
This is what I get
window = Window.partitionBy('cd_equipment_no').orderBy(col('source_timestamp'))
dataframe = dataframe.select('*', row_number().over(window).alias('posicion'))
+---+--------------------+-------+---------------+----+---+--------+--------+
| ID| source_timestamp|node_id|cd_equipment_no| x| y|posicion|posicion|
+---+--------------------+-------+---------------+----+---+--------+--------+
| 1|2022-05-31 22:00:...| c| A| 8| 6| 1| 1|
| 2|2022-05-31 22:00:...| e| A| 4| 2| 2| 2|
| 3|2022-05-31 22:00:...| b| A| 5| 3| 3| 3|
| 4|2022-05-31 22:00:...| a| A| 1| 6| 4| 4|
| 6|2022-05-31 22:00:...| c| A|null| 9| 7| 5|
| 1|2022-05-31 22:00:...| b| A| 8| 6| 8| 6|
| 2|2022-05-31 22:00:...| a| A| 4| 2| 9| 7|
| 3|2022-05-31 22:00:...| e| A| 5| 3| 10| 8|
| 1|2022-05-31 22:00:...| c| B| 2| 4| 5| 1|
| 5|2022-05-31 22:00:...| b| B| 6| 8| 6| 2|
| 4|2022-06-01 05:00:...| b| C| 1| 6| 20| 1|
| 1|2022-06-01 05:00:...| d| C| 2| 4| 21| 2|
| 5|2022-05-31 22:00:...| b| D| 6| 8| 11| 1|
| 6|2022-05-31 22:00:...| e| E|null| 9| 12| 1|
| 1|2022-05-31 22:00:...| b| E| 8| 6| 13| 2|
| 2|2022-05-31 22:00:...| a| F| 4| 2| 14| 1|
| 3|2022-05-31 22:00:...| c| G| 5| 3| 15| 1|
| 4|2022-06-01 00:00:...| b| H| 1| 6| 19| 1|
| 1|2022-05-31 22:00:...| b| I| 2| 4| 16| 1|
| 5|2022-05-31 22:00:...| a| I| 6| 8| 17| 2|
+---+--------------------+-------+---------------+----+---+--------+--------+
And this is what I want
+---+--------------------+-------+---------------+----+---+--------+--------+
| ID| source_timestamp|node_id|cd_equipment_no| x| y|posicion|posicion|
+---+--------------------+-------+---------------+----+---+--------+--------+
| 1|2022-05-31 22:00:...| c| A| 8| 6| 1| 1|
| 2|2022-05-31 22:00:...| e| A| 4| 2| 2| 2|
| 3|2022-05-31 22:00:...| b| A| 5| 3| 3| 3|
| 4|2022-05-31 22:00:...| a| A| 1| 6| 4| 4|
| 6|2022-05-31 22:00:...| c| A|null| 9| 7| 1|
| 1|2022-05-31 22:00:...| b| A| 8| 6| 8| 2|
| 2|2022-05-31 22:00:...| a| A| 4| 2| 9| 3|
| 3|2022-05-31 22:00:...| e| A| 5| 3| 10| 4|
| 1|2022-05-31 22:00:...| c| B| 2| 4| 5| 1|
| 5|2022-05-31 22:00:...| b| B| 6| 8| 6| 2|
| 4|2022-06-01 05:00:...| b| C| 1| 6| 20| 1|
| 1|2022-06-01 05:00:...| d| C| 2| 4| 21| 2|
| 5|2022-05-31 22:00:...| b| D| 6| 8| 11| 1|
| 6|2022-05-31 22:00:...| e| E|null| 9| 12| 1|
| 1|2022-05-31 22:00:...| b| E| 8| 6| 13| 2|
| 2|2022-05-31 22:00:...| a| F| 4| 2| 14| 1|
| 3|2022-05-31 22:00:...| c| G| 5| 3| 15| 1|
| 4|2022-06-01 00:00:...| b| H| 1| 6| 19| 1|
| 1|2022-05-31 22:00:...| b| I| 2| 4| 16| 1|
| 5|2022-05-31 22:00:...| a| I| 6| 8| 17| 2|
+---+--------------------+-------+---------------+----+---+--------+--------+
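For reference, one common way to get that grouping (a sketch, not from the original post; it assumes ID is an acceptable tie-breaker for rows that share a timestamp): flag every row where cd_equipment_no differs from the previous row in time order, turn the flags into a run id with a cumulative sum, and let row_number restart within each run.
from pyspark.sql import Window
from pyspark.sql.functions import col, lag, when, row_number, sum as sum_
# Global time ordering; ID breaks ties between rows with equal timestamps (assumption)
order_w = Window.orderBy("source_timestamp", "ID")
cum_w = order_w.rowsBetween(Window.unboundedPreceding, 0)
grouped = (
    dataframe
    # 1 whenever the equipment changes from the previous row (and for the first row)
    .withColumn("new_run", when(lag("cd_equipment_no").over(order_w) == col("cd_equipment_no"), 0).otherwise(1))
    # cumulative sum of the flags -> one id per consecutive run of cd_equipment_no
    .withColumn("run_id", sum_("new_run").over(cum_w))
    # row_number restarts at 1 inside each run, which is the desired posicion
    .withColumn("posicion", row_number().over(Window.partitionBy("run_id").orderBy("source_timestamp", "ID")))
    .drop("new_run")
)
grouped.show(25, truncate=False)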

Related

Equivalent ungroup() from R in Pyspark

I am trying to group by the values of metric, which can be I or M, and compute a sum based on the values of x. The result should be stored in each row for its respective value. Normally I do this in R with group_by and then ungroup, but I don't know the equivalent in PySpark. Any advice?
from pyspark.sql.functions import *
from pyspark.sql.functions import last
from pyspark.sql.functions import arrays_zip
from pyspark.sql.types import *
data = [["1", "Amit", "DU", "I", "8", "6"],
["2", "Mohit", "DU", "I", "4", "2"],
["3", "rohith", "BHU", "I", "5", "3"],
["4", "sridevi", "LPU", "I", "1", "6"],
["1", "sravan", "KLMP", "M", "2", "4"],
["5", "gnanesh", "IIT", "M", "6", "8"],
["6", "gnadesh", "KLM", "M","0", "9"]]
columns = ['ID', 'NAME', 'college', 'metric', 'x', 'y']
dataframe = spark.createDataFrame(data, columns)
dataframe = dataframe.withColumn("x",dataframe.x.cast(DoubleType()))
This is what the data looks like:
+---+-------+-------+------+----+---+
| ID| NAME|college|metric| x| y|
+---+-------+-------+------+----+---+
| 1| Amit| DU| I| 8| 6|
| 2| Mohit| DU| I| 4| 2|
| 3| rohith| BHU| I| 5| 3|
| 4|sridevi| LPU| I| 1| 6|
| 1| sravan| KLMP| M| 2| 4|
| 5|gnanesh| IIT| M| 6| 8|
| 6|gnadesh| KLM| M|0 | 9|
+---+-------+-------+------+----+---+
Expected output
+---+-------+-------+------+----+---+------+
| ID| NAME|college|metric| x| y| total|
+---+-------+-------+------+----+---+------+
| 1| Amit| DU| I| 8| 6| 18 |
| 2| Mohit| DU| I| 4| 2| 18 |
| 3| rohith| BHU| I| 5| 3| 18 |
| 4|sridevi| LPU| I| 1| 6| 18 |
| 1| sravan| KLMP| M| 2| 4| 8 |
| 5|gnanesh| IIT| M| 6| 8| 8 |
| 6|gnadesh| KLM| M| 0| 9| 8 |
+---+-------+-------+------+----+---+------+
I tried this, but it does not work:
dataframe.withColumn("total",dataframe.groupBy("metric").sum("x"))
You can do a groupBy on the data to calculate the total value, and then join the grouped dataframe back to the original data:
import pyspark.sql.functions as F
metric_sum_df = dataframe.groupby('metric').agg(F.sum('x').alias('total'))
total_df = dataframe.join(metric_sum_df, 'metric')
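For what it's worth, a window aggregation (a sketch, not part of the original answer) attaches the per-metric total to every row without a separate join, which mirrors R's grouped mutate followed by ungroup:
import pyspark.sql.functions as F
from pyspark.sql import Window
# Unordered window partitioned by metric: every row receives its group's total
total_df = dataframe.withColumn("total", F.sum("x").over(Window.partitionBy("metric")))
total_df.show()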

Pyspark cumsum over same values in orderBy column

I have the following dataframe:
+----+----+-----+
|col1|col2|value|
+----+----+-----+
| 11| a| 1|
| 11| a| 2|
| 11| b| 3|
| 11| a| 4|
| 11| b| 5|
| 22| a| 6|
| 22| b| 7|
+----+----+-----+
I want to calculate the cumulative sum of the 'value' column, partitioned by 'col1' and ordered by 'col2'.
This is the desired output:
+----+----+-----+------+
|col1|col2|value|cumsum|
+----+----+-----+------+
| 11| a| 1| 1|
| 11| a| 2| 3|
| 11| a| 4| 7|
| 11| b| 3| 10|
| 11| b| 5| 15|
| 22| a| 6| 6|
| 22| b| 7| 13|
+----+----+-----+------+
I used the code below, which gives me the df shown after it. It is not what I wanted. Can someone help me, please?
df.withColumn("cumsum", F.sum("value").over(Window.partitionBy("col1").orderBy("col2").rangeBetween(Window.unboundedPreceding, 0)))
+----+----+-----+------+
|col1|col2|value|cumsum|
+----+----+-----+------+
| 11| a| 2| 7|
| 11| a| 1| 7|
| 11| a| 4| 7|
| 11| b| 3| 15|
| 11| b| 5| 15|
| 22| a| 6| 6|
| 22| b| 7| 13|
+----+----+-----+------+
You have to use .rowsBetween instead of .rangeBetween in your window clause (see rowsBetween vs. rangeBetween).
Example:
df.withColumn("cumsum", sum("value").over(Window.partitionBy("col1").orderBy("col2").rowsBetween(Window.unboundedPreceding, 0))).show()
#+----+----+-----+------+
#|col1|col2|value|cumsum|
#+----+----+-----+------+
#| 11| a| 1| 1|
#| 11| a| 2| 3|
#| 11| a| 4| 7|
#| 11| b| 3| 10|
#| 11| b| 5| 15|
#| 22| a| 6| 6|
#| 22| b| 7| 13|
#+----+----+-----+------+
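A short follow-up, not from the original answer: with rangeBetween, rows that share the same orderBy value are peers and all receive the peer group's total, which is exactly why every 'a' row within col1 = 11 showed 7 above. rowsBetween advances one physical row at a time instead; adding value as a tie-breaker in the orderBy also makes the order of equal col2 rows, and therefore the running total, deterministic:
from pyspark.sql import Window
import pyspark.sql.functions as F
# Running total over physical rows; 'value' breaks ties among rows with equal col2
w = Window.partitionBy("col1").orderBy("col2", "value").rowsBetween(Window.unboundedPreceding, 0)
df.withColumn("cumsum", F.sum("value").over(w)).show()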

Spark: how to make value of new column based on different columns

Spark 2.2.1
Pyspark
df = sqlContext.createDataFrame([
("dog", "1", "2", "3"),
("cat", "4", "5", "6"),
("dog", "7", "8", "9"),
("cat", "10", "11", "12"),
("dog", "13", "14", "15"),
("parrot", "16", "17", "18"),
("goldfish", "19", "20", "21"),
], ["pet", "dog_30", "cat_30", "parrot_30"])
And then I have a list of the fields I care about from the "pet" column:
dfvalues = ["dog", "cat", "parrot"]
I want to write code that will give me the value from dog_30, cat_30 or parrot_30 that corresponds to the value in "pet". For example, in the first row the value of the pet column is dog, so we take the value of dog_30, which is 1.
I tried the following, but it just gives me nulls for the stats column. I also haven't figured out how to handle the goldfish case; I want to set that to 0.
mycols = [F.when(F.col("pet") == p + "_30", p) for p in dfvalues]
df = df.withColumn("newCol2",F.coalesce(*stats) )
df.show()
Desired output:
+--------+------+------+---------+------+
| pet|dog_30|cat_30|parrot_30|stats |
+--------+------+------+---------+------+
| dog| 1| 2| 3| 1 |
| cat| 4| 5| 6| 5 |
| dog| 7| 8| 9| 7 |
| cat| 10| 11| 12| 11 |
| dog| 13| 14| 15| 13 |
| parrot| 16| 17| 18| 18 |
|goldfish| 19| 20| 21| 0 |
+--------+------+------+---------+------+
The logic is off; you need .when(F.col("pet") == p, F.col(p + '_30')):
mycols = [F.when(F.col("pet") == p, F.col(p + '_30')) for p in dfvalues]
df = df.withColumn("newCol2",F.coalesce(F.coalesce(*mycols),F.lit(0)))
df.show()
+--------+------+------+---------+-------+
| pet|dog_30|cat_30|parrot_30|newCol2|
+--------+------+------+---------+-------+
| dog| 1| 2| 3| 1|
| cat| 4| 5| 6| 5|
| dog| 7| 8| 9| 7|
| cat| 10| 11| 12| 11|
| dog| 13| 14| 15| 13|
| parrot| 16| 17| 18| 18|
|goldfish| 19| 20| 21| 0|
+--------+------+------+---------+-------+
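An equivalent sketch that folds the list into a single chained CASE expression, with otherwise supplying the 0 fallback for pets such as goldfish, so the nested coalesce is not needed:
import pyspark.sql.functions as F
# Chain one .when(...) per pet in dfvalues, then fall back to 0 for everything else
expr = F.when(F.col("pet") == dfvalues[0], F.col(dfvalues[0] + "_30"))
for p in dfvalues[1:]:
    expr = expr.when(F.col("pet") == p, F.col(p + "_30"))
df = df.withColumn("stats", expr.otherwise(F.lit(0)))
df.show()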

Last Entry that matches a condition per Window

This dummy data represents a device with measurement cycles.
One measurement cycle goes from "Type" Init to Init.
What I want to find out is, for example, the last error within each measurement cycle (the condition will get far more complicated).
I already figured out a solution for this. What I really want to know is whether there is an easier / more efficient way to calculate this.
Example Dataset
val df_orig = spark.sparkContext.parallelize(Seq(
("Init", 1, 17, "I"),
("TypeA", 2, 17, "W"),
("TypeA", 3, 17, "E"),
("TypeA", 4, 17, "W"),
("TypeA", 5, 17, "E"),
("TypeA", 6, 17, "W"),
("Init", 7, 12, "I"),
("TypeB", 8, 12, "W"),
("TypeB", 9, 12, "E"),
("TypeB", 10, 12, "W"),
("TypeB", 11, 12, "W"),
("TypeB", 12, 12, "E"),
("TypeB", 13, 12, "E")
)).toDF("Type", "rn", "X_ChannelC", "Error_Type")
The following code represents my solution.
val fillWindow = Window.partitionBy().orderBy($"rn").rowsBetween(Window.unboundedPreceding, 0)
//create window
val df_with_window = df_orig.withColumn("window_flag", when($"Type".contains("Init"), 1).otherwise(null))
.withColumn("window_filled", sum($"window_flag").over(fillWindow))
val window = Window.partitionBy("window_filled").orderBy($"rn").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
//calculate last entry
val df_new = df_with_window.withColumn("is_relevant", when($"Error_Type".contains("E"), $"rn").otherwise(null))
.withColumn("last", last($"is_relevant", true).over(window))
.withColumn("pass", when($"last" === $"is_relevant", "Fail").otherwise(null))
df_new.show()
Result:
+-----+---+----------+----------+-----------+-------------+-----------+----+--------+
| Type| rn|X_ChannelC|Error_Type|window_flag|window_filled|is_relevant|last| pass|
+-----+---+----------+----------+-----------+-------------+-----------+----+--------+
| Init| 1| 17| I| 1| 1| null| 5| null|
|TypeA| 2| 17| W| null| 1| null| 5| null|
|TypeA| 3| 17| E| null| 1| 3| 5| null|
|TypeA| 4| 17| W| null| 1| null| 5| null|
|TypeA| 5| 17| E| null| 1| 5| 5|This one|
|TypeA| 6| 17| W| null| 1| null| 5| null|
| Init| 7| 12| I| 1| 2| null| 13| null|
|TypeB| 8| 12| W| null| 2| null| 13| null|
|TypeB| 9| 12| E| null| 2| 9| 13| null|
|TypeB| 10| 12| W| null| 2| null| 13| null|
|TypeB| 11| 12| W| null| 2| null| 13| null|
|TypeB| 12| 12| E| null| 2| 12| 13| null|
|TypeB| 13| 12| E| null| 2| 13| 13|This one|
+-----+---+----------+----------+-----------+-------------+-----------+----+--------+
Not sure if this is more efficient (it still uses two window functions, but it is a bit shorter):
val df_new = df_orig
.withColumn("measurement", sum(when($"Type"==="Init",1)).over(Window.orderBy($"rn")))
.withColumn("pass", $"rn"===max(when($"Error_Type"==="E",$"rn")).over(Window.partitionBy($"measurement")))
.show()
+-----+---+----------+----------+-----------+-----+
| Type| rn|X_ChannelC|Error_Type|measurement| pass|
+-----+---+----------+----------+-----------+-----+
| Init| 1| 17| I| 1|false|
|TypeA| 2| 17| W| 1|false|
|TypeA| 3| 17| E| 1|false|
|TypeA| 4| 17| W| 1|false|
|TypeA| 5| 17| E| 1| true|
|TypeA| 6| 17| W| 1|false|
| Init| 7| 12| I| 2|false|
|TypeB| 8| 12| W| 2|false|
|TypeB| 9| 12| E| 2|false|
|TypeB| 10| 12| W| 2|false|
|TypeB| 11| 12| W| 2|false|
|TypeB| 12| 12| E| 2|false|
|TypeB| 13| 12| E| 2| true|
+-----+---+----------+----------+-----------+-----+
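One caveat that applies to both versions: Window.orderBy($"rn") without a partitionBy moves all rows into a single partition, which is fine for this dummy data but can become a bottleneck on large inputs.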

Variable number of arguments for pyspark udf

I have around 275 columns and I would like to search 25 of them for the regex string "^D(410|412)". If this search string is present in any of the 25 columns, I would like to set MyNewColumn to true.
Using the approach below I can do it for 2 columns. Is there any way to pass a variable number of columns?
The code below works for 2 columns:
import re
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType
def moreThanTwoArgs(col1, col2):
    return bool(re.search("^D(410|412)", col1) or re.search("^D(410|412)", col2))
twoUDF = udf(moreThanTwoArgs, BooleanType())
df = df.withColumn("MyNewColumn", twoUDF(df["X1"], df["X2"]))
I tried something similar; here is sample code you can try and adapt:
df1 = sc.parallelize(
[
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
]).toDF(['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10'])
df1.show()
+---+---+---+---+---+---+---+---+---+---+
| c1| c2| c3| c4| c5| c6| c7| c8| c9|c10|
+---+---+---+---+---+---+---+---+---+---+
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10|
+---+---+---+---+---+---+---+---+---+---+
import pyspark.sql.functions as F
import pyspark.sql.types as T
import re
def booleanFindFunc(*args):
    return sum(args)
udfBoolean = F.udf(booleanFindFunc, T.StringType())
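# Note: the calls below pass Column objects straight to booleanFindFunc, so the sum is built with Column arithmetic and udfBoolean itself is never used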
#Below is Sum of three columns (c1+c2+c2)
df1.withColumn("MyNewColumn", booleanFindFunc(F.col("c1"), F.col("c2"), F.col("c2"))).show()
+---+---+---+---+---+---+---+---+---+---+-----------+
| c1| c2| c3| c4| c5| c6| c7| c8| c9|c10|MyNewColumn|
+---+---+---+---+---+---+---+---+---+---+-----------+
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 5|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 5|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 5|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 5|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 5|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 5|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 5|
+---+---+---+---+---+---+---+---+---+---+-----------+
#Below is Sum of All Columns (c1+c2+c3---+c10)
df1.withColumn("MyNewColumn", booleanFindFunc(*[F.col(i) for i in df1.columns])).show()
+---+---+---+---+---+---+---+---+---+---+-----------+
| c1| c2| c3| c4| c5| c6| c7| c8| c9|c10|MyNewColumn|
+---+---+---+---+---+---+---+---+---+---+-----------+
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 55|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 55|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 55|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 55|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 55|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 55|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 55|
+---+---+---+---+---+---+---+---+---+---+-----------+
#Below is Sum of All odd Columns (c1+c3+c5--+c9)
df1.withColumn("MyNewColumn", booleanFindFunc(*[F.col(i) for i in df1.columns if int(i[1:])%2])).show()
+---+---+---+---+---+---+---+---+---+---+-----------+
| c1| c2| c3| c4| c5| c6| c7| c8| c9|c10|MyNewColumn|
+---+---+---+---+---+---+---+---+---+---+-----------+
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 25|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 25|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 25|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 25|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 25|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 25|
| 1| 2| 3| 4| 5| 6| 7| 8| 9| 10| 25|
+---+---+---+---+---+---+---+---+---+---+-----------+
Hope this solves your problem.
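Bringing this back to the original regex requirement, a minimal sketch (cols_to_search is a hypothetical placeholder for the asker's 25 column names):
import re
import pyspark.sql.functions as F
import pyspark.sql.types as T
pattern = re.compile(r"^D(410|412)")
def any_match(*values):
    # values is the tuple of this row's entries for the selected columns
    return any(v is not None and bool(pattern.search(v)) for v in values)
anyMatchUDF = F.udf(any_match, T.BooleanType())
cols_to_search = ["X1", "X2"]  # hypothetical list; extend to all 25 column names
df = df.withColumn("MyNewColumn", anyMatchUDF(*[F.col(c) for c in cols_to_search]))
A UDF-free alternative would be to OR together per-column F.col(c).rlike(r"^D(410|412)") expressions with functools.reduce, which keeps the predicate visible to Spark's optimizer.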