Group by per day in PySpark

I have a PySpark DataFrame:

+-------+-----+-----+----------+
|From id|To id|Price|Date      |
+-------+-----+-----+----------+
|a      |b    |20   |30/05/2019|
|b      |c    |5    |30/05/2019|
|c      |a    |20   |30/05/2019|
|a      |d    |10   |02/06/2019|
|d      |c    |5    |02/06/2019|
+-------+-----+-----+----------+

and a second DataFrame with the user names:

+---+--------+
|id |Name    |
+---+--------+
|a  |Claudia |
|b  |Manuella|
|c  |remy    |
|d  |Paul    |
+---+--------+
The output that I want is:

+----------+--------+---------------+
|Date      |Name    |current balance|
+----------+--------+---------------+
|30/05/2019|Claudia |0              |
|30/05/2019|Manuella|15             |
|30/05/2019|Remy    |-15            |
|30/05/2019|Paul    |0              |
|02/06/2019|Claudia |-10            |
|02/06/2019|Manuella|15             |
|02/06/2019|Remy    |-10            |
|02/06/2019|Paul    |5              |
+----------+--------+---------------+
I want to get the current balance on each day for all users.
My idea is to do a groupBy per user and calculate the sum of the To column minus the From column. But how do I do it per day? Especially since it is cumulative and not just per day.
Thank you.

It took a bit of effort to get the requirements right. Here's my version of the solution.
import sys

from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark import SparkContext, SQLContext
import pyspark.sql.functions as F
from pyspark.sql import Window

sc = SparkContext('local')
sqlContext = SQLContext(sc)

# Transactions: Price flows from From_Id to To_Id on a given Date.
data1 = [
    ("a", "b", 20, "30/05/2019"),
    ("b", "c", 5,  "30/05/2019"),
    ("c", "a", 20, "30/05/2019"),
    ("a", "d", 10, "02/06/2019"),
    ("d", "c", 5,  "02/06/2019"),
]
df1Columns = ["From_Id", "To_Id", "Price", "Date"]
df1 = sqlContext.createDataFrame(data=data1, schema=df1Columns)
df1 = df1.withColumn("Date", F.to_date(F.to_timestamp("Date", "dd/MM/yyyy")))
print("Actual initial data")
df1.show(truncate=False)

# Lookup table mapping user ids to names.
data2 = [
    ("a", "Claudia"),
    ("b", "Manuella"),
    ("c", "Remy"),
    ("d", "Paul"),
]
df2Columns = ["id", "Name"]
df2 = sqlContext.createDataFrame(data=data2, schema=df2Columns)
print("Actual initial data")
df2.show(truncate=False)

# Cross join every date with every user so users with no activity on a day still get a row.
alldays_df = df1.select("Date").distinct().repartition(20)
allusers_df = df2.select("id").distinct().repartition(10)
crossjoin_df = alldays_df.crossJoin(allusers_df)
crossjoin_df = crossjoin_df.withColumn("initial", F.lit(0))
crossjoin_df = crossjoin_df.withColumnRenamed("id", "common_id").cache()
crossjoin_df.show(n=40, truncate=False)

# Amount each user paid out per day; outer join against the cross join so missing users get 0.
from_sum_df = df1.groupby("Date", "From_Id").agg(F.sum("Price").alias("from_sum"))
from_sum_df = from_sum_df.withColumnRenamed("From_Id", "common_id")
from_sum_df.show(truncate=False)
from_sum_df = crossjoin_df.alias('cross').join(
    from_sum_df.alias('from'), ['Date', 'common_id'], how='outer'
).select(
    'Date', 'common_id',
    F.coalesce('from.from_sum', 'cross.initial').alias('from_amount')
).cache()
from_sum_df.show(truncate=False)

# Amount each user received per day; outer join so missing users get 0.
to_sum_df = df1.groupby("Date", "To_Id").agg(F.sum("Price").alias("to_sum"))
to_sum_df = to_sum_df.withColumnRenamed("To_Id", "common_id")
to_sum_df.show(truncate=False)
to_sum_df = crossjoin_df.alias('cross').join(
    to_sum_df.alias('to'), ['Date', 'common_id'], how='outer'
).select(
    'Date', 'common_id',
    F.coalesce('to.to_sum', 'cross.initial').alias('to_amount')
).cache()
to_sum_df.show(truncate=False)

# Daily balance = amount received minus amount paid.
joined_df = to_sum_df.join(from_sum_df, ["Date", "common_id"], how='inner')
joined_df.show(truncate=False)
balance_df = joined_df.withColumn("balance", F.col("to_amount") - F.col("from_amount"))
balance_df.show(truncate=False)

# Attach the user names.
final_df = balance_df.join(df2, F.col("id") == F.col("common_id"))
final_df.show(truncate=False)

# Running (cumulative) balance per user, ordered by date.
final_cum_sum = final_df.withColumn(
    'cumsum_balance',
    F.sum('balance').over(
        Window.partitionBy('common_id').orderBy('Date').rowsBetween(-sys.maxsize, 0)
    )
)
final_cum_sum.show()
The following are all the intermediate outputs, in the order of the show() calls above, for your progressive understanding; the comments in the code explain each step.
Actual initial data
+-------+-----+-----+----------+
|From_Id|To_Id|Price|Date |
+-------+-----+-----+----------+
|a |b |20 |2019-05-30|
|b |c |5 |2019-05-30|
|c |a |20 |2019-05-30|
|a |d |10 |2019-06-02|
|d |c |5 |2019-06-02|
+-------+-----+-----+----------+
Actual initial data
+---+--------+
|id |Name |
+---+--------+
|a |Claudia |
|b |Manuella|
|c |Remy |
|d |Paul |
+---+--------+
+----------+---------+-------+
|Date |common_id|initial|
+----------+---------+-------+
|2019-05-30|a |0 |
|2019-05-30|d |0 |
|2019-05-30|b |0 |
|2019-05-30|c |0 |
|2019-06-02|a |0 |
|2019-06-02|d |0 |
|2019-06-02|b |0 |
|2019-06-02|c |0 |
+----------+---------+-------+
+----------+---------+--------+
|Date |common_id|from_sum|
+----------+---------+--------+
|2019-06-02|a |10 |
|2019-05-30|a |20 |
|2019-06-02|d |5 |
|2019-05-30|c |20 |
|2019-05-30|b |5 |
+----------+---------+--------+
+----------+---------+-----------+
|Date |common_id|from_amount|
+----------+---------+-----------+
|2019-06-02|a |10 |
|2019-06-02|c |0 |
|2019-05-30|a |20 |
|2019-05-30|d |0 |
|2019-06-02|b |0 |
|2019-06-02|d |5 |
|2019-05-30|c |20 |
|2019-05-30|b |5 |
+----------+---------+-----------+
+----------+---------+------+
|Date |common_id|to_sum|
+----------+---------+------+
|2019-06-02|c |5 |
|2019-05-30|a |20 |
|2019-06-02|d |10 |
|2019-05-30|c |5 |
|2019-05-30|b |20 |
+----------+---------+------+
+----------+---------+---------+
|Date |common_id|to_amount|
+----------+---------+---------+
|2019-06-02|a |0 |
|2019-06-02|c |5 |
|2019-05-30|a |20 |
|2019-05-30|d |0 |
|2019-06-02|b |0 |
|2019-06-02|d |10 |
|2019-05-30|c |5 |
|2019-05-30|b |20 |
+----------+---------+---------+
+----------+---------+---------+-----------+
|Date |common_id|to_amount|from_amount|
+----------+---------+---------+-----------+
|2019-06-02|a |0 |10 |
|2019-06-02|c |5 |0 |
|2019-05-30|a |20 |20 |
|2019-05-30|d |0 |0 |
|2019-06-02|b |0 |0 |
|2019-06-02|d |10 |5 |
|2019-05-30|c |5 |20 |
|2019-05-30|b |20 |5 |
+----------+---------+---------+-----------+
+----------+---------+---------+-----------+-------+
|Date |common_id|to_amount|from_amount|balance|
+----------+---------+---------+-----------+-------+
|2019-06-02|a |0 |10 |-10 |
|2019-06-02|c |5 |0 |5 |
|2019-05-30|a |20 |20 |0 |
|2019-05-30|d |0 |0 |0 |
|2019-06-02|b |0 |0 |0 |
|2019-06-02|d |10 |5 |5 |
|2019-05-30|c |5 |20 |-15 |
|2019-05-30|b |20 |5 |15 |
+----------+---------+---------+-----------+-------+
+----------+---------+---------+-----------+-------+---+--------+
|Date |common_id|to_amount|from_amount|balance|id |Name |
+----------+---------+---------+-----------+-------+---+--------+
|2019-05-30|a |20 |20 |0 |a |Claudia |
|2019-06-02|a |0 |10 |-10 |a |Claudia |
|2019-05-30|b |20 |5 |15 |b |Manuella|
|2019-06-02|b |0 |0 |0 |b |Manuella|
|2019-05-30|c |5 |20 |-15 |c |Remy |
|2019-06-02|c |5 |0 |5 |c |Remy |
|2019-06-02|d |10 |5 |5 |d |Paul |
|2019-05-30|d |0 |0 |0 |d |Paul |
+----------+---------+---------+-----------+-------+---+--------+
+----------+---------+---------+-----------+-------+---+--------+--------------+
| Date|common_id|to_amount|from_amount|balance| id| Name|cumsum_balance|
+----------+---------+---------+-----------+-------+---+--------+--------------+
|2019-05-30| d| 0| 0| 0| d| Paul| 0|
|2019-06-02| d| 10| 5| 5| d| Paul| 5|
|2019-05-30| c| 5| 20| -15| c| Remy| -15|
|2019-06-02| c| 5| 0| 5| c| Remy| -10|
|2019-05-30| b| 20| 5| 15| b|Manuella| 15|
|2019-06-02| b| 0| 0| 0| b|Manuella| 15|
|2019-05-30| a| 20| 20| 0| a| Claudia| 0|
|2019-06-02| a| 0| 10| -10| a| Claudia| -10|
+----------+---------+---------+-----------+-------+---+--------+--------------+

Related

WithColumn and nulls, Scala Spark

I am trying to create a new column and compare it with another one; if they are equal I have to put "Yes", else "No", as you can see here:
+----+-----+--------+------+
|Game|statB|statPrev|Change|
+----+-----+--------+------+
|CA  |2    |2       |No    |
|BL  |5    |2       |Yes   |
|CD  |null |null    |No    |
|NT  |4    |5       |Yes   |
|FT  |6    |null    |Yes   |
+----+-----+--------+------+
What I am trying is:
var df1 = df.withColumn("Change",
  when($"statB" =!= $"statPrev"
    || $"statPrev".isNull && $"statB".isNotNull
    || $"statPrev".isNotNull && $"statB".isNotNull, "Yes").otherwise("No"))
But, for example, when statB and statPrev are both null, I get a "Yes"... What am I doing wrong?
To compare equality with nulls, you can use eqNullSafe for a simpler syntax:
val df2 = df.withColumn(
  "Change",
  when($"statB".eqNullSafe($"statPrev"), "Yes").otherwise("No")
)
df2.show
+----+-----+--------+------+
|Game|statB|statPrev|Change|
+----+-----+--------+------+
| CA| 2| 2| Yes|
| BL| 5| 2| No|
| CD| null| null| Yes|
| NT| 4| 5| No|
| FT| 6| null| No|
+----+-----+--------+------+
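Note that the expected output table in the question is actually the inverse of this (Yes when the values differ, No when they match, with two nulls counting as a match). If that is what you are after, a minimal sketch negating the null-safe comparison (same DataFrame and column names as above):
import org.apache.spark.sql.functions.when

val df3 = df.withColumn(
  "Change",
  // !(a <=> b) is true only when the values really differ; null <=> null is a match.
  when(!($"statB" <=> $"statPrev"), "Yes").otherwise("No")
)
df3.show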
According to your question ("if they are equal I have to put Yes, else No"), it should be:
var df1 = df.withColumn("Change1",
  when($"statB" === $"statPrev" || ($"statB".isNull && $"statPrev".isNull),
    "Yes").otherwise("No"))
df1.show(false)
Or you could use the null-safe equality operator <=> as:
df.withColumn("Change1",
  when($"statB" <=> $"statPrev", "Yes").otherwise("No"))
  .show(false)
Result:
+----+-----+--------+------+
|Game|statB|statPrev|Change|
+----+-----+--------+------+
|CA |2 |2 |Yes |
|BL |5 |2 |No |
|CD |null |null |Yes |
|NT |4 |5 |No |
|FT |6 |null |No |
+----+-----+--------+------+
If stateB and statePrev are equal:
df.withColumn("Change", when($"stateB" === $"statePrev", lit("YES")).otherwise("NO")).show(false);
Output:
+----+------+---------+------+
|Game|stateB|statePrev|Change|
+----+------+---------+------+
|CA  |2     |2        |YES   |
|BL  |5     |2        |NO    |
|CD  |null  |null     |YES   |
|NT  |4     |5        |NO    |
|FT  |6     |null     |NO    |
+----+------+---------+------+
If you want "NO" when stateB and statePrev are null:
df.withColumn("Change",
when(($"stateB" === $"statePrev") && ($"stateB".notEqual( "null")
&& $"statePrev".notEqual( "null")),
lit("YES")).otherwise("NO")).show(false)
Output:
+----+------+---------+------+
|Game|stateB|statePrev|Change|
+----+------+---------+------+
|CA |2 |2 |YES |
|BL |5 |2 |NO |
|CD |null |null |NO |
|NT |4 |5 |NO |
|FT |6 |null |NO |
+----+------+---------+------+

How would I repeat each row in a Scala dataframe N times

Here is the before of the dataframe, and here is the after (images not reproduced here); notice how the repeated rows are all next to each other, as opposed to just starting the dataframe over from scratch at the end.
Thanks
Try array_repeat with the struct function, then explode the array.
Example:
df.show()
/*
+----+----+
|col1|col2|
+----+----+
| 1| 4|
| 2| 5|
| 3| 6|
+----+----+
*/
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
df.withColumn("arr",explode(array_repeat(struct(df.columns.head,df.columns.tail:_*),7))).
select("arr.*").
toDF("col1","col2").
show(100,false)
/*
+----+----+
|col1|col2|
+----+----+
|1 |4 |
|1 |4 |
|1 |4 |
|1 |4 |
|1 |4 |
|1 |4 |
|1 |4 |
|2 |5 |
|2 |5 |
|2 |5 |
|2 |5 |
|2 |5 |
|2 |5 |
|2 |5 |
|3 |6 |
|3 |6 |
|3 |6 |
|3 |6 |
|3 |6 |
|3 |6 |
|3 |6 |
+----+----+
*/
Here's a function which duplicates a DataFrame:
import org.apache.spark.sql.DataFrame

def repeatRows(df: DataFrame, numRepeats: Int): DataFrame = {
  (1 until numRepeats).foldLeft(df)((growingDF, _) => growingDF.union(df))
}
The problem of having the resulting DataFrame sorted is separate from the duplication process, and hence wasn't included in the function, but can be easily achieved afterwards.
So let's take your problem:
// Problem setup
val someDF = Seq((1,4),(2,4),(3,6)).toDF("col1","col2")
// Duplicate followed by sort
val duplicatedSortedDF = repeatRows(someDF, 3).sort("col1")
// Show result
duplicatedSortedDF.show()
+----+----+
|col1|col2|
+----+----+
| 1| 4|
| 1| 4|
| 1| 4|
| 2| 4|
| 2| 4|
| 2| 4|
| 3| 6|
| 3| 6|
| 3| 6|
+----+----+
And there you have it.

Spark Scala, merging two columnar dataframes duplicating the second dataframe each time

I want to merge 2 columns or 2 dataframes, like:
df1
+--+
|id|
+--+
|1 |
|2 |
|3 |
+--+
df2 --> this one can be a list as well
+--+
|m |
+--+
|A |
|B |
|C |
+--+
I want to have the following as the resulting table:
+--+--+
|id|m |
+--+--+
|1 |A |
|1 |B |
|1 |C |
|2 |A |
|2 |B |
|2 |C |
|3 |A |
|3 |B |
|3 |C |
+--+--+
def crossJoin(right: org.apache.spark.sql.Dataset[_]): org.apache.spark.sql.DataFrame
Using the crossJoin function you can get the same result. Please check the code below.
scala> dfa.show
+---+
| id|
+---+
| 1|
| 2|
| 3|
+---+
scala> dfb.show
+---+
| m|
+---+
| A|
| B|
| C|
+---+
scala> dfa.crossJoin(dfb).orderBy($"id".asc).show(false)
+---+---+
|id |m |
+---+---+
|1 |B |
|1 |A |
|1 |C |
|2 |A |
|2 |B |
|2 |C |
|3 |C |
|3 |B |
|3 |A |
+---+---+
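Since df2 can also just be a list, here is a minimal sketch (assuming the values are known on the driver) that skips the second DataFrame entirely and explodes an array literal instead:
import org.apache.spark.sql.functions.{array, explode, lit}

val ms = Seq("A", "B", "C")
// One row per (id, m) pair, produced by exploding the literal array for every id.
dfa.withColumn("m", explode(array(ms.map(lit): _*)))
  .orderBy("id", "m")
  .show(false)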

how to rename the Columns Produced by count() function in Scala

I have the below df:
+-------+----+--------+
|student|vars|observed|
+-------+----+--------+
|1      |ABC |19      |
|1      |ABC |1       |
|2      |CDB |1       |
|1      |ABC |8       |
|3      |XYZ |3       |
|1      |ABC |389     |
|2      |CDB |946     |
|1      |ABC |342     |
+-------+----+--------+
I want to add a new frequency column, grouping by the two columns "student" and "vars", in Scala.
val frequency = df.groupBy($"student", $"vars").count()
This code generates a "count" column with the frequencies BUT loses the observed column from the df.
I would like to create a new df as follows, without losing the "observed" column:
+-------+----+--------+-----------+
|student|vars|observed|total_count|
+-------+----+--------+-----------+
|1      |ABC |9       |22         |
|1      |ABC |1       |22         |
|2      |CDB |1       |7          |
|1      |ABC |2       |22         |
|3      |XYZ |3       |3          |
|1      |ABC |8       |22         |
|2      |CDB |6       |7          |
|1      |ABC |2       |22         |
+-------+----+--------+-----------+
You cannot do this directly, but there are a couple of ways:
You can join the original df with the count df (see the sketch below).
You can collect the observed column while doing the aggregation and explode it again.
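For the first option, a minimal sketch (using the column names from the question): aggregate the counts under a new name, then join them back onto the original rows so observed is kept.
import org.apache.spark.sql.functions.count

val counts = df.groupBy("student", "vars").agg(count("*").as("total_count"))
// Join on the grouping keys; every original row keeps its observed value plus the group total.
val withTotals = df.join(counts, Seq("student", "vars"))
withTotals.show(false)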
With explode:
val frequency = df.groupBy("student", "vars")
  .agg(collect_list("observed").as("observed_list"), count("*").as("total_count"))
  .select($"student", $"vars", explode($"observed_list").alias("observed"), $"total_count")
scala> frequency.show(false)
+-------+----+--------+-----------+
|student|vars|observed|total_count|
+-------+----+--------+-----------+
|3 |XYZ |3 |1 |
|2 |CDB |1 |2 |
|2 |CDB |946 |2 |
|1 |ABC |389 |5 |
|1 |ABC |342 |5 |
|1 |ABC |19 |5 |
|1 |ABC |1 |5 |
|1 |ABC |8 |5 |
+-------+----+--------+-----------+
We can use Window functions as well:
import org.apache.spark.sql.expressions.Window

val windowSpec = Window.partitionBy("student", "vars")
val frequency = df.withColumn("total_count", count(col("student")).over(windowSpec))
frequency.show(false)
+-------+----+--------+-----------+
|student|vars|observed|total_count|
+-------+----+--------+-----------+
|3 |XYZ |3 |1 |
|2 |CDB |1 |2 |
|2 |CDB |946 |2 |
|1 |ABC |389 |5 |
|1 |ABC |342 |5 |
|1 |ABC |19 |5 |
|1 |ABC |1 |5 |
|1 |ABC |8 |5 |
+-------+----+--------+-----------+
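As an aside, if all you actually need is to rename the column that count() produces, withColumnRenamed does it directly (though, as noted above, you would still lose the observed column unless you join back):
val frequency = df.groupBy($"student", $"vars").count().withColumnRenamed("count", "total_count")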

clean missing values spark with aggregation function

I would like to clean missing values by replacing them with the mean. This source code used to work; I do not know why it doesn't work now. Any help will be appreciated.
Here is the dataset I use:
RowNumber,Poids,Age,Taille,0MI,Hmean,CoocParam,LdpParam,Test2,Classe
0,,72,160,5,,2.9421,,3,4
1,54,70,,5,0.6301,2.7273,,3,
2,,51,164,5,,2.9834,,3,4
3,,74,170,5,0.6966,2.9654,2.3699,3,4
4,108,62,,5,0.6087,2.7093,2.1619,3,4
Here is what I did:
val spark = SparkSession.builder.master("local").appName("my-spark-app").getOrCreate()
val df = spark.read.option("header", true).option("inferSchema", true).format("com.databricks.spark.csv").load("C:/Users/mhattabi/Desktop/data_with_missing_values3.csv")
df.show(false)
var newDF = df
df.dtypes.foreach { x =>
  val colName = x._1
  newDF = newDF.na.fill(df.agg(max(colName)).first()(0).toString, Seq(colName))
}
newDF.show(false)
Here is the result; nothing happened:
initial_data
+---------+-----+---+------+---+------+---------+--------+-----+------+
|RowNumber|Poids|Age|Taille|0MI|Hmean |CoocParam|LdpParam|Test2|Classe|
+---------+-----+---+------+---+------+---------+--------+-----+------+
|0 |null |72 |160 |5 |null |2.9421 |null |3 |4 |
|1 |54 |70 |null |5 |0.6301|2.7273 |null |3 |null |
|2 |null |51 |164 |5 |null |2.9834 |null |3 |4 |
|3 |null |74 |170 |5 |0.6966|2.9654 |2.3699 |3 |4 |
|4 |108 |62 |null |5 |0.6087|2.7093 |2.1619 |3 |4 |
+---------+-----+---+------+---+------+---------+--------+-----+------+
new_data
+---------+-----+---+------+---+------+---------+--------+-----+------+
|RowNumber|Poids|Age|Taille|0MI|Hmean |CoocParam|LdpParam|Test2|Classe|
+---------+-----+---+------+---+------+---------+--------+-----+------+
|0 |null |72 |160 |5 |null |2.9421 |null |3 |4 |
|1 |54 |70 |null |5 |0.6301|2.7273 |null |3 |null |
|2 |null |51 |164 |5 |null |2.9834 |null |3 |4 |
|3 |null |74 |170 |5 |0.6966|2.9654 |2.3699 |3 |4 |
|4 |108 |62 |null |5 |0.6087|2.7093 |2.1619 |3 |4 |
+---------+-----+---+------+---+------+---------+--------+-----+------+
What should I do?
You can use the withColumn API and the when function to check for null values in the columns as:
var newDF = df
df.dtypes.foreach { x =>
  val colName = x._1
  val fill = df.agg(max(col(s"`$colName`"))).first()(0).toString
  newDF = newDF.withColumn(colName, when(col(s"`$colName`").isNull, fill).otherwise(col(s"`$colName`")))
}
newDF.show(false)
I hope this solves your issue
If you are trying to replace the null values with the mean value, then you calculate the mean and fill as:
import org.apache.spark.sql.functions.mean

val data = spark.read.option("header", true)
  .option("inferSchema", true).format("com.databricks.spark.csv")
  .load("data.csv")

// Calculate the mean for each column, build a map of column name -> mean,
// and use the na.fill() method to replace nulls with that mean.
data.na.fill(data.columns.zip(
  data.select(data.columns.map(mean(_)): _*).first.toSeq
).toMap)
I have tested the code locally and it works fine.
Output:
+---------+-----+---+------+---+------------------+---------+------------------+-----+------+
|RowNumber|Poids|Age|Taille|0MI| Hmean|CoocParam| LdpParam|Test2|Classe|
+---------+-----+---+------+---+------------------+---------+------------------+-----+------+
| 0| 81| 72| 160| 5|0.6451333333333333| 2.9421|2.2659000000000002| 3| 4|
| 1| 54| 70| 164| 5| 0.6301| 2.7273|2.2659000000000002| 3| 4|
| 2| 81| 51| 164| 5|0.6451333333333333| 2.9834|2.2659000000000002| 3| 4|
| 3| 81| 74| 170| 5| 0.6966| 2.9654| 2.3699| 3| 4|
| 4| 108| 62| 164| 5| 0.6087| 2.7093| 2.1619| 3| 4|
+---------+-----+---+------+---+------------------+---------+------------------+-----+------+
Hope this helps!
This should do:
var imputeDF = df
df.dtypes.foreach { x =>
  val colName = x._1
  imputeDF = imputeDF.na.fill(df.agg(max(colName)).first()(0).toString, Seq(colName))
}
Note that it is not good practice to use mutable data types with Scala.
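A minimal sketch of the same per-column fill without the mutable var: fold over the columns, reusing the when/withColumn fill from the first answer above (column names taken from your data).
import org.apache.spark.sql.functions.{col, max, when}

val imputed = df.dtypes.map(_._1).foldLeft(df) { (acc, colName) =>
  // Backticks handle column names like 0MI that start with a digit.
  val fill = df.agg(max(col(s"`$colName`"))).first()(0).toString
  acc.withColumn(colName, when(col(s"`$colName`").isNull, fill).otherwise(col(s"`$colName`")))
}
imputed.show(false)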
Depending on your data, you can use a SQL join or something else to replace the nulls with a more suitable value.
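For example, if grouping by Classe makes sense for your data, here is an illustrative (hypothetical) sketch that fills Poids from its per-Classe mean via a join; the helper names poidsByClasse and poids_mean are made up for the example.
import org.apache.spark.sql.functions.{avg, coalesce, col}

val poidsByClasse = df.groupBy("Classe").agg(avg("Poids").as("poids_mean"))
val filled = df
  .join(poidsByClasse, Seq("Classe"), "left")
  .withColumn("Poids", coalesce(col("Poids"), col("poids_mean")))
  .drop("poids_mean")
filled.show(false)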