PySpark aggregation

I am trying to aggregate a PySpark dataframe. A sample looks like this:
+---+-------------------+
| id| struct|
+---+-------------------+
|id1| [foo, true, true]|
|id1| [foo, true, false]|
|id1|[foo, false, false]|
|id1| [bar, true, true]|
|id1| [bar, true, false]|
|id1|[bar, false, false]|
|id2| [foo, true, true]|
|id2|[foo, false, false]|
|id2| [bar, true, true]|
|id2|[bar, false, false]|
+---+-------------------+
The id column will have at most 1500 unique IDs, and struct.name will have 5 unique values.
Here is my code, which computes what I want:
from pyspark.sql.types import *
from shared.spark import start_spark
import pyspark.sql.functions as F

spark = start_spark('app')

schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]))])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]]
df = spark.createDataFrame(data, schema)

df.groupby('id') \
  .agg(F.count(F.when((df['struct.name'] == 'foo') &
                      (df['struct.param1']) &
                      (df['struct.param2']), 1)).alias('foo_cond1'),
       F.count(F.when((df['struct.name'] == 'foo') &
                      (df['struct.param1']) &
                      (df['struct.param2'] == False), 1)).alias('foo_cond2'),
       F.count(F.when((df['struct.name'] == 'foo') &
                      (df['struct.param1'] == False) &
                      (df['struct.param2'] == False), 1)).alias('foo_cond3'),
       F.count(F.when((df['struct.name'] == 'bar') &
                      (df['struct.param1']) &
                      (df['struct.param2']), 1)).alias('bar_cond1'),
       F.count(F.when((df['struct.name'] == 'bar') &
                      (df['struct.param1']) &
                      (df['struct.param2'] == False), 1)).alias('bar_cond2'),
       F.count(F.when((df['struct.name'] == 'bar') &
                      (df['struct.param1'] == False) &
                      (df['struct.param2'] == False), 1)).alias('bar_cond3')) \
  .withColumn('foo', F.struct(F.col('foo_cond1').alias('cond1'),
                              F.col('foo_cond2').alias('cond2'),
                              F.col('foo_cond3').alias('cond3'))) \
  .withColumn('bar', F.struct(F.col('bar_cond1').alias('cond1'),
                              F.col('bar_cond2').alias('cond2'),
                              F.col('bar_cond3').alias('cond3'))) \
  .select('id', 'foo', 'bar') \
  .show()
And the result is as follows:
+---+---------+---------+
| id| foo| bar|
+---+---------+---------+
|id1|[1, 1, 1]|[1, 1, 1]|
|id2|[1, 0, 1]|[1, 0, 1]|
+---+---------+---------+
Is there a better way to do such an aggregation that performs better with less code? Maybe using a pandas UDAF? I appreciate every comment. Thanks.

I was able to use PandasUDFType.GROUPED_MAP, but it seems that the running time increased by more than 30%. However, I only tested it on the sample data mentioned above.
from pyspark.sql.types import *
from shared.spark import start_spark
import pyspark.sql.functions as F
from pyspark.sql.functions import pandas_udf, PandasUDFType

spark = start_spark('app')

schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]))])
schema_udf = StructType(
    [StructField('id', StringType()),
     StructField('foo1', DoubleType()),
     StructField('foo2', DoubleType()),
     StructField('foo3', DoubleType()),
     StructField('bar1', DoubleType()),
     StructField('bar2', DoubleType()),
     StructField('bar3', DoubleType()),
     ])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]]
df = spark.createDataFrame(data, schema)
@pandas_udf(schema_udf, PandasUDFType.GROUPED_MAP)
def myGroupby(df_group):
    def countComb(df):
        def countCombinations(param1, param2):
            cond1, cond2, cond3 = 0, 0, 0
            if param1:
                if param2:
                    cond1 += 1
                else:
                    cond2 += 1
            else:
                cond3 += 1
            return cond1, cond2, cond3
        if df['name'] == 'foo':
            df['foo1'], df['foo2'], df['foo3'] = countCombinations(df.param1, df.param2)
        if df['name'] == 'bar':
            df['bar1'], df['bar2'], df['bar3'] = countCombinations(df.param1, df.param2)
        return df
    df_result = df_group.apply(countComb, axis=1)
    return df_result[['id', 'foo1', 'foo2', 'foo3', 'bar1', 'bar2', 'bar3']].groupby('id').sum().reset_index()


df \
    .select('id', 'struct.name', 'struct.param1', 'struct.param2') \
    .groupby("id") \
    .apply(myGroupby) \
    .withColumn('foo', F.struct(F.col('foo1').alias('cond1'),
                                F.col('foo2').alias('cond2'),
                                F.col('foo3').alias('cond3'))) \
    .withColumn('bar', F.struct(F.col('bar1').alias('cond1'),
                                F.col('bar2').alias('cond2'),
                                F.col('bar3').alias('cond3'))) \
    .select('id', 'foo', 'bar') \
    .show()
So what is the best practice for writing PySpark aggregations? If I want to aggregate over a lot of columns, is it better to write a lot of conditions (like in the original question) or to split the aggregation into smaller parts and then join the dataframes?
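One way to keep the single-pass aggregation from the original question while avoiding the hand-written repetition is to build the count expressions in a loop. A minimal sketch, assuming the same df and the same name/condition combinations as above (the helper names are only for illustration):

```python
import pyspark.sql.functions as F

names = ['foo', 'bar']
conds = {'cond1': (True, True), 'cond2': (True, False), 'cond3': (False, False)}

# one count(when(...)) expression per (name, condition) pair
aggs = [F.count(F.when((F.col('struct.name') == n) &
                       (F.col('struct.param1') == p1) &
                       (F.col('struct.param2') == p2), 1)).alias('{}_{}'.format(n, c))
        for n in names for c, (p1, p2) in conds.items()]

# repack the flat counts into one struct column per name
structs = [F.struct(*[F.col('{}_{}'.format(n, c)).alias(c) for c in conds]).alias(n)
           for n in names]

df.groupby('id').agg(*aggs).select('id', *structs).show()
```

The expressions are the same six count(when(...)) columns as in the hand-written version, just generated programmatically, so the output should match.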

How about this one?
Split the struct into separate columns and use a window to compute the counts.
# assumes an active SparkSession bound to `spark`
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pyspark.sql.functions as F
schema = StructType([StructField('id', StringType()),
                     StructField('struct', StructType(
                         [StructField('name', StringType()),
                          StructField('param1', BooleanType()),
                          StructField('param2', BooleanType()),
                          ]))])
data = [['id1', ['foo', True, True]],
        ['id1', ['foo', True, False]],
        ['id1', ['foo', False, False]],
        ['id1', ['bar', True, True]],
        ['id1', ['bar', True, False]],
        ['id1', ['bar', False, False]],
        ['id2', ['foo', True, True]],
        ['id2', ['foo', False, False]],
        ['id2', ['bar', True, True]],
        ['id2', ['bar', False, False]]]
df = spark.createDataFrame(data, schema)
df = df.withColumn('name', F.col('struct').getField('name'))
df = df.withColumn('param1', F.col('struct').getField('param1'))
df = df.withColumn('param2', F.col('struct').getField('param2'))
w = Window.partitionBy(['id', 'name'])
df = df.withColumn('c1', F.count(F.when((df['param1'] == True) & (df['param2'] == True), 1)).over(w))
df = df.withColumn('c2', F.count(F.when((df['param1'] == True) & (df['param2'] == False), 1)).over(w))
df = df.withColumn('c3', F.count(F.when((df['param1'] == False) & (df['param2'] == False), 1)).over(w))
df = df.withColumn('result', F.array(['c1', 'c2', 'c3']))
df.show()
+---+-------------------+----+------+------+---+---+---+---------+
| id| struct|name|param1|param2| c1| c2| c3| result|
+---+-------------------+----+------+------+---+---+---+---------+
|id2| [bar, true, true]| bar| true| true| 1| 0| 1|[1, 0, 1]|
|id2|[bar, false, false]| bar| false| false| 1| 0| 1|[1, 0, 1]|
|id1| [foo, true, true]| foo| true| true| 1| 1| 1|[1, 1, 1]|
|id1|[foo, false, false]| foo| false| false| 1| 1| 1|[1, 1, 1]|
|id1| [foo, true, false]| foo| true| false| 1| 1| 1|[1, 1, 1]|
|id1| [bar, true, true]| bar| true| true| 1| 1| 1|[1, 1, 1]|
|id1| [bar, true, false]| bar| true| false| 1| 1| 1|[1, 1, 1]|
|id1|[bar, false, false]| bar| false| false| 1| 1| 1|[1, 1, 1]|
|id2|[foo, false, false]| foo| false| false| 1| 0| 1|[1, 0, 1]|
|id2| [foo, true, true]| foo| true| true| 1| 0| 1|[1, 0, 1]|
+---+-------------------+----+------+------+---+---+---+---------+
We then use pivot:
df = df.groupby('id').pivot('name').agg(F.first('result'))
df.show()
+---+---------+---------+
| id| bar| foo|
+---+---------+---------+
|id1|[1, 1, 1]|[1, 1, 1]|
|id2|[1, 0, 1]|[1, 0, 1]|
+---+---------+---------+
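For what it's worth, a similar result could probably be obtained without the window at all, by aggregating on (id, name) first and then pivoting. A rough sketch, assuming df here refers to the frame right after name, param1 and param2 were split out (before the window columns were added):

```python
import pyspark.sql.functions as F

counts = (df
          .groupby('id', 'name')
          .agg(F.count(F.when(F.col('param1') & F.col('param2'), 1)).alias('c1'),
               F.count(F.when(F.col('param1') & ~F.col('param2'), 1)).alias('c2'),
               F.count(F.when(~F.col('param1') & ~F.col('param2'), 1)).alias('c3'))
          .withColumn('result', F.array('c1', 'c2', 'c3')))

counts.groupby('id').pivot('name').agg(F.first('result')).show()
```

This avoids computing the same counts once per input row and then deduplicating them with first().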

Related

filtering spark dataframe based on label changes in time series

The input dataframe has 4 columns: id (str), group (str), elapsed time in days (int), and label (int).
inp = spark.createDataFrame([
    ['1', "A", 23, 2],
    ['1', "A", 45, 2],
    ['1', "A", 73, 2],
    ['1', "A", 84, 3],
    ['1', "A", 95, 3],
    ['1', "A", 101, 2],
    ['1', "A", 105, 2],
    ['1', "B", 20, 1],
    ['1', "B", 40, 1],
    ['1', "B", 60, 2],
    ['2', "A", 10, 4],
    ['2', "A", 20, 4],
    ['2', "A", 30, 4]
], schema=["id", "grp", "elap", "lbl"])
For every (id, grp) I need the output frame to have the records with the first occurrence of a different label.
out = spark.createDataFrame([
    ['1', "A", 23, 2],
    ['1', "A", 84, 3],
    ['1', "A", 101, 2],
    ['1', "B", 20, 1],
    ['1', "B", 60, 2],
    ['2', "A", 10, 4],
], schema=["id", "grp", "elap", "lbl"])
The dataframe has a billion rows, and I am looking for an efficient way to do this.
Check whether the current label differs from the previous label (partitioned by id and grp):
from pyspark.sql.window import Window
import pyspark.sql.functions as f

inp.withColumn('prevLbl', f.lag('lbl').over(Window.partitionBy('id', 'grp').orderBy('elap'))) \
   .filter(f.col('prevLbl').isNull() | (f.col('prevLbl') != f.col('lbl'))) \
   .drop('prevLbl').show()
+---+---+----+---+
| id|grp|elap|lbl|
+---+---+----+---+
| 1| A| 23| 2|
| 1| A| 84| 3|
| 1| A| 101| 2|
| 1| B| 20| 1|
| 1| B| 60| 2|
| 2| A| 10| 4|
+---+---+----+---+

pyspark: Is it possible to create array with missing elements in one struct

My input DataFrame schema is like below. The difference between elements 1 and 2 in d is that 1 has the attributes a, b, c, d while 2 has only a, b, c.
root
|-- a: string (nullable = true)
|-- b: string (nullable = true)
|-- c: string (nullable = true)
|-- d: struct (nullable = true)
| |-- 1: struct (nullable = true)
| | |-- a: string (nullable = true)
| | |-- b: string (nullable = true)
| | |-- c: string (nullable = true)
| | |-- d: double (nullable = true)
| |-- 2: struct (nullable = true)
| | |-- a: string (nullable = true)
| | |-- b: string (nullable = true)
| | |-- c: string (nullable = true)
I am trying to explode the elements of d using the code below:
df2 = inputDF.withColumn("d1",f.explode(f.array("d.*").getField("c")))
and I am getting the error pyspark.sql.utils.AnalysisException: cannot resolve 'array(d.1, d.2)' due to data type mismatch: input to function array should all be the same type, but it's [struct<a:string,b:string,c:string,d:double>, struct<a:string,b:string,c:string>];
'Project [a#832, b#833, c#834, d#835, explode(array(d#835.1, d#835.2)[c]) AS d1#843]
+- Relation[a#832,b#833,c#834,d#835] json
Is there any way to instruct the function to assume NULLs when columns are missing in the input to the array function?
You can explode an array of structs where one of the elements is missing a field, as in your case, as follows:
import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, ArrayType, StructField, StringType

spark = SparkSession \
    .builder \
    .appName("SparkTesting") \
    .getOrCreate()

d_schema = ArrayType(StructType([
    StructField('a', StringType(), nullable=True),
    StructField('b', StringType(), nullable=True),
    StructField('c', StringType(), nullable=True),
    StructField('d', StringType(), nullable=True),
]))

df_schema = (StructType()
             .add("a", StringType(), nullable=True)
             .add("b", StringType(), nullable=True)
             .add("c", StringType(), nullable=True)
             .add("d", d_schema, nullable=True))

item1 = {
    "a": "a1",
    "b": "b1",
    "c": "c1",
    "d": [
        {
            "a": "a1",
            "b": "b1",
            "c": "c1",
            "d": "d1"
        },
        {
            "a": "a1",
            "b": "b1",
            "c": "c1",
        }
    ],
}

df = spark.createDataFrame([item1], schema=df_schema)
df.printSchema()
df.show(truncate=False)

df2 = df.withColumn("d1", f.explode(col("d")))
df2.printSchema()
df2.show(truncate=False)

df2.select("d1.c").show()
+---+---+---+--------------------------------------+------------------+
|a |b |c |d |d1 |
+---+---+---+--------------------------------------+------------------+
|a1 |b1 |c1 |[{a1, b1, c1, d1}, {a1, b1, c1, null}]|{a1, b1, c1, d1} |
|a1 |b1 |c1 |[{a1, b1, c1, d1}, {a1, b1, c1, null}]|{a1, b1, c1, null}|
+---+---+---+--------------------------------------+------------------+
In case you are not sure whether the array field d itself will be null, it is advisable to use the explode_outer() function instead of explode().
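A one-line sketch of that swap, reusing the df built above:

```python
# explode_outer keeps rows whose "d" array is null, producing a null struct in d1
df2 = df.withColumn("d1", f.explode_outer(col("d")))
```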
As per the comment, to match the schema the code below will work:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession \
    .builder \
    .appName("StructuredStreamTesting") \
    .getOrCreate()

d_inter_schema = StructType([
    StructField('a', StringType(), nullable=True),
    StructField('b', StringType(), nullable=True),
    StructField('c', StringType(), nullable=True),
    StructField('d', StringType(), nullable=True),
])

d_schema = StructType().add("1", d_inter_schema, nullable=True).add("2", d_inter_schema, nullable=True)

df_schema = (StructType()
             .add("a", StringType(), nullable=True)
             .add("b", StringType(), nullable=True)
             .add("c", StringType(), nullable=True)
             .add("d", d_schema, nullable=True))

item1 = {
    "a": "a1",
    "b": "b1",
    "c": "c1",
    "d": {
        "1": {
            "a": "a1",
            "b": "b1",
            "c": "c1",
            "d": "d1"
        },
        "2": {
            "a": "a1",
            "b": "b1",
            "c": "c1",
        }
    },
}

df = spark.createDataFrame([item1], schema=df_schema)
df.printSchema()
df.show(truncate=False)
+---+---+---+--------------------------------------+
|a |b |c |d |
+---+---+---+--------------------------------------+
|a1 |b1 |c1 |{{a1, b1, c1, d1}, {a1, b1, c1, null}}|
+---+---+---+--------------------------------------+
df.select("d.1.c", "d.2.c").show()
+---+---+
| c| c|
+---+---+
| c1| c1|
+---+---+
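Coming back to the original error, another possible workaround (a sketch only, not tested against your data) is to rebuild d.2 with an explicit null placeholder for the missing d field, so that both elements passed to array share the same struct type:

```python
import pyspark.sql.functions as f

# Pad "d.2" with a null double field "d" so its type matches "d.1";
# after that, the array(...).getField("c") + explode from the question resolves.
d2_padded = f.struct(
    f.col("d.2.a").alias("a"),
    f.col("d.2.b").alias("b"),
    f.col("d.2.c").alias("c"),
    f.lit(None).cast("double").alias("d"),
)
df2 = inputDF.withColumn("d1", f.explode(f.array(f.col("d.1"), d2_padded).getField("c")))
```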

What kind of schema will help parsing this type of json into Spark SQL in Scala?

I am interested in accessing the data attribute's values as rows, with each item inside a row assigned to the corresponding column name, as shown in the sample at the bottom of this question.
{
"meta": {
"a": {
"b": []
}
},
"data" : [ [ "row-r9pv-p86t.ifsp", "00000000-0000-0000-0838-60C2FFCC43AE", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOEY", "KINGS", "F", "11" ]
, [ "row-7v2v~88z5-44se", "00000000-0000-0000-C8FC-DDD3F9A72DFF", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOEY", "SUFFOLK", "F", "6" ]
, [ "row-hzc9-4kvv~mbc9", "00000000-0000-0000-562E-D9A0792557FC", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOEY", "MONROE", "F", "6" ]
, [ "row-3473_8cwy~3vez", "00000000-0000-0000-B19D-7B88FF2FB6A0", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOEY", "ERIE", "F", "9" ]
, [ "row-tyuh.nmy9.r2n3", "00000000-0000-0000-7D66-E7EC8F12BB8D", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "ULSTER", "F", "5" ]
, [ "row-ct48~ui69-2zsn", "00000000-0000-0000-7ECC-F350540A8F92", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "WESTCHESTER", "F", "24" ]
, [ "row-gdva~4v8k-vuwy", "00000000-0000-0000-30FB-CB5E36017AD5", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "BRONX", "F", "13" ]
, [ "row-gzu3~a7hk~bqym", "00000000-0000-0000-E380-AAAB1FA5C7A7", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "NEW YORK", "F", "55" ]
, [ "row-ekbw_tb7c.yvgp", "00000000-0000-0000-A7FF-8A4260B3A505", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "NASSAU", "F", "15" ]
, [ "row-zk7s-r2ma_t8mk", "00000000-0000-0000-3F7C-4DECA15E0F5B", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "ERIE", "F", "6" ]
, [ "row-ieja_864x~w2ki", "00000000-0000-0000-854E-D29D5B4D5636", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "SUFFOLK", "F", "14" ]
, [ "row-8fp4.rjtj.h46h", "00000000-0000-0000-C177-43F52BFECC07", 0, 1574264158, null, 1574264158, null, "{ }", "2007", "ZOE", "KINGS", "F", "34" ]
]
}
I tried the following, but it only gives null values for each row. Can you help me get each item in a row into a specific field, so that each value is assigned to the attribute named to the right of it below?
val schema = new StructType()
  .add(
    "data", new ArrayType(new StructType(), false), false
  )
val nestDF = spark.read.schema(schema).json("dbfs:/tmp/rows.json")
Here's the expected structure:
/* [
"row-r9pv-p86t.ifsp" <-- sid
"00000000-0000-0000-0838-60C2FFCC43AE" <-- id
0 <-- position
1574264158 <-- created_at
null <-- created_meta
1574264158 <-- updated_at
null <-- updated_meta
"{ }" <-- meta
"2007" <-- year of birth
"ZOEY" <-- child's first name
"KINGS" <-- county
"F" <-- gender
"11" <-- count
]
*/
Atharva, you can try this piece of code. I didn't cast the attributes to the expected datatypes, but that should be easy now :)
import sparkSession.implicits._
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val df = sparkSession.read.option("multiLine", true).json("src/main/resources/json.json")

val schema = StructType(Seq("sid", "id", "position", "created_at", "created_meta", "updated_at", "updated_meta",
  "meta", "yearOfBirth", "childsFirstName", "county", "gender", "count").map(c => StructField(c, StringType)))

val toStruct = udf({ seq: Seq[String] => Row.fromSeq(seq) }, schema)

val newDF = df.select(explode($"data").as("dataRow"))
  .select(toStruct($"dataRow").as("struct"))
  .select("struct.*")

newDF.printSchema()
root
|-- sid: string (nullable = true)
|-- id: string (nullable = true)
|-- position: string (nullable = true)
|-- created_at: string (nullable = true)
|-- created_meta: string (nullable = true)
|-- updated_at: string (nullable = true)
|-- updated_meta: string (nullable = true)
|-- meta: string (nullable = true)
|-- yearOfBirth: string (nullable = true)
|-- childsFirstName: string (nullable = true)
|-- county: string (nullable = true)
|-- gender: string (nullable = true)
|-- count: string (nullable = true)
newDF.show(false)
+------------------+------------------------------------+--------+----------+------------+----------+------------+----+-----------+---------------+-----------+------+-----+
|sid |id |position|created_at|created_meta|updated_at|updated_meta|meta|yearOfBirth|childsFirstName|county |gender|count|
+------------------+------------------------------------+--------+----------+------------+----------+------------+----+-----------+---------------+-----------+------+-----+
|row-r9pv-p86t.ifsp|00000000-0000-0000-0838-60C2FFCC43AE|0 |1574264158|null |1574264158|null |{ } |2007 |ZOEY |KINGS |F |11 |
|row-7v2v~88z5-44se|00000000-0000-0000-C8FC-DDD3F9A72DFF|0 |1574264158|null |1574264158|null |{ } |2007 |ZOEY |SUFFOLK |F |6 |
|row-hzc9-4kvv~mbc9|00000000-0000-0000-562E-D9A0792557FC|0 |1574264158|null |1574264158|null |{ } |2007 |ZOEY |MONROE |F |6 |
|row-3473_8cwy~3vez|00000000-0000-0000-B19D-7B88FF2FB6A0|0 |1574264158|null |1574264158|null |{ } |2007 |ZOEY |ERIE |F |9 |
|row-tyuh.nmy9.r2n3|00000000-0000-0000-7D66-E7EC8F12BB8D|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |ULSTER |F |5 |
|row-ct48~ui69-2zsn|00000000-0000-0000-7ECC-F350540A8F92|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |WESTCHESTER|F |24 |
|row-gdva~4v8k-vuwy|00000000-0000-0000-30FB-CB5E36017AD5|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |BRONX |F |13 |
|row-gzu3~a7hk~bqym|00000000-0000-0000-E380-AAAB1FA5C7A7|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |NEW YORK |F |55 |
|row-ekbw_tb7c.yvgp|00000000-0000-0000-A7FF-8A4260B3A505|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |NASSAU |F |15 |
|row-zk7s-r2ma_t8mk|00000000-0000-0000-3F7C-4DECA15E0F5B|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |ERIE |F |6 |
|row-ieja_864x~w2ki|00000000-0000-0000-854E-D29D5B4D5636|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |SUFFOLK |F |14 |
|row-8fp4.rjtj.h46h|00000000-0000-0000-C177-43F52BFECC07|0 |1574264158|null |1574264158|null |{ } |2007 |ZOE |KINGS |F |34 |
+------------------+------------------------------------+--------+----------+------------+----------+------------+----+-----------+---------------+-----------+------+-----+

How to create nested json using Apache Spark with Scala

I am trying to create nested JSON from my Spark dataframe, which has data in the following structure.
Vendor_Name,count,Categories,Category_Count,Subcategory,Subcategory_Count
Vendor1,10,Category 1,4,Sub Category 1,1
Vendor1,10,Category 1,4,Sub Category 2,2
Vendor1,10,Category 1,4,Sub Category 3,3
Vendor1,10,Category 1,4,Sub Category 4,4
Required JSON output in the below format, using Apache Spark with Scala:
[{
    "vendor_name": "Vendor 1",
    "count": 10,
    "categories": [{
        "name": "Category 1",
        "count": 4,
        "subCategories": [{
                "name": "Sub Category 1",
                "count": 1
            },
            {
                "name": "Sub Category 2",
                "count": 1
            },
            {
                "name": "Sub Category 3",
                "count": 1
            },
            {
                "name": "Sub Category 4",
                "count": 1
            }
        ]
    }]
}]
//read file into DataFrame
scala> val df = spark.read.format("csv").option("header", "true").load(<input CSV path>)
df: org.apache.spark.sql.DataFrame = [Vendor_Name: string, count: string ... 4 more fields]
scala> df.show(false)
+-----------+-----+----------+--------------+--------------+-----------------+
|Vendor_Name|count|Categories|Category_Count|Subcategory |Subcategory_Count|
+-----------+-----+----------+--------------+--------------+-----------------+
|Vendor1 |10 |Category 1|4 |Sub Category 1|1 |
|Vendor1 |10 |Category 1|4 |Sub Category 2|2 |
|Vendor1 |10 |Category 1|4 |Sub Category 3|3 |
|Vendor1 |10 |Category 1|4 |Sub Category 4|4 |
+-----------+-----+----------+--------------+--------------+-----------------+
//convert into the desired JSON format
scala> val df1 = df.groupBy("Vendor_Name","count","Categories","Category_Count").agg(collect_list(struct(col("Subcategory").alias("name"),col("Subcategory_Count").alias("count"))).alias("subCategories")).groupBy("Vendor_Name","count").agg(collect_list(struct(col("Categories").alias("name"),col("Category_Count").alias("count"),col("subCategories"))).alias("categories"))
df1: org.apache.spark.sql.DataFrame = [Vendor_Name: string, count: string ... 1 more field]
scala> df1.printSchema
root
|-- Vendor_Name: string (nullable = true)
|-- count: string (nullable = true)
|-- categories: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- name: string (nullable = true)
| | |-- count: string (nullable = true)
| | |-- subCategories: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- name: string (nullable = true)
| | | | |-- count: string (nullable = true)
//Write df in json format
scala> df1.write.format("json").mode("append").save(<output Path>)

replace null value in the column of a dataframe with the value in other dataframe wrt to id

I have two dataframes.
df1:
+---------------+-------------------+-----+------------------------+------------------------+---------+
|id |dt |speed|stats |lag_stat |lag_speed|
+---------------+-------------------+-----+------------------------+------------------------+---------+
|358899055773504|2018-07-31 18:38:36|0 |[9, -1, -1, 13, 0, 1, 0]|null |null |
|358899055773504|2018-07-31 18:58:34|0 |[9, 0, -1, 22, 0, 1, 0] |[9, -1, -1, 13, 0, 1, 0]|0 |
|358899055773505|2018-07-31 18:54:23|4 |[9, 0, 0, 22, 1, 1, 1] |null |null |
+---------------+-------------------+-----+------------------------+------------------------+---------+
df2:
+---------------+-------------------+-----+------------------------+
|id |dt |speed|stats |
+---------------+-------------------+-----+------------------------+
|358899055773504|2018-07-31 18:38:34|0 |[9, -1, -1, 13, 0, 1, 0]|
|358899055773505|2018-07-31 18:48:23|4 |[8, -1, 0, 22, 1, 1, 1] |
+---------------+-------------------+-----+------------------------+
I want to replace the null values in the columns lag_stat and lag_speed in df1 with the values of stats and speed from dataframe df2 for the same id.
The desired output looks like this:
+---------------+-------------------+-----+------------------------+------------------------+---------+
|             id|                 dt|speed|                   stats|                lag_stat|lag_speed|
+---------------+-------------------+-----+------------------------+------------------------+---------+
|358899055773504|2018-07-31 18:38:36|    0|[9, -1, -1, 13, 0, 1, 0]|[9, -1, -1, 13, 0, 1, 0]|        0|
|358899055773504|2018-07-31 18:58:34|    0| [9, 0, -1, 22, 0, 1, 0]|[9, -1, -1, 13, 0, 1, 0]|        0|
|358899055773505|2018-07-31 18:54:23|    4|  [9, 0, 0, 22, 1, 1, 1]| [8, -1, 0, 22, 1, 1, 1]|        4|
+---------------+-------------------+-----+------------------------+------------------------+---------+
One possible way could be to join the DFs and then apply some when functions on those columns.
For example, this:
val output = df1.join(df2, df1.col("id") === df2.col("id"))
  .select(df1.col("id"),
    df1.col("dt"),
    df1.col("speed"),
    df1.col("stats"),
    when(df1.col("lag_stat").isNull, df2.col("stats")).otherwise(df1.col("lag_stat")).alias("lag_stats"),
    when(df1.col("lag_speed").isNull, df2.col("speed")).otherwise(df1.col("lag_speed")).alias("lag_speed")
  )
will give you the expected output:
+---------------+-------------------+-----+------------------------+------------------------+---------+
|             id|                 dt|speed|                   stats|               lag_stats|lag_speed|
+---------------+-------------------+-----+------------------------+------------------------+---------+
|358899055773504|2018-07-31 18:38:36|    0|[9, -1, -1, 13, 0, 1, 0]|[9, -1, -1, 13, 0, 1, 0]|        0|
|358899055773504|2018-07-31 18:58:34|    0| [9, 0, -1, 22, 0, 1, 0]|[9, -1, -1, 13, 0, 1, 0]|        0|
|358899055773505|2018-07-31 18:54:23|    4|  [9, 0, 0, 22, 1, 1, 1]| [8, -1, 0, 22, 1, 1, 1]|        4|
+---------------+-------------------+-----+------------------------+------------------------+---------+
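The same null filling can also be expressed a bit more compactly with coalesce, which returns its first non-null argument. A PySpark sketch of the idea, assuming the column names above:

```python
import pyspark.sql.functions as F

output = (df1.join(df2, df1["id"] == df2["id"])
          .select(df1["id"], df1["dt"], df1["speed"], df1["stats"],
                  # fall back to df2's stats/speed only where df1's lag columns are null
                  F.coalesce(df1["lag_stat"], df2["stats"]).alias("lag_stats"),
                  F.coalesce(df1["lag_speed"], df2["speed"]).alias("lag_speed")))
output.show(truncate=False)
```

Here coalesce(a, b) behaves the same as when(a.isNull, b).otherwise(a).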