error while using spark UserDefinedType - scala

I have a problem using UserDefinedType in Spark. I want to define a class and then build a schema for it so the data can be written into Parquet files for storage. But I ran into a problem, and I don't know what in my code causes it. The exception is thrown from DataType.scala with the message "Unsupported dataType ......"
The code is:
@SQLUserDefinedType(udt = classOf[NodeDataType])
class Node(val dst: Int, val sim: Int) extends Serializable
//{
//  def compare(that: Node): Int = that.sim.compareTo(sim)
//  override def toString = dst.toString + "," + sim.toString
//  override def hashCode(): Int = this.toString.hashCode
//  override def equals(other: Any): Boolean = this.toString.equals(other.toString)
//}

class NodeDataType extends UserDefinedType[Node] {
  override def sqlType: DataType = StructType(Seq(
    StructField("dst", IntegerType, nullable = true),
    StructField("sim", IntegerType, nullable = true)
  ))

  override def serialize(obj: Any): GenericArrayData = {
    obj match {
      case p: Node =>
        println("serialize Node")
        val output = new Array[Any](2)
        output(0) = p.dst
        output(1) = p.sim
        new GenericArrayData(output)
    }
  }

  override def deserialize(datum: Any): Node = {
    datum match {
      case values: ArrayData =>
        println("deserialize Node")
        new Node(values.getInt(0), values.getInt(1))
    }
  }

  override def userClass: Class[Node] = classOf[Node]

  override def hashCode(): Int = 1

  override def equals(other: Any): Boolean = {
    other match {
      case that: NodeDataType => true
      case _ => false
    }
  }
}

case object NodeDataType extends NodeDataType

object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName(this.getClass.getName)
    val sc = new SparkContext(conf)

    def buildNgbSchema(row: Row): StructType = {
      val schema = StructType(Seq(
        StructField("src", IntegerType, true),
        StructField("neighbours", NodeDataType)
      ))
      println("schema: " + schema)
      schema
    }

    val s1 = Seq(
      (1, new Node(1, 1)),
      (2, new Node(2, 2))
    )
    val rdd1 = sc.makeRDD(s1)
    val rows = rdd1.map(t => Row(t._1, t._2))
    val schema = buildNgbSchema(rows.take(1)(0))
    val sqlContext = new SQLContext(sc)
    CommonUtility.deletePath("./test") // just delete the path
    sqlContext.createDataFrame(rows, schema).write.parquet("./test")
  }
}
When I run this code, the following error occurs:
Caused by: java.lang.IllegalArgumentException: Unsupported dataType: {"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}, [1.1] failure: `TimestampType' expected but `{' found
{"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}
^
at org.apache.spark.sql.types.DataType$CaseClassStringParser$.apply(DataType.scala:245)
at org.apache.spark.sql.types.DataType$.fromCaseClassString(DataType.scala:102)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
at scala.util.Try.getOrElse(Try.scala:77)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$.convertFromString(ParquetTypesConverter.scala:62)
at org.apache.spark.sql.execution.datasources.parquet.RowWriteSupport.init(ParquetTableSupport.scala:51)
at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:288)
at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:262)
at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
......
Has anyone else run into this problem? Or is there something wrong with what I did?
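One way I can see the failure outside of the Parquet writer (a diagnostic sketch only, not a fix) is to re-parse the schema string myself: the Try.getOrElse frame in the stack trace suggests the writer first re-parses the schema from its JSON form and only then falls back to the legacy parser that prints "Unsupported dataType".

// Diagnostic only: round-trip the schema through its JSON representation.
val schema = buildNgbSchema(rows.take(1)(0))
println(schema.json) // the same JSON string that appears in the error message
org.apache.spark.sql.types.DataType.fromJson(schema.json) // surfaces the parse error directly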

Related

How to port UDAF to Aggregator?

I have a DF looking like this:
time,channel,value
0,foo,5
0,bar,23
100,foo,42
...
I want a DF like this:
time,foo,bar
0,5,23
100,42,...
In Spark 2, I did it with a UDAF like this:
case class ColumnBuilderUDAF(channels: Seq[String]) extends UserDefinedAggregateFunction {
  @transient lazy val inputSchema: StructType = StructType {
    StructField("channel", StringType, nullable = false) ::
      StructField("value", DoubleType, nullable = false) ::
      Nil
  }

  @transient lazy val bufferSchema: StructType = StructType {
    channels
      .toList
      .indices
      .map(i => StructField("c%d".format(i), DoubleType, nullable = false))
  }

  @transient lazy val dataType: DataType = bufferSchema
  @transient lazy val deterministic: Boolean = false

  def initialize(buffer: MutableAggregationBuffer): Unit =
    channels.indices.foreach(buffer(_) = NaN)

  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val channel = input.getAs[String](0)
    val p = channels.indexOf(channel)
    if (p >= 0 && p < channels.length) {
      val v = input.getAs[Double](1)
      if (!v.isNaN) {
        buffer(p) = v
      }
    }
  }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    channels
      .indices
      .foreach { i =>
        val v2 = buffer2.getAs[Double](i)
        if ((!v2.isNaN) && buffer1.getAs[Double](i).isNaN) {
          buffer1(i) = v2
        }
      }

  def evaluate(buffer: Row): Any =
    new GenericRowWithSchema(channels.indices.map(buffer.getAs[Double]).toArray, dataType.asInstanceOf[StructType])
}
which I use like this:
val cb = ColumnBuilderUDAF(Seq("foo", "bar"))
val dfColumnar = df.groupBy($"time").agg(cb($"channel", $"value") as "c")
and then, I rename c.c0, c.c1 etc. to foo, bar etc.
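The rename step looks roughly like this (a sketch assuming the two channels above; c0 and c1 are the field names the UDAF gives the buffer struct):

val renamed = dfColumnar.select(
  $"time",
  $"c.c0" as "foo", // c0 corresponds to channels(0)
  $"c.c1" as "bar"  // c1 corresponds to channels(1)
)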
In Spark 3, UDAF is deprecated and Aggregator should be used instead. So I began to port it like this:
case class ColumnBuilder(channels: Seq[String]) extends Aggregator[(String, Double), Array[Double], Row] {
  lazy val bufferEncoder: Encoder[Array[Double]] = Encoders.javaSerialization[Array[Double]]

  lazy val zero: Array[Double] = channels.map(_ => Double.NaN).toArray

  def reduce(b: Array[Double], a: (String, Double)): Array[Double] = {
    val index = channels.indexOf(a._1)
    if (index >= 0 && !a._2.isNaN) b(index) = a._2
    b
  }

  def merge(b1: Array[Double], b2: Array[Double]): Array[Double] = {
    (0 until b1.length.min(b2.length)).foreach(i => if (b1(i).isNaN) b1(i) = b2(i))
    b1
  }

  def finish(reduction: Array[Double]): Row =
    new GenericRowWithSchema(reduction.map(x => x: Any), outputEncoder.schema)

  def outputEncoder: Encoder[Row] = ??? // what goes here?
}
I don't know how to implement the Encoder[Row] as Spark does not have a pre-defined one. If I simply do a straightforward approach like this:
val outputEncoder: Encoder[Row] = new Encoder[Row] {
  val schema: StructType = StructType(channels.map(StructField(_, DoubleType, nullable = false)))
  val clsTag: ClassTag[Row] = classTag[Row]
}
I get a ClassCastException because outputEncoder actually has to be an ExpressionEncoder.
So, how do I implement this correctly? Or do I still have to use the deprecated UDAF?
You can do it with groupBy and pivot:
import spark.implicits._
import org.apache.spark.sql.functions._
val df = Seq(
  (0, "foo", 5),
  (0, "bar", 23),
  (100, "foo", 42)
).toDF("time", "channel", "value")

df.groupBy("time")
  .pivot("channel")
  .agg(first("value"))
  .show(false)
Output:
+----+----+---+
|time|bar |foo|
+----+----+---+
|100 |null|42 |
|0 |23 |5 |
+----+----+---+
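If the set of channels is known in advance (as it is for the UDAF, which takes channels as a parameter), the pivot values can also be passed explicitly. This fixes the output column order and avoids the extra job Spark runs to discover the distinct values. A small variation of the snippet above:

df.groupBy("time")
  .pivot("channel", Seq("foo", "bar")) // explicit pivot values: skips the distinct-values scan
  .agg(first("value"))
  .show(false)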

Spark sessionization using data frames

I want to do clickstream sessionization on a Spark data frame. Let's say I have loaded a data frame which has events from multiple sessions with the following schema -
And I want to aggregate (stitch) the sessions, like this -
I have explored UDAFs and window functions but could not understand how I can use them for this specific use case. I know that partitioning the data by session id puts an entire session's data in a single partition, but how do I aggregate it?
The idea is to aggregate all the events specific to each session as a single output record.
You can use collect_set:
def process(implicit spark: SparkSession) = {
  import org.apache.spark.sql.Row
  import org.apache.spark.sql.functions.collect_set
  import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

  val seq = Seq(Row(1, 1, "startTime=1549270909"), Row(1, 1, "endTime=1549270913"))
  val rdd = spark.sparkContext.parallelize(seq)
  val df1 = spark.createDataFrame(rdd, StructType(List(
    StructField("sessionId", IntegerType, false),
    StructField("userId", IntegerType, false),
    StructField("session", StringType, false)
  )))
  df1.groupBy("sessionId").agg(collect_set("session"))
}
That gives you:
+---------+------------------------------------------+
|sessionId|collect_set(session) |
+---------+------------------------------------------+
|1 |[startTime=1549270909, endTime=1549270913]|
+---------+------------------------------------------+
as output.
If you need more complex logic, it can be included in a UDAF like the following:
class YourComplexLogicStrings extends UserDefinedAggregateFunction {
  override def inputSchema: StructType = StructType(StructField("input", StringType) :: Nil)
  override def bufferSchema: StructType = StructType(StructField("pair", StringType) :: Nil)
  override def dataType: DataType = StringType
  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = buffer(0) = ""

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val b = buffer.getAs[String](0)
    val i = input.getAs[String](0)
    buffer(0) = { if (b.isEmpty) b + i else b + " + " + i }
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val b1 = buffer1.getAs[String](0)
    val b2 = buffer2.getAs[String](0)
    if (!b1.isEmpty)
      buffer1(0) = (b1) ++ "," ++ (b2)
    else
      buffer1(0) = b2
  }

  override def evaluate(buffer: Row): Any = {
    val yourString = buffer.getAs[String](0)
    // Compute your logic and return another String
    yourString
  }
}

def process0(implicit spark: SparkSession) = {
  import org.apache.spark.sql.functions.col

  val agg0 = new YourComplexLogicStrings()
  val seq = Seq(Row(1, 1, "startTime=1549270909"), Row(1, 1, "endTime=1549270913"))
  val rdd = spark.sparkContext.parallelize(seq)
  val df1 = spark.createDataFrame(rdd, StructType(List(
    StructField("sessionId", IntegerType, false),
    StructField("userId", IntegerType, false),
    StructField("session", StringType, false)
  )))
  df1.groupBy("sessionId").agg(agg0(col("session")))
}
It gives:
+---------+---------------------------------------+
|sessionId|yourcomplexlogicstrings(session) |
+---------+---------------------------------------+
|1 |startTime=1549270909,endTime=1549270913|
+---------+---------------------------------------+
Note that you could include fairly complex logic using Spark SQL functions directly if you want to avoid UDAFs.
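For instance, the plain concatenation done by the UDAF above could be written with built-in functions only (a sketch; collect_list and concat_ws are standard Spark SQL functions, and df1 is the data frame from the earlier snippet):

import org.apache.spark.sql.functions.{collect_list, concat_ws}

// Collect all session strings per sessionId and join them with a comma,
// without any user-defined aggregate.
df1.groupBy("sessionId")
  .agg(concat_ws(",", collect_list("session")) as "session")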

Calculate a mode for multiple columns

I would like to calculate the mode for multiple columns at the same time in Spark and use the calculated values to impute missing values in a DataFrame. I found how to calculate e.g. a mean, but I think a mode is more complex.
Here is a mean calculation:
val multiple_mean = df.na.fill(df.columns.zip(
  df.select(intVars.map(mean(_)): _*).first.toSeq
).toMap)
I am able to calculate a mode in a brute-force way:
var list = ArrayBuffer.empty[Float]
for (column <- df.columns) {
  list += df.select(column).groupBy(col(column)).count()
    .orderBy(desc("count")).first.toSeq(0).asInstanceOf[Float]
}
val multiple_mode = df.na.fill(df.columns.zip(list.toSeq).toMap)
Which way would be best if performance is a concern?
Thank you for any help.
You could use a UserDefinedAggregateFunction. The code below was tested in Spark 1.6.2.
First create a class which extends UserDefinedAggregateFunction.
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class ModeUDAF extends UserDefinedAggregateFunction {
  override def dataType: DataType = StringType
  override def inputSchema: StructType = new StructType().add("input", StringType)
  override def deterministic: Boolean = true
  override def bufferSchema: StructType = new StructType().add("mode", MapType(StringType, LongType))

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = Map.empty[Any, Long]
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val buff0 = buffer.getMap[Any, Long](0)
    val inp = input.get(0)
    buffer(0) = buff0.updated(inp, buff0.getOrElse(inp, 0L) + 1L)
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val mp1 = buffer1.getMap[Any, Long](0)
    val mp2 = buffer2.getMap[Any, Long](0)
    buffer1(0) = mp1 ++ mp2.map { case (k, v) => k -> (v + mp1.getOrElse(k, 0L)) }
  }

  override def evaluate(buffer: Row): Any = {
    lazy val st = buffer.getMap[Any, Long](0).toStream
    val mode = st.foldLeft(st.head) { case (e, s) => if (s._2 > e._2) s else e }
    mode._1
  }
}
Afterwards you can use it with your dataframe in the following manner:
val modeColumnList = List("some", "column", "names") // or df.columns.toList
val modeAgg = new ModeUDAF()
val aggCols = modeColumnList.map(c => modeAgg(df(c)))
val aggregatedModeDF = df.agg(aggCols.head, aggCols.tail: _*)
aggregatedModeDF.show()
Also, you can use .collect on the final dataframe to collect the result into a Scala data structure.
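To tie this back to the imputation in the question, the collected modes could then be passed to na.fill, roughly like this (a sketch; it assumes the aggregated columns come back in the order of modeColumnList and that those columns hold strings, matching the UDAF's StringType result):

// One row comes back: one mode per aggregated column, in the order of aggCols.
val modeRow = aggregatedModeDF.first()
val modeMap = modeColumnList.zipWithIndex
  .map { case (c, i) => c -> modeRow.getString(i) }
  .toMap
// Impute missing values with the per-column modes.
val imputedDF = df.na.fill(modeMap)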
Note: The performance of this solution depends on the cardinality of the input column.

HashMap UserDefinedType giving cast exception in Spark 1.6.2 while implementing UDAF

I am trying to use a custom HashMap implementation as a UserDefinedType instead of MapType in Spark. The code works fine in Spark 1.5.2, but throws a java.lang.ClassCastException: scala.collection.immutable.HashMap$HashMap1 cannot be cast to org.apache.spark.sql.catalyst.util.MapData exception in Spark 1.6.2.
The code:
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import scala.collection.immutable.HashMap

class Test extends UserDefinedAggregateFunction {
  def inputSchema: StructType =
    StructType(Array(StructField("input", StringType)))

  def bufferSchema = StructType(Array(StructField("top_n", CustomHashMapType)))

  def dataType: DataType = CustomHashMapType

  def deterministic = true

  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = HashMap.empty[String, Long]
  }

  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val buff0 = buffer.getAs[HashMap[String, Long]](0)
    buffer(0) = buff0.updated("test", buff0.getOrElse("test", 0L) + 1L)
  }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1
      .getAs[HashMap[String, Long]](0)
      .merged(buffer2.getAs[HashMap[String, Long]](0))({ case ((k, v1), (_, v2)) => (k, v1 + v2) })
  }

  def evaluate(buffer: Row): Any = {
    buffer(0)
  }
}

private case object CustomHashMapType extends UserDefinedType[HashMap[String, Long]] {
  override def sqlType: DataType = MapType(StringType, LongType)

  override def serialize(obj: Any): Map[String, Long] =
    obj.asInstanceOf[Map[String, Long]]

  override def deserialize(datum: Any): HashMap[String, Long] = {
    datum.asInstanceOf[Map[String, Long]] ++: HashMap.empty[String, Long]
  }

  override def userClass: Class[HashMap[String, Long]] = classOf[HashMap[String, Long]]
}
The wrapper class to run the UDAF:
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object TestJob {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[4]").setAppName("DataStatsExecution")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val df = sc.parallelize(Seq(1, 2, 3, 4)).toDF("col")
    val udaf = new Test()
    val outdf = df.agg(udaf(df("col")))
    outdf.show
  }
}
When I run the above code in Spark 1.6.2, I get the following exception:
Caused by: java.lang.ClassCastException: scala.collection.immutable.HashMap$HashMap1 cannot be cast to org.apache.spark.sql.catalyst.util.MapData
at org.apache.spark.sql.catalyst.expressions.BaseGenericInternalRow$class.getMap(rows.scala:50)
at org.apache.spark.sql.catalyst.expressions.GenericMutableRow.getMap(rows.scala:248)
at org.apache.spark.sql.catalyst.expressions.JoinedRow.getMap(JoinedRow.scala:115)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificMutableProjection.apply(Unknown Source)
at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$31.apply(AggregationIterator.scala:345)
at org.apache.spark.sql.execution.aggregate.AggregationIterator$$anonfun$31.apply(AggregationIterator.scala:344)
at org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:154)
at org.apache.spark.sql.execution.aggregate.SortBasedAggregationIterator.next(SortBasedAggregationIterator.scala:29)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:149)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I have found that the HashMap implementation is much faster than the available Spark MapType implementation. Are there any changes I can make to run this code in Spark 1.6.2, or is there a possible alternative?

Unable to parse JSON with GSON in Scala

I am using Gson to parse JSON into Scala objects, but an error occurs. The code is as follows; the problem seems to be with Gson.fromJson(String json, java.lang.reflect.Type type).
import java.lang.reflect.Type
import com.google.gson.GsonBuilder
import com.google.gson.reflect.TypeToken
import scala.beans.BeanProperty

object GsonUtils {
  val GSON = new GsonBuilder().create()

  def java2Json(obj: Object) = GSON.toJson(obj)

  def json2Java[T](json: String, tyze: Type) = GSON.fromJson(json, tyze)
}

case class Data(@BeanProperty val name: String, @BeanProperty val age: Int)

object GsonUtilsTest {
  def main(args: Array[String]) {
    val d = Data("1", 1)
    val json = GsonUtils.java2Json(d)
    println(json)
    // ERROR
    val d2 = GsonUtils.json2Java(json, classOf[Data]).asInstanceOf[Data]

    val dats = new java.util.ArrayList[Data]()
    dats.add(Data("1", 1))
    val json2 = GsonUtils.java2Json(dats)
    val tyze = new TypeToken[java.util.List[Data]]() {}.getType()
    // ERROR
    GsonUtils.json2Java(json2, tyze)
  }
}
When I run it, an exception is thrown:
Exception in thread "main" java.lang.ClassCastException: com.xyz.Data incompatible with scala.runtime.Nothing$
at java.lang.ClassCastException.<init>(ClassCastException.java:58)
at com.xyz.GsonUtils$.json2Java(GsonUtils.scala:18)
at com.xyz.GsonUtilsTest$.main(GsonUtils.scala:30)
at com.xyz.GsonUtilsTest.main(GsonUtils.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:88)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:55)
at java.lang.reflect.Method.invoke(Method.java:613)
at com.intellij.rt.execution.application.AppMain.main(AppMain.java:144)
Use fromJson(String json, Class<T> classOfT) instead of fromJson(String json, Type typeOfT):
import java.util
import com.google.gson.GsonBuilder

object GsonUtils {
  val GSON = new GsonBuilder().create()

  def java2Json(obj: Object) = GSON.toJson(obj)

  // def json2Java[T](json: String, tyze: Type) = GSON.fromJson(json, tyze)
  def json2Java[T](json: String, tyze: Class[T]) = GSON.fromJson(json, tyze)
}

case class Data(val name: String, val age: Int)

object GsonUtilsTest {
  def main(args: Array[String]) {
    val d = Data("1", 1)
    val json = GsonUtils.java2Json(d)
    println(json)
    val obj: Data = GsonUtils.json2Java(json, classOf[Data])
    println(s"${obj}")

    val d2: util.ArrayList[Data] = new util.ArrayList[Data]
    d2.add(Data("1", 1))
    d2.add(Data("2", 2))
    val json2 = GsonUtils.java2Json(d2)
    println(json2)
    val dataclass = classOf[util.ArrayList[Data]]
    val obj2: util.ArrayList[Data] = GsonUtils.json2Java(json2, dataclass)
    println(s"${obj2}")
  }
}
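A side note on why the original version failed (my reading, not part of the fix above): with def json2Java[T](json: String, tyze: Type), nothing constrains T, so Scala infers Nothing and the cast of the parsed Data fails at the call site. If the Type-based overload is still needed, for example to preserve generic element types through erasure with a TypeToken, giving T explicitly should also work; a sketch assuming the GsonUtils above (GSON is a public val, so fromJson can be called on it directly):

import com.google.gson.reflect.TypeToken

// Hypothetical variant: keep fromJson(String, Type) but pin T explicitly so it
// is not inferred as Nothing.
val listType = new TypeToken[java.util.List[Data]]() {}.getType()
val parsed = GsonUtils.GSON.fromJson[java.util.List[Data]](json2, listType)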