Scala Spark aggregator NotSerializableException

I have a Dataset with id and sub_id columns. I want to use an Aggregator to group the sub_ids per id into a set.
When I execute the code below in Databricks:
%scala
import org.apache.spark.sql.{DataFrame, Dataset, Encoder}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

case class SubIdPerId(id: Int, sub_id: String)
case class SubIdsPerId(id: Int, sub_ids: Set[String])

object PreprocessData {
  def createSubIdsArray(df: Dataset[SubIdPerId]): Dataset[SubIdsPerId] = {
    val transformedData: Dataset[SubIdsPerId] =
      df.groupByKey(_.id)
        .agg(
          PreprocessData.distinctSubIdsAggregator.toColumn.name("sub_ids")
        ).map {
          case (id: Int, si: Set[String]) => SubIdsPerId(id, si)
        }
    transformedData
  }

  val distinctSubIdsAggregator: Aggregator[SubIdPerId, Set[String], Set[String]] =
    new Aggregator[SubIdPerId, Set[String], Set[String]] {
      override def zero: Set[String] = Set[String]()
      override def reduce(es: Set[String], supi: SubIdPerId): Set[String] =
        es + supi.sub_id
      override def merge(wx: Set[String], wy: Set[String]): Set[String] =
        wx.union(wy)
      override def finish(reduction: Set[String]): Set[String] = reduction
      override def bufferEncoder: Encoder[Set[String]] =
        implicitly(ExpressionEncoder[Set[String]])
      override def outputEncoder: Encoder[Set[String]] =
        implicitly(ExpressionEncoder[Set[String]])
    }
}

val df_t = Seq(
  (1, "zk67"),
  (1, "gg89"),
  (2, "gg97"),
  (2, "gd01"),
  (2, "af83"),
  (3, "af84"),
  (3, "gd77"),
  (3, "gd73"),
  (3, "cl55"),
  (3, "zk67")
).toDF("id", "sub_id")

val ds_t = df_t.as[SubIdPerId]
val transformedData = PreprocessData.createSubIdsArray(ds_t)
display(transformedData)
I receive:
NotSerializableException: $line813d300e73a74f3e9ab1e2bfe2afe2de59.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$PreprocessData$
Additional note:
When I execute this code without wrapping it into an object, it works:
ds_t.groupByKey(_.id).agg(distinctSubIdsAggregator.toColumn.name("sub_ids"))
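One common workaround is to make the wrapping object serializable, so the anonymous Aggregator (which keeps a reference to its enclosing object) can be shipped with the closure. A minimal sketch, assuming the case classes and the rest of the code stay as in the question; only the object declaration changes:

import org.apache.spark.sql.{Dataset, Encoder}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

// Sketch: extending Serializable lets Spark serialize PreprocessData when the
// aggregator (and the map closure defined inside it) is sent to executors.
object PreprocessData extends Serializable {
  val distinctSubIdsAggregator: Aggregator[SubIdPerId, Set[String], Set[String]] =
    new Aggregator[SubIdPerId, Set[String], Set[String]] {
      override def zero: Set[String] = Set.empty[String]
      override def reduce(es: Set[String], supi: SubIdPerId): Set[String] = es + supi.sub_id
      override def merge(wx: Set[String], wy: Set[String]): Set[String] = wx.union(wy)
      override def finish(reduction: Set[String]): Set[String] = reduction
      override def bufferEncoder: Encoder[Set[String]] = ExpressionEncoder[Set[String]]()
      override def outputEncoder: Encoder[Set[String]] = ExpressionEncoder[Set[String]]()
    }
}

Alternatively, as the note above shows, keeping the aggregator outside of any wrapping object avoids serializing the object in the first place.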

Related

How to port UDAF to Aggregator?

I have a DF looking like this:
time,channel,value
0,foo,5
0,bar,23
100,foo,42
...
I want a DF like this:
time,foo,bar
0,5,23
100,42,...
In Spark 2, I did it with a UDAF like this:
case class ColumnBuilderUDAF(channels: Seq[String]) extends UserDefinedAggregateFunction {
  @transient lazy val inputSchema: StructType = StructType {
    StructField("channel", StringType, nullable = false) ::
      StructField("value", DoubleType, nullable = false) ::
      Nil
  }

  @transient lazy val bufferSchema: StructType = StructType {
    channels
      .toList
      .indices
      .map(i => StructField("c%d".format(i), DoubleType, nullable = false))
  }

  @transient lazy val dataType: DataType = bufferSchema
  @transient lazy val deterministic: Boolean = false

  def initialize(buffer: MutableAggregationBuffer): Unit =
    channels.indices.foreach(buffer(_) = Double.NaN)

  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val channel = input.getAs[String](0)
    val p = channels.indexOf(channel)
    if (p >= 0 && p < channels.length) {
      val v = input.getAs[Double](1)
      if (!v.isNaN) {
        buffer(p) = v
      }
    }
  }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    channels
      .indices
      .foreach { i =>
        val v2 = buffer2.getAs[Double](i)
        if ((!v2.isNaN) && buffer1.getAs[Double](i).isNaN) {
          buffer1(i) = v2
        }
      }

  def evaluate(buffer: Row): Any =
    new GenericRowWithSchema(channels.indices.map(buffer.getAs[Double]).toArray, dataType.asInstanceOf[StructType])
}
which I use like this:
val cb = ColumnBuilderUDAF(Seq("foo", "bar"))
val dfColumnar = df.groupBy($"time").agg(cb($"channel", $"value") as "c")
and then, I rename c.c0, c.c1 etc. to foo, bar etc.
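For reference, a minimal sketch of that renaming step (assuming a SparkSession named spark and the struct column c with fields c0 and c1 produced by the UDAF above):

import spark.implicits._

// Flatten the struct column produced by the UDAF into named columns.
val dfRenamed = dfColumnar.select(
  $"time",
  $"c.c0".as("foo"),
  $"c.c1".as("bar")
)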
In Spark 3, UserDefinedAggregateFunction is deprecated and Aggregator should be used instead. So I began to port it like this:
case class ColumnBuilder(channels: Seq[String]) extends Aggregator[(String, Double), Array[Double], Row] {
  lazy val bufferEncoder: Encoder[Array[Double]] = Encoders.javaSerialization[Array[Double]]
  lazy val zero: Array[Double] = channels.map(_ => Double.NaN).toArray

  def reduce(b: Array[Double], a: (String, Double)): Array[Double] = {
    val index = channels.indexOf(a._1)
    if (index >= 0 && !a._2.isNaN) b(index) = a._2
    b
  }

  def merge(b1: Array[Double], b2: Array[Double]): Array[Double] = {
    (0 until b1.length.min(b2.length)).foreach(i => if (b1(i).isNaN) b1(i) = b2(i))
    b1
  }

  def finish(reduction: Array[Double]): Row =
    new GenericRowWithSchema(reduction.map(x => x: Any), outputEncoder.schema)

  def outputEncoder: Encoder[Row] = ??? // what goes here?
}
I don't know how to implement the Encoder[Row] as Spark does not have a pre-defined one. If I simply do a straightforward approach like this:
val outputEncoder: Encoder[Row] = new Encoder[Row] {
  val schema: StructType = StructType(channels.map(StructField(_, DoubleType, nullable = false)))
  val clsTag: ClassTag[Row] = classTag[Row]
}
I get a ClassCastException because outputEncoder actually has to be ExpressionEncoder.
So, how do I implement this correctly? Or do I still have to use the deprecated UDAF?
You can do it using groupBy and pivot:
import spark.implicits._
import org.apache.spark.sql.functions._
val df = Seq(
  (0, "foo", 5),
  (0, "bar", 23),
  (100, "foo", 42)
).toDF("time", "channel", "value")

df.groupBy("time")
  .pivot("channel")
  .agg(first("value"))
  .show(false)
Output:
+----+----+---+
|time|bar |foo|
+----+----+---+
|100 |null|42 |
|0 |23 |5 |
+----+----+---+
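If you would still rather keep an Aggregator-based port instead of pivot, one possible workaround (a sketch, not a drop-in replacement for the code in the question) is to avoid Encoder[Row] altogether by emitting a Seq[Double] with one slot per channel, in channel order, and renaming afterwards; ExpressionEncoder can derive both the buffer and the output encoders:

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

// Sketch: same reduce/merge logic as in the question, but the output type is
// Seq[Double] instead of Row, so no Encoder[Row] is needed.
case class ColumnBuilder(channels: Seq[String])
    extends Aggregator[(String, Double), Array[Double], Seq[Double]] {
  def zero: Array[Double] = Array.fill(channels.length)(Double.NaN)

  def reduce(b: Array[Double], a: (String, Double)): Array[Double] = {
    val i = channels.indexOf(a._1)
    if (i >= 0 && !a._2.isNaN) b(i) = a._2
    b
  }

  def merge(b1: Array[Double], b2: Array[Double]): Array[Double] = {
    b1.indices.foreach(i => if (b1(i).isNaN) b1(i) = b2(i))
    b1
  }

  def finish(reduction: Array[Double]): Seq[Double] = reduction.toSeq

  def bufferEncoder: Encoder[Array[Double]] = ExpressionEncoder[Array[Double]]()
  def outputEncoder: Encoder[Seq[Double]] = ExpressionEncoder[Seq[Double]]()
}

The per-channel values can then be pulled out of the resulting array column and renamed, for example with $"c".getItem(0).as("foo").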

Error: Unable to find encoder for type org.apache.spark.sql.Dataset[(String, Long)]

The following test for Dataset comparison is failing with this error:
Error:(55, 38) Unable to find encoder for type org.apache.spark.sql.Dataset[(String, Long)]. An implicit Encoder[org.apache.spark.sql.Dataset[(String, Long)]] is needed to store org.apache.spark.sql.Dataset[(String, Long)] instances in a Dataset. Primitive types (Int, String, etc) and Product types (case classes) are supported by importing spark.implicits._ Support for serializing other types will be added in future releases.
).toDF("lower(word)", "count").as[Dataset[(String, Long)]]
Error:(55, 38) not enough arguments for method as: (implicit evidence$2: org.apache.spark.sql.Encoder[org.apache.spark.sql.Dataset[(String, Long)]])org.apache.spark.sql.Dataset[org.apache.spark.sql.Dataset[(String, Long)]].
Unspecified value parameter evidence$2.
).toDF("lower(word)", "count").as[Dataset[(String, Long)]]
Test
As you can see, I tried creating a Kryo Encoder for (String, Long):
class WordCountDSAppTestSpec extends FlatSpec with SparkSessionTestWrapper with DatasetComparer {
  import spark.implicits._

  "countWords" should "return count of each word" in {
    val wordsDF = Seq(
      ("one", "one"),
      ("two", "two"),
      ("three Three", "three"),
      ("three Three", "Three"),
      ("", "")
    ).toDF("line", "word").as[LineAndWord]

    implicit val tupleEncoder = org.apache.spark.sql.Encoders.kryo[(String, Long)]

    val expectedDF = Seq(
      ("one", 1L),
      ("two", 1L),
      ("three", 2L)
    ).toDF("lower(word)", "count").as[Dataset[(String, Long)]]

    val actualDF = WordCountDSApp.countWords(wordsDF)
    assertSmallDatasetEquality(actualDF, expectedDF, orderedComparison = false)
  }
}
Spark App under test
import com.aravind.oss.Logging
import com.aravind.oss.eg.wordcount.spark.WordCountUtil.{WhitespaceRegex, getClusterCfg, getPaths, getSparkSession}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions.{explode, split}

object WordCountDSApp extends App with Logging {
  logInfo("WordCount with Dataset API and multiple Case classes")
  val paths = getPaths(args)
  val cluster = getClusterCfg(args)
  if (paths.size > 1) {
    logInfo("More than one file to process")
  }
  logInfo("Path(s): " + paths)
  logInfo("Cluster: " + cluster)

  val spark = getSparkSession("WordCountDSApp", cluster)
  import spark.implicits._

  /*
   Case class Line SHOULD match the number of columns in the input file.
   */
  val linesDs: Dataset[Line] = spark.read
    .textFile(paths: _*)
    .toDF("line")
    .as[Line]
  logInfo("Dataset before splitting line")
  linesDs.show(false)

  /*
   toWords adds an additional column (word) to the output, so we need a
   new case class LineAndWord that contains two properties to represent the two columns.
   The names of the properties should match the names of the columns as well.
   */
  val wordDs: Dataset[LineAndWord] = toWords(linesDs)
  logInfo("Dataset after splitting the line into words")
  wordDs.show(false)

  val wordCount = countWords(wordDs)
  wordCount
    .orderBy($"count(1)".desc)
    .show(false)

  def toWords(linesDs: Dataset[Line]): Dataset[LineAndWord] = {
    import linesDs.sparkSession.implicits._
    linesDs
      .select($"line",
        explode(split($"line", WhitespaceRegex)).as("word"))
      .as[LineAndWord]
  }

  def countWords(wordsDs: Dataset[LineAndWord]): Dataset[(String, Long)] = {
    import wordsDs.sparkSession.implicits._
    val result = wordsDs
      .filter(_.word != null)
      .filter(!_.word.isEmpty)
      .groupByKey(_.word.toLowerCase)
      .count()
    result
  }

  case class Line(line: String)
  case class LineAndWord(line: String, word: String)
}
You should call .as[Something], not .as[Dataset[Something]]. Here is a working version:
"countWords" should "return count of each word" in {
import org.apache.spark.sql.{Encoder, Encoders}
import spark.implicits._
implicit def tuple2[A1, A2](implicit e1: Encoder[A1],
e2: Encoder[A2]): Encoder[(A1, A2)] =
Encoders.tuple[A1, A2](e1, e2)
val expectedDF = Seq(("one", 1L), ("two", 1L), ("three", 2L))
.toDF("value", "count(1)")
.as[(String, Long)]
val wordsDF1 = Seq(
("one", "one"),
("two", "two"),
("three Three", "three"),
("three Three", "Three"),
("", "")
).toDF("line", "word").as[LineAndWord]
val actualDF = WordCountDSApp.countWords(wordsDF1)
actualDF.show()
expectedDF.show()
assertSmallDatasetEquality(actualDF, expectedDF, orderedComparison = false)
}
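A side note on the fix above: with spark.implicits._ in scope, Spark already provides an Encoder[(String, Long)] through its built-in tuple/product encoders, so the custom tuple2 implicit and the Kryo encoder from the original test should not be needed. A trimmed sketch (column names taken from the working version above):

import spark.implicits._

// The built-in tuple encoder is enough to call .as[(String, Long)].
val expectedDF = Seq(("one", 1L), ("two", 1L), ("three", 2L))
  .toDF("value", "count(1)")
  .as[(String, Long)]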

GroupBy + custom aggregation on Dataset with Case class / Trait in the Key

I am trying to refactor some code and put the general logic into a trait. I basically want to process datasets, group them by some key and aggregate:
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Dataset, Encoder, Encoders, TypedColumn}

case class SomeKey(a: String, b: Boolean)

case class InputRow(
  key: SomeKey,
  v: Double
)

trait MyTrait {
  def processInputs: Dataset[InputRow]

  def groupAndAggregate(
    logs: Dataset[InputRow]
  ): Dataset[(SomeKey, Long)] = {
    import logs.sparkSession.implicits._
    logs
      .groupByKey(i => i.key)
      .agg(someAggFunc)
  }

  // Whatever agg function: here, it counts the number of v that are >= 0.5
  def someAggFunc: TypedColumn[InputRow, Long] =
    new Aggregator[
      /* input type */ InputRow,
      /* "buffer" type */ Long,
      /* output type */ Long
    ] with Serializable {
      def zero = 0L
      def reduce(b: Long, a: InputRow) = {
        if (a.v >= 0.5)
          b + 1
        else
          b
      }
      def merge(b1: Long, b2: Long) =
        b1 + b2
      // map buffer to output type
      def finish(b: Long) = b
      def bufferEncoder: Encoder[Long] = Encoders.scalaLong
      def outputEncoder: Encoder[Long] = Encoders.scalaLong
    }.toColumn
}
Everything works fine: I can instantiate a class that inherits from MyTrait and override the way I process inputs:
import spark.implicits._

case class MyTraitTest(testDf: DataFrame) extends MyTrait {
  override def processInputs: Dataset[InputRow] = {
    val ds = testDf
      .select(
        $"a",
        $"b",
        $"v"
      )
      .rdd
      .map(
        r =>
          InputRow(
            SomeKey(r.getAs[String]("a"), r.getAs[Boolean]("b")),
            r.getAs[Double]("v")
          )
      )
      .toDS
    ds
  }
}

val df: DataFrame = Seq(
  ("1", false, 0.40),
  ("1", false, 0.54),
  ("0", true, 0.85),
  ("1", true, 0.39)
).toDF("a", "b", "v")

val myTraitTest = MyTraitTest(df)
val ds: Dataset[InputRow] = myTraitTest.processInputs
val res = myTraitTest.groupAndAggregate(ds)
res.show(false)
+----------+----------------------------------+
|key |InputRow |
+----------+----------------------------------+
|[1, false]|1 |
|[0, true] |1 |
|[1, true] |0 |
+----------+----------------------------------+
Now the problem: I want SomeKey to derive from a more generic trait Key, because the key will not always have only two fields, the fields won't always have the same types, etc. It will always be a simple tuple of basic primitive types, though.
So I tried to do the following:
trait Key extends Product
case class SomeKey(a: String, b: Boolean) extends Key
case class SomeOtherKey(x: Int, y: Boolean, z: String) extends Key

case class InputRow[T <: Key](
  key: T,
  v: Double
)

trait MyTrait[T <: Key] {
  def processInputs: Dataset[InputRow[T]]

  def groupAndAggregate(
    logs: Dataset[InputRow[T]]
  ): Dataset[(T, Long)] = {
    import logs.sparkSession.implicits._
    logs
      .groupByKey(i => i.key)
      .agg(someAggFunc)
  }

  def someAggFunc: TypedColumn[InputRow[T], Long] = {...}
I now do:
case class MyTraitTest(testDf: DataFrame) extends MyTrait[SomeKey] {
override def processInputs: Dataset[InputRow[SomeKey]] = {
...
}
etc.
But now I get the error: Unable to find encoder for type T. An implicit Encoder[T] is needed to store T instances in a Dataset. Primitive types (Int, String, etc) and Product types (case classes) are supported by importing spark.implicits._ Support for serializing other types will be added in future releases.
.groupByKey(i => i.key)
I really don't know how to work around this issue; I have tried lots of things without success. Sorry for the rather lengthy description, but hopefully you have all the elements needed to help me understand. Thanks!
Spark needs to be able to implicitly create the encoder for the product type T, so you'll need to help it work around JVM type erasure by passing the TypeTag for T as an implicit parameter of your groupAndAggregate method.
A working example:
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{DataFrame, Dataset, Encoders, TypedColumn}
import scala.reflect.runtime.universe.TypeTag

trait Key extends Product
case class SomeKey(a: String, b: Boolean) extends Key
case class SomeOtherKey(x: Int, y: Boolean, z: String) extends Key
case class InputRow[T <: Key](key: T, v: Double)

trait MyTrait[T <: Key] {
  def processInputs: Dataset[InputRow[T]]

  def groupAndAggregate(
    logs: Dataset[InputRow[T]]
  )(implicit tTypeTag: TypeTag[T]): Dataset[(T, Long)] = {
    import logs.sparkSession.implicits._
    logs
      .groupByKey(i => i.key)
      .agg(someAggFunc)
  }

  def someAggFunc: TypedColumn[InputRow[T], Long] =
    new Aggregator[InputRow[T], Long, Long] with Serializable {
      def reduce(b: Long, a: InputRow[T]) = b + (a.v * 100).toLong
      def merge(b1: Long, b2: Long) = b1 + b2
      def zero = 0L
      def finish(b: Long) = b
      def bufferEncoder = Encoders.scalaLong
      def outputEncoder = Encoders.scalaLong
    }.toColumn
}
with a wrapping case class
case class MyTraitTest(testDf: DataFrame) extends MyTrait[SomeKey] {
  import testDf.sparkSession.implicits._
  import org.apache.spark.sql.functions.struct

  override def processInputs = testDf
    .select(struct($"a", $"b") as "key", $"v")
    .as[InputRow[SomeKey]]
}
and a test execution
val df = Seq(
  ("1", false, 0.40),
  ("1", false, 0.54),
  ("0", true, 0.85),
  ("1", true, 0.39)
).toDF("a", "b", "v")

val myTraitTest = MyTraitTest(df)
val ds = myTraitTest.processInputs
val res = myTraitTest.groupAndAggregate(ds)
res.show(false)
+----------+-----------------------------------------------+
|key |$anon$1($line5460910223.$read$$iw$$iw$InputRow)|
+----------+-----------------------------------------------+
|[1, false]|94 |
|[1, true] |39 |
|[0, true] |85 |
+----------+-----------------------------------------------+
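One small follow-up, not part of the original answer: the unwieldy column name in the output comes from the anonymous Aggregator, and TypedColumn.name can give it a readable alias (the alias "agg_value" below is just an example):

// Inside groupAndAggregate, rename the aggregation column.
logs
  .groupByKey(i => i.key)
  .agg(someAggFunc.name("agg_value"))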

How to create custom set accumulator, i.e. Set[String]?

I am trying to use a custom accumulator in Apache Spark to accumulate values into a set. The result should have the type Set[String]. For this I create a custom accumulator:
object SetAccumulatorParam extends AccumulatorParam[Set[String]] {
  def addInPlace(r1: mutable.Set[String], r2: mutable.Set[String]): mutable.Set[String] = {
    r1 ++= r2
  }

  def zero(initialValue: mutable.Set[String]): mutable.Set[String] = {
    Set()
  }
}
Yet I cannot instantiate a variable of this type:
val tags = sc.accumulator(Set(""))(SetAccumulatorParam)
This results in the error below. Please help.
required: org.apache.spark.AccumulatorParam[Set[String]]
Adding to Traian's answer, here is a general-purpose SetAccumulator for Spark 2.x.
import org.apache.spark.util.AccumulatorV2

class SetAccumulator[T](var value: Set[T]) extends AccumulatorV2[T, Set[T]] {
  def this() = this(Set.empty[T])
  override def isZero: Boolean = value.isEmpty
  override def copy(): AccumulatorV2[T, Set[T]] = new SetAccumulator[T](value)
  override def reset(): Unit = value = Set.empty[T]
  override def add(v: T): Unit = value = value + v
  override def merge(other: AccumulatorV2[T, Set[T]]): Unit = value = value ++ other.value
}
And you can use it like this:
val accum = new SetAccumulator[String]()
spark.sparkContext.register(accum, "My Accum") // Optional, name it for SparkUI
spark.sparkContext.parallelize(Seq("a", "b", "a", "b", "c")).foreach(s => accum.add(s))
accum.value
Which outputs:
Set[String] = Set(a, b, c)
Update for 1.6:
object StringSetAccumulatorParam extends AccumulatorParam[Set[String]] {
  def zero(initialValue: Set[String]): Set[String] = { Set() }
  def addInPlace(s1: Set[String], s2: Set[String]): Set[String] = { s1 ++ s2 }
}
val stringSetAccum = sc.accumulator(Set[String]())(StringSetAccumulatorParam)
sc.parallelize(Array("1", "2", "3", "1")).foreach(s => stringSetAccum += Set(s))
stringSetAccum.value.toString
res0: String = Set(2, 3, 1)
In Spark 2.0 you're probably fine with using the existing collectionAccumulator (if you care about distinct values, you can check and add only if they don't exist):
val collAcc = spark.sparkContext.collectionAccumulator[String]("myCollAcc")
collAcc: org.apache.spark.util.CollectionAccumulator[String] = CollectionAccumulator(id: 32154, name: Some(myCollAcc), value: [])
spark.sparkContext.parallelize(Array("1", "2", "3")).foreach(s => collAcc.add(s))
collAcc.value.toString
res0: String = [3, 2, 1]
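If only distinct values are needed, one option is to de-duplicate on the driver after collecting (a minimal sketch, assuming the collAcc accumulator from above):

import scala.collection.JavaConverters._

// CollectionAccumulator.value returns a java.util.List[String]; convert and de-duplicate.
val distinctValues: Set[String] = collAcc.value.asScala.toSet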
More info: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.util.AccumulatorV2

Error while using Spark UserDefinedType

I have a problem using UserDefinedType in Spark. I want to define a class and then build a schema for it so that it can be written into Parquet files for storage. But I hit a problem, and I don't know what in my code causes it. The exception is thrown from DataType.scala with the message "Unsupported dataType ......"
The code is:
@SQLUserDefinedType(udt = classOf[NodeDataType])
class Node(val dst: Int, val sim: Int) extends Serializable
//{
//  def compare(that: Node): Int = that.sim.compareTo(sim)
//  override def toString = dst.toString + "," + sim.toString
//  override def hashCode(): Int = this.toString.hashCode
//  override def equals(other: Any): Boolean = this.toString.equals(other.toString)
//}

class NodeDataType extends UserDefinedType[Node] {
  override def sqlType: DataType = StructType(Seq(
    StructField("dst", IntegerType, nullable = true),
    StructField("sim", IntegerType, nullable = true)
  ))

  override def serialize(obj: Any): GenericArrayData = {
    obj match {
      case p: Node =>
        println("serialize Node")
        val output = new Array[Any](2)
        output(0) = p.dst
        output(1) = p.sim
        new GenericArrayData(output)
    }
  }

  override def deserialize(datum: Any): Node = {
    datum match {
      case values: ArrayData =>
        println("deserialize Node")
        new Node(values.getInt(0), values.getInt(1))
    }
  }

  override def userClass: Class[Node] = classOf[Node]
  override def hashCode(): Int = 1
  override def equals(other: Any): Boolean = {
    other match {
      case that: NodeDataType => true
      case _ => false
    }
  }
}

case object NodeDataType extends NodeDataType

object Test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName(this.getClass.getName)
    val sc = new SparkContext(conf)

    def buildNgbSchema(row: Row): StructType = {
      val schema = StructType(Seq(
        StructField("src", IntegerType, true),
        StructField("neighbours", NodeDataType)
      ))
      println("schema: " + schema)
      schema
    }

    val s1 = Seq(
      (1, new Node(1, 1)),
      (2, new Node(2, 2))
    )
    val rdd1 = sc.makeRDD(s1)
    val rows = rdd1.map(t => Row(t._1, t._2))
    val schema = buildNgbSchema(rows.take(1)(0))

    val sqlContext = new SQLContext(sc)
    CommonUtility.deletePath("./test") // just delete the path
    sqlContext.createDataFrame(rows, schema).write.parquet("./test")
  }
}
And when I run this code, the following error occurred:
Caused by: java.lang.IllegalArgumentException: Unsupported dataType: {"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}, [1.1] failure: `TimestampType' expected but `{' found
{"type":"struct","fields":[{"name":"src","type":"integer","nullable":true,"metadata":{}},{"name":"neighbours","type":{"type":"udt","class":"com.tencent.ieg.tgp.recommend.NodeDataType$","pyClass":null,"sqlType":{"type":"struct","fields":[{"name":"dst","type":"integer","nullable":true,"metadata":{}},{"name":"sim","type":"integer","nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{}}]}
^
at org.apache.spark.sql.types.DataType$CaseClassStringParser$.apply(DataType.scala:245)
at org.apache.spark.sql.types.DataType$.fromCaseClassString(DataType.scala:102)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$$anonfun$3.apply(ParquetTypesConverter.scala:62)
at scala.util.Try.getOrElse(Try.scala:77)
at org.apache.spark.sql.execution.datasources.parquet.ParquetTypesConverter$.convertFromString(ParquetTypesConverter.scala:62)
at org.apache.spark.sql.execution.datasources.parquet.RowWriteSupport.init(ParquetTableSupport.scala:51)
at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:288)
at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:262)
at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetRelation.scala:94)
at org.apache.spark.sql.execution.datasources.parquet.ParquetRelation$$anon$3.newInstance(ParquetRelation.scala:272)
at org.apache.spark.sql.execution.datasources.DefaultWriterContainer.writeRows(WriterContainer.scala:233)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelation$$anonfun$run$1$$anonfun$apply$mcV$sp$3.apply(InsertIntoHadoopFsRelation.scala:150)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:88)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
......
Has anyone encountered this problem? Or is there something wrong in what I did?