Scala Spark Dataset Error on Nested Object
I am trying to test DataFrame (Dataset) code by building a DataFrame from strongly typed nested case classes and then passing it to my functions. The serialization/creation of the DataFrame keeps failing, and I do not have enough Scala or Spark experience to know what is going on.
I think I am specifying a schema while Spark is also inferring one, and the error happens because the two do not match?
Models:
package io.swagger.client.model
import java.sql.Date
import scala.Enumeration
case class Member (
memberId: String,
memberIdSuffix: String,
memberSubscriberId: String,
memberEmpi: Option[Long] = None,
memberFirstName: String,
memberLastName: String,
memberMiddleInitial: Option[String] = None,
memberGender: String,
memberBirthDate: Date,
memberSocialSecurityNumber: Option[String] = None,
memberPhoneNumbers: List[Telecom],
memberEmailAddresses: Option[List[Email]] = None,
memberAddresses: List[Address],
memberEligibilities: List[MemberEligibility]
)
case class Email (
address: String,
effectiveDate: Option[Date] = None,
terminationDate: Option[Date] = None,
isCurrent: Option[Boolean] = None,
isActive: Option[Boolean] = None
)
case class Address (
lineOne: String,
lineTwo: String,
cityName: String,
stateCode: String,
zipCode: String,
effectiveDate: Option[Date] = None,
terminationDate: Option[Date] = None,
isCurrent: Option[Boolean] = None,
isActive: Option[Boolean] = None
)
case class MemberEligibility (
productId: String,
productCategoryCode: String,
classId: String,
planId: String,
groupId: String,
maxCopayAmount: Option[Float] = None,
voidIndicator: Boolean,
healthplanEntryDate: Date,
memberStatusDescription: Option[String] = None,
eligibilityExplanation: Option[String] = None,
eligibilitySelectionLevelDescription: Option[String] = None,
eligibilityReason: Option[String] = None,
effectiveDate: Option[Date] = None,
terminationDate: Option[Date] = None,
isCurrent: Option[Boolean] = None,
isActive: Option[Boolean] = None
)
case class Telecom (
phoneNumber: String,
effectiveDate: Option[Date] = None,
terminationDate: Option[Date] = None,
isCurrent: Option[Boolean] = None,
isActive: Option[Boolean] = None,
telecomType: String
)
object Genders extends Enumeration {
val male, female, unknown, other = Value
}
object Gender extends Enumeration {
val home, work, fax = Value
}
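As a side note, the schema Spark derives for these case classes can be inspected directly, which makes it easier to compare against whatever schema you think you are providing. A minimal sketch (not part of the original code, assuming the model classes above are on the classpath):

import org.apache.spark.sql.Encoders
import io.swagger.client.model.Member

// Builds the product encoder for Member and prints the StructType Spark derives
// from the nested case classes, field by field.
Encoders.product[Member].schema.printTreeString()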
Test code:
import scala.util.{Try, Success, Failure}
import io.swagger.client.model._
import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
import org.apache.spark.SparkContext
import org.scalatest._
trait SparkContextSetup {
def withSparkContext(testMethod: (SparkSession, SparkContext) => Any): Unit = {
val spark = org.apache.spark.sql.SparkSession.builder
.master("local")
.appName("Spark test")
.getOrCreate()
val sparkContext = spark.sparkContext
try {
testMethod(spark,sparkContext)
} finally sparkContext.stop()
}
}
class HelloSpec extends WordSpec with Matchers with SparkContextSetup {
"My analytics" should {
"calculate the right thing" in withSparkContext { (spark, sparkContext) =>
MockMemberData(spark)
}
}
private def MockMemberData(spark: SparkSession) = {
import spark.implicits._
import java.sql.{Date}
import java.text.SimpleDateFormat
import org.apache.spark.sql.types._
val testDate = Try(new SimpleDateFormat("dd/MM/yyyy").parse("01/01/2018"))
.map(d => new java.sql.Date(d.getTime()))
.get
val mockData = spark.sparkContext
.parallelize(
Seq(
Member(
memberId = "12345",
memberIdSuffix = "Mr.",
memberSubscriberId = "000000011",
memberEmpi = None,
memberFirstName = "firstname",
memberLastName = "lastname",
Some("w"),
Genders.male.toString,
testDate,
Some("123456789"),
List(
Telecom("12345678910", None, None, Some(true), Some(true), "")
),
Option(
List(
Email(
"test#gmail.com",
None,
Some(testDate),
isCurrent = Some(true),
isActive = Some(true)
)
)
),
List(
Address(
"10 Awesome Dr",
"",
"St. Louis",
"MO",
"63000",
None,
None,
None,
None
)
),
List(
MemberEligibility(
"productid",
"productCategoryCode",
"classId",
"planId",
"groupId",
None,
false,
testDate,
None,
None,
None,
None,
None,
None,
None
)
)
)
)
)
.toDF()
mockData.show()
}
}
I expected to receive the DataFrame's schema (or the Dataset's in this case); what I did receive was:
[info] HelloSpec:
[info] My analytics
[info] - should calculate the right thing *** FAILED ***
[info] org.apache.spark.sql.AnalysisException: cannot resolve 'wrapoption(staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true), if (isnull(lambdavariable(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true)))) null else newInstance(class io.swagger.client.model.Email), cast(memberEmailAddresses as array<struct<address:string,effectiveDate:date,terminationDate:date,isCurrent:boolean,isActive:boolean>>)).array, true), ObjectType(class scala.collection.immutable.List))' due to data type mismatch: argument 1 requires scala.collection.immutable.List type, however, 'staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true), if (isnull(lambdavariable(MapObjects_loopValue10, MapObjects_loopIsNull11, StructField(address,StringType,true), StructField(effectiveDate,DateType,true), StructField(terminationDate,DateType,true), StructField(isCurrent,BooleanType,true), StructField(isActive,BooleanType,true)))) null else newInstance(class io.swagger.client.model.Email), cast(memberEmailAddresses as array<struct<address:string,effectiveDate:date,terminationDate:date,isCurrent:boolean,isActive:boolean>>)).array, true)' is of scala.collection.Seq type.;
[info] at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)
[info] at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:82)
[info] at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:74)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:310)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:310)
[info] at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:309)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:307)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:307)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$5$$anonfun$apply$11.apply(TreeNode.scala:360)
[info] ...
UPDATE
So instead of
val mockData = spark.sparkContext
.parallelize(
Seq(
or
val mockData = spark.sparkContext
.parallelize(
List(
using Array works:
val mockData = spark.sparkContext
.parallelize(
Array(
Why does Array work but Seq and List do not work?
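I still do not know why Array behaves differently; it looks like an encoder/deserializer issue around List fields in nested case classes in this Spark version. Two workarounds I have seen suggested (I have not verified either against this exact version) are declaring the nested collection fields as Seq instead of List, and building the test Dataset from a local collection with toDS instead of going through sparkContext.parallelize. A rough sketch of the second idea, where member stands for the Member instance built in MockMemberData above:

import spark.implicits._

// Hypothetical sketch: build the Dataset straight from a local Seq instead of an RDD.
val member: Member = ??? // the Member instance constructed above
val ds = Seq(member).toDS()
ds.show()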
Related
Scala Play Framework: cannot generate object from json with null values
I'm new to Scala and the Play Framework. I have written the following controller:

@Singleton
class MyController @Inject()(val controllerComponents: ControllerComponents) extends BaseController {
  implicit val newMeasurementJson: OFormat[MeasurementModel] = Json.format[MeasurementModel]

  def addMeasurement(): Action[AnyContent] = Action { implicit request =>
    val content = request.body
    val jsonObject: Option[JsValue] = content.asJson
    val measurement: Option[MeasurementModel] =
      jsonObject.flatMap(
        Json.fromJson[MeasurementModel](_).asOpt
      )
    ...
  }
  ...
}

where the endpoint receives the following JSON:

{
  "sensor_id": "1029",
  "sensor_type": "BME280",
  "location": 503,
  "lat": 48.12,
  "lon": 11.488,
  "timestamp": "2022-04-05T00:34:24",
  "pressure": 94667.38,
  "altitude": null,
  "pressure_sealevel": null,
  "temperature": 3.91,
  "humidity": 65.85
}

MeasurementModel looks like this:

case class MeasurementModel(
  sensor_id: String,
  sensor_type: String,
  location: Int,
  lat: Float,
  lon: Float,
  timestamp: String,
  pressure: Float,
  altitude: Int,
  pressure_sealevel: Int,
  temperature: Float,
  humidity: Float
)

Through testing I have seen that the null values in the JSON cause the creation of the measurement object to fail. How can I handle null values and have them set in the generated MeasurementModel object?
The datatypes that can store null are Null and Option[]. Consider the following REPL session:

scala> val mightBeIntOrNull: Option[Int] = Option(1)
val mightBeIntOrNull: Option[Int] = Some(1)

scala> val mightBeIntOrNull: Option[Int] = null
val mightBeIntOrNull: Option[Int] = null

The Option wraps the Int value in Some, which can be extracted by pattern matching:

scala> val mightBeIntOrNull: Option[Int] = Option(1)
val mightBeIntOrNull: Option[Int] = Some(1)

scala> mightBeIntOrNull match {
     |   case Some(myIntVal) => println("This is an integer :" + myIntVal)
     |   case _ => println("This might be a null")
     | }
This is an integer :1

scala> val mightBeIntOrNull: Option[Int] = null
val mightBeIntOrNull: Option[Int] = null

scala> mightBeIntOrNull match {
     |   case Some(myIntVal) => println("This is an integer :" + myIntVal)
     |   case _ => println("This might be a null")
     | }
This might be a null

As Gaël J mentioned, you should use Option for the relevant datatypes in your case class. So the solution is to wrap the datatype in Option wherever you expect a null, for example:

altitude: Option[Float],
sensor_type: Option[String],
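Applied to the model from the question, that suggestion would look roughly like this (a sketch, assuming only altitude and pressure_sealevel can be null; Play's Json.format maps a JSON null or a missing field for an Option member to None):

import play.api.libs.json.{Json, OFormat}

case class MeasurementModel(
  sensor_id: String,
  sensor_type: String,
  location: Int,
  lat: Float,
  lon: Float,
  timestamp: String,
  pressure: Float,
  altitude: Option[Float],          // null in the JSON becomes None
  pressure_sealevel: Option[Float], // likewise
  temperature: Float,
  humidity: Float
)

implicit val measurementFormat: OFormat[MeasurementModel] = Json.format[MeasurementModel]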
Too Many Parameters
I have an application with a single entry point; it's a library to automate some data engineering work.

case class DeltaContextConfig(
  primaryKey: List[String],
  columnToOrder: String,
  filesCountFirstBatch: Int,
  destinationPath: String,
  sparkDf: DataFrame,
  sparkContext: SparkSession,
  operationType: String,
  partitionColumn: Option[String] = None,
  tableName: String,
  databaseName: String,
  autoCompaction: Option[Boolean] = Option(true),
  idealFileSize: Option[Int] = Option(128),
  deduplicationColumn: Option[String] = None,
  compactionIntervalTime: Option[Int] = Option(180),
  updateCondition: Option[String] = None,
  setExpression: Option[String] = None
)

This is my case class, my single entry point. After that, all these parameters are passed to other objects; I have objects to write to the data lake, to compact files, and so on. Each of these objects uses some of the parameters. For example, I have a DeltaWriterConfig object:

DeltaWriterConfig(
  sparkDf = deltaContextConfig.sparkDf,
  columnToOrder = deltaContextConfig.columnToOrder,
  destinationPath = deltaContextConfig.destinationPath,
  primaryKey = deltaContextConfig.primaryKey,
  filesCountFirstBatch = deltaContextConfig.filesCountFirstBatch,
  sparkContext = deltaContextConfig.sparkContext,
  operationType = deltaContextConfig.operationType,
  partitionColumn = deltaContextConfig.partitionColumn,
  updateCondition = deltaContextConfig.updateCondition,
  setExpression = deltaContextConfig.setExpression
)

I use DeltaWriterConfig to pass these parameters to my DeltaWriter class. I was creating all these config objects in the main, but I don't think that is good: I have 3 config objects to populate, so I have 3 big constructors in the application main. Is there any pattern to solve this?
At the very least, I think it would be better to move the creation of the second config from the first one into the companion object of DeltaWriterConfig:

case class DeltaWriterConfig(
  sparkDf: DataFrame,
  columnToOrder: String,
  destinationPath: String,
  primaryKey: List[String],
  filesCountFirstBatch: Int,
  sparkContext: SparkSession,
  operationType: String,
  partitionColumn: Option[String] = None,
  updateCondition: Option[String] = None,
  setExpression: Option[String] = None
)

object DeltaWriterConfig {
  def from(deltaContextConfig: DeltaContextConfig): DeltaWriterConfig =
    DeltaWriterConfig(
      sparkDf = deltaContextConfig.sparkDf,
      columnToOrder = deltaContextConfig.columnToOrder,
      destinationPath = deltaContextConfig.destinationPath,
      primaryKey = deltaContextConfig.primaryKey,
      filesCountFirstBatch = deltaContextConfig.filesCountFirstBatch,
      sparkContext = deltaContextConfig.sparkContext,
      operationType = deltaContextConfig.operationType,
      partitionColumn = deltaContextConfig.partitionColumn,
      updateCondition = deltaContextConfig.updateCondition,
      setExpression = deltaContextConfig.setExpression
    )
}

This gives us the opportunity to create the new config in a single line:

val deltaContextConfig: DeltaContextConfig = ???
val deltaWriterConfig = DeltaWriterConfig.from(deltaContextConfig)

But the better solution is to keep only the configs that are unique. If we have duplicated fields in DeltaContextConfig and DeltaWriterConfig, why not compose the configs instead of duplicating those fields? Instead of this DeltaContextConfig declaration:

case class DeltaContextConfig(
  tableName: String,
  databaseName: String,
  autoCompaction: Option[Boolean] = Option(true),
  idealFileSize: Option[Int] = Option(128),
  deduplicationColumn: Option[String] = None,
  compactionIntervalTime: Option[Int] = Option(180),
  sparkDf: DataFrame,
  columnToOrder: String,
  destinationPath: String,
  primaryKey: List[String],
  filesCountFirstBatch: Int,
  sparkContext: SparkSession,
  operationType: String,
  partitionColumn: Option[String] = None,
  updateCondition: Option[String] = None,
  setExpression: Option[String] = None
)

// ...with DeltaWriterConfig exactly as declared above

we use a config structure like this:

case class DeltaContextConfig(
  tableName: String,
  databaseName: String,
  autoCompaction: Option[Boolean] = Option(true),
  idealFileSize: Option[Int] = Option(128),
  deduplicationColumn: Option[String] = None,
  compactionIntervalTime: Option[Int] = Option(180),
  deltaWriterConfig: DeltaWriterConfig
)

// DeltaWriterConfig stays exactly as declared above

But remember that you should use the same config structure in your configuration file.
List[String] Object in scala case class
I am using DSE 5.1.0 (packaged with Spark 2.0.2.6 and Scala 2.11.8) and reading a Cassandra table as below:

val sparkSession = ...
val rdd1 = sparkSession.table("keyspace.table")

This table contains a List[String] column, say list1, which I read into rdd1. But when I try to use an encoder, it throws an error:

val myVoEncoder = Encoders.bean(classOf[MyVo])
val dataSet = rdd1.as(myVoEncoder)

I have tried scala.collection.mutable.List, scala.collection.immutable.List, scala.collection.List, Seq, and WrappedArray. All gave the same error as below:

java.lang.UnsupportedOperationException: Cannot infer type for class scala.collection.immutable.List because it is not bean-compliant

MyVo.scala:

case class MyVo(
  @BeanProperty var id: String,
  @BeanProperty var duration: Int,
  @BeanProperty var list1: List[String]
) {
  def this() = this("", 0, null)
}

Any help will be appreciated.
You should use Array[String]:

case class MyVo(
  @BeanProperty var id: String,
  @BeanProperty var duration: Int,
  @BeanProperty var list1: Array[String]
) {
  def this() = this("", 0, null)
}

although it is important to stress that the more idiomatic approach would be:

import sparkSession.implicits._

case class MyVo(
  id: String,
  duration: Int,
  list1: Seq[String]
)

rdd1.as[MyVo]
How to create generated objects in shapeless
Suppose I have a normalized database model for a generic type that comes in like this:

case class BaseModel(
  id: String,
  createdBy: String,
  attr1: Option[String] = None,
  attr2: Option[Int] = None,
  attr3: Option[LocalDate] = None
)

Given a sequence of BaseModel, if none of the values of a certain Option attribute are populated, can Shapeless create a reduced model for me? For example, suppose that all the attr1 fields are empty. Without me having to specify the object beforehand, can Shapeless create a generic object that looks like this?

case class BaseModel(
  id: String,
  createdBy: String,
  attr2: Option[Int] = None,
  attr3: Option[LocalDate] = None
)
What Shapeless can do is, given two case classes, create an object of one of them from an object of the other.

import java.time.LocalDate
import shapeless.LabelledGeneric
import shapeless.record._

case class BaseModel(
  id: String,
  createdBy: String,
  attr1: Option[String] = None,
  attr2: Option[Int] = None,
  attr3: Option[LocalDate] = None
)

case class BaseModel1(
  id: String,
  createdBy: String,
  attr2: Option[Int] = None,
  attr3: Option[LocalDate] = None
)

val bm = BaseModel(
  id = "cff4545gvgf",
  createdBy = "John Doe",
  attr2 = Some(42),
  attr3 = Some(LocalDate.parse("2018-11-03"))
)
// BaseModel(cff4545gvgf,John Doe,None,Some(42),Some(2018-11-03))

val hlist = LabelledGeneric[BaseModel].to(bm)
val hlist1 = hlist - 'attr1
val bm1 = LabelledGeneric[BaseModel1].from(hlist1)
// BaseModel1(cff4545gvgf,John Doe,Some(42),Some(2018-11-03))

But Shapeless can't create a new case class. If you need a new case class to be created automatically, you can write a macro.
Scala: case class runtime error
This demo ran OK. But when I moved it into a function of another class (in my former project) and called the function, it failed to compile.

object DFMain {
  case class Person(name: String, age: Double, t: String)
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "Scala Word Count")
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext.implicits._
    val bsonRDD = sc.parallelize(("foo",1,"female")::
                                 ("bar",2,"male")::
                                 ("baz",-1,"female")::Nil)
      .map(tuple => {
        var bson = new BasicBSONObject()
        bson.put("name", "bfoo")
        bson.put("value", 0.1)
        bson.put("t", "female")
        (null, bson)
      })
    val tDf = bsonRDD.map(_._2)
      .map(f => Person(f.get("name").toString,
                       f.get("value").toString.toDouble,
                       f.get("t").toString)).toDF()
    tDf.limit(1).show()
  }
}

'MySQLDao.insertIntoMySQL()' gives a compile error:

object MySQLDao {
  private val sc = new SparkContext("local", "Scala Word Count")
  val sqlContext = new org.apache.spark.sql.SQLContext(sc)
  import sqlContext.implicits._
  case class Person(name: String, age: Double, t: String)
  def insertIntoMySQL(): Unit = {
    val bsonRDD = sc.parallelize(("foo",1,"female")::
                                 ("bar",2,"male")::
                                 ("baz",-1,"female")::Nil)
      .map(tuple => {
        val bson = new BasicBSONObject()
        bson.put("name", "bfoo")
        bson.put("value", 0.1)
        bson.put("t", "female")
        (null, bson)
      })
    val tDf = bsonRDD.map(_._2).map(f => Person(f.get("name").toString,
                                                f.get("value").toString.toDouble,
                                                f.get("t").toString)).toDF()
    tDf.limit(1).show()
  }
}

When I call MySQLDao.insertIntoMySQL() I get the error:

value typedProductIterator is not a member of object scala.runtime.ScalaRunTime
case class Person(name: String, age: Double, t: String)
I suppose the case class isn't visible inside the closure of the map function. Move it to the package level:

case class Person(name: String, age: Double, t: String)

object MySQLDao {
  ...
}
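A minimal sketch of the layout the answer describes, with a hypothetical package name and the BSON plumbing dropped for brevity:

package com.example.dao // hypothetical

import org.apache.spark.SparkContext

// Defined at the top level of the file, outside any object, so that the
// TypeTag/encoder machinery behind toDF() can resolve it.
case class Person(name: String, age: Double, t: String)

object MySQLDao {
  private val sc = new SparkContext("local", "Scala Word Count")
  val sqlContext = new org.apache.spark.sql.SQLContext(sc)
  import sqlContext.implicits._

  def insertIntoMySQL(): Unit = {
    val tDf = sc.parallelize(Seq(Person("bfoo", 0.1, "female"))).toDF()
    tDf.limit(1).show()
  }
}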