Throwing NullPointerException while reading from MySQL in Apache Spark with Scala

I am trying to read data from MySQL, but it is throwing a NullPointerException. I am not sure what the reason is.
Code in main.scala:
object main extends App {
val dt = args.lift(0)
if (dt.isEmpty || !PairingbatchUtil.validatePartitionDate(dt.get)) {
throw new Exception("Partition date is mandatory or enter valid format 'yyyy-MM-dd'")
}
var mailProperties:Properties = new Properties
var templateMappingData: Map[String, Object] = Map(
"job" -> "Load merchant count Data from hdfs to mongo",
"jobProcessedDate" -> dt.get,
"batch" -> "Pairing Batch")
val startTime = System.currentTimeMillis()
try {
val conf = new SparkConf().setAppName("read_from_mysql") //.setMaster("local")
conf.set("spark.sql.warehouse.dir", "/user/local/warehouse/")
conf.set("hive.exec.dynamic.partition", "true")
conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
conf.set("spark.mongodb.input.uri", "mongodb://127.0.0.1/db.table_name")
conf.set("spark.mongodb.output.uri", "mongodb://127.0.0.1/db.table_name")
val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val schemaName = "/user/local/warehouse/"
val aid = "1000"
val resultPath = "/usr/local/proad" + "/" + dt.get
val dbDataPartitionsMap = Map("aid" -> aid, "dt" -> dt.get)
spark.sql("set aid=" + aid)
spark.sql("set dt=" + dt.get)
val configs = spark.sparkContext.getConf.getAll
configs.foreach(i => println(i))
val registerBaseTablesMap = Map(
"DAILY_COUNT" -> ("SELECT * FROM " + schemaName + ".table_name WHERE aid = '${aid}' and dt ='${dt}'"),
"DAILY_COUNT_FINAL" -> ("SELECT * FROM " + schemaName + ".second_table_name WHERE aid = '${aid}' and dt ='${dt}'"))
val parentDF = PairingbatchUtil.readDataFromHive(registerBaseTablesMap.get("DAILY_COUNT").get, spark)
val finalMerchantAffiliateDailyCountDF = Processor.process(parentDF, dbDataPartitionsMap, spark)
}
Code in Processor.scala:
object Processor {
case class MerchantDailyCount( _id: String, date: Date, totalClicks: String, totalLinks: String, shopUrl: String, shopUUID: String, shopName: String, publisherId: String)
def process(parentDF: DataFrame, partitionsMap: Map[String, String], spark: SparkSession): DataFrame = {
val schemaString = "_id date total_clicks total_links shop_url shop_uuid shop_name publisher_id"
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
var finalDF = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
parentDF.foreach(row => {
if (parentDF == null || row.getAs("publisher_id") == null || StringUtils.isBlank(row.getAs("shop_uuid"))) {
} else {
val shopUUID = row.getAs("shop_uuid").toString
val currentDate = row.getAs("cur_date").toString
val date = PairingbatchUtil.parseDate(currentDate, Constants.DATE_FORMAT_YYYY_MM_DD, Constants.TAIWAN_TIMEZONE)
val publisherId = row.getAs("publisher_id").toString
val totalClicks = row.getAs("total_clicks").toString
val totalLinks = row.getAs("total_links").toString
val shopUrl = PairingbatchUtil.setShopUrlInfo(shopUUID, "com.mysql.jdbc.Driver", "user_mame", "password", s"""select shop_url, shop_name from db.table_name where shop_uuid ='$shopUUID'""", "shopUrl", spark)._1
val id = PairingbatchUtil.isNeedToSet(spark, shopUrl, publisherId, date)
val merchantDailyCount = MerchantDailyCount(id, date, totalClicks, totalLinks, shopUrl,shopUUID,shopName,publisherId)
import spark.implicits._
val merchantCountDF = Seq(merchantDailyCount).toDF()
finalDF = finalDF.union(merchantCountDF)
}
})
finalDF
}
}
Code in PairingBatchUtil.scala:
def setShopUrlInfo(shopUUID: String, driverClass: String, user: String, pass: String, query: String, url: String, sparkSession: SparkSession)={
val merchantDetailsDF = sparkSession.read //line no 139
.format("jdbc")
.option("url", url)
.option("driver", driverClass)
.option("dbtable", s"( $query ) t")
.option("user",user)
.option("password", pass)
.load()
if (merchantDetailsDF.count() == 0) {
("INVALID SHOP URL","INVALID SHOP NAME")
}else {
(merchantDetailsDF.select(col = "shop_url").first().getAs("shop_url"),merchantDetailsDF.select(col = "shop_name").first().getAs("shop_name"))
}
}
I expect the output of the query to be:
+--------------+---------+
| shop_url|shop_name|
+--------------+---------+
| parimal | roy |
+--------------+---------+
but the actual output is:
19/07/04 14:48:50 ERROR executor.Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:117)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:115)
at org.apache.spark.sql.DataFrameReader.<init>(DataFrameReader.scala:549)
at org.apache.spark.sql.SparkSession.read(SparkSession.scala:613)
at com.rakuten.affiliate.order.pairing.batch.util.PairingbatchUtil$.setShopUrlInfo(PairingbatchUtil.scala:139)
at com.rakuten.affiliate.order.pairing.batch.Processors.MechantAffDailyCountProcessor$$anonfun$process$1.apply(MechantAffDailyCountProcessor.scala:40)
at com.rakuten.affiliate.order.pairing.batch.Processors.MechantAffDailyCountProcessor$$anonfun$process$1.apply(MechantAffDailyCountProcessor.scala:30)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

Are you using Spark 2.1?
In that case I think you might have a problem with the configuration, as you can see at line 117 of the source:
https://github.com/apache/spark/blob/branch-2.1/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
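The stack trace also shows that setShopUrlInfo (and therefore sparkSession.read) is being invoked from inside parentDF.foreach, i.e. in executor tasks, where the session's sessionState is not available and comes back null. A minimal, hedged sketch of one way to restructure this, assuming the per-shop lookup can be replaced by a single driver-side JDBC read plus a join (the JDBC URL below is a placeholder; table and column names are taken from the question):

import org.apache.spark.sql.functions.col

// Load the shop details once on the driver instead of once per row inside foreach.
val shopDetailsDF = spark.read
  .format("jdbc")
  .option("url", "jdbc:mysql://<host>:3306/db")        // placeholder URL
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "(select shop_uuid, shop_url, shop_name from db.table_name) t")
  .option("user", "user_name")
  .option("password", "password")
  .load()

// Enrich the Hive rows with a join executed by Spark, rather than issuing a JDBC query per row.
val enrichedDF = parentDF
  .filter(col("publisher_id").isNotNull && col("shop_uuid").isNotNull)
  .join(shopDetailsDF, Seq("shop_uuid"), "left")

The same restriction applies to building finalDF with union inside the foreach: DataFrame operations need to stay on the driver.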

Related

Update to the Delta table in Spark not working

package jobs
import io.delta.tables.DeltaTable
import model.RandomUtils
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }
import org.apache.spark.sql.{ DataFrame, Dataset, Encoder, Encoders, SparkSession }
import jobs.SystemJob.Rate
import org.apache.spark.sql.functions._
import org.apache.spark.sql._
case class Student(firstName: String, lastName: String, age: Long, percentage: Long)
case class Rate(timestamp: Timestamp, value: Long)
case class College(name: String, address: String, principal: String)
object RCConfigDSCCDeltaLake {
def getSpark(): SparkSession = {
SparkSession.builder
.appName("Delta table demo")
.master("local[*]")
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
.getOrCreate()
}
def main(args: Array[String]): Unit = {
val spark = getSpark()
val rate = 1
val studentProfile = "student_profile"
if (!DeltaTable.isDeltaTable(s"spark-warehouse/$studentProfile")) {
val deltaTable: DataFrame = spark.sql(s"CREATE TABLE `$studentProfile` (firstName String, lastName String, age Long, percentage Long) USING delta")
deltaTable.show()
deltaTable.printSchema()
}
val studentProfileDT = DeltaTable.forPath(spark, s"spark-warehouse/$studentProfile")
def processStream(student: Dataset[Student], college: Dataset[College]) = {
val studentQuery = student.writeStream.outputMode(OutputMode.Update()).foreachBatch {
(st: Dataset[Student], y: Long) =>
val listOfStudents = st.collect().toList
println("list of students :::" + listOfStudents)
val (o, n) = ("oldData", "newData")
val colMap = Map(
"firstName" -> col(s"$n.firstName"),
"lastName" -> col(s"$n.lastName"),
"age" -> col(s"$n.age"),
"percentage" -> col(s"$n.percentage"))
studentProfileDT.as(s"$o").merge(st.toDF.as(s"$n"), s"$o.firstName = $n.firstName AND $o.lastName = $n.lastName")
.whenMatched.update(colMap)
.whenNotMatched.insert(colMap)
.execute()
}.start()
val os = spark.readStream.format("delta").load(s"spark-warehouse/$studentProfile").writeStream.format("console")
.outputMode(OutputMode.Append())
.option("truncate", value = false)
.option("checkpointLocation", "retrieved").start()
studentQuery.awaitTermination()
os.awaitTermination()
}
import spark.implicits._
implicit val encStudent: Encoder[Student] = Encoders.product[Student]
implicit val encCollege: Encoder[College] = Encoders.product[College]
def rateStream = spark
.readStream
.format("rate") // <-- use RateStreamSource
.option("rowsPerSecond", rate)
.load()
.as[Rate]
val studentStream: Dataset[Student] = rateStream.filter(_.value % 25 == 0).map {
stu =>
Student(...., ....., ....., .....) //fill with values
}
val collegeStream: Dataset[College] = rateStream.filter(_.value % 40 == 0).map {
stu =>
College(...., ....., ......) //fill with values
}
processStream(studentStream, collegeStream)
}
}
What I am trying to do is a simple UPSERT operation with streaming datasets, but it fails with this error:
22/04/13 19:50:33 ERROR MicroBatchExecution: Query [id = 8cf759fd-9bee-460f-b0d9-91889c59c524, runId = 55723708-fd3c-4425-a2bc-83d737c37589] terminated with error
java.lang.UnsupportedOperationException: Detected a data update (for example part-00000-d026d92e-1798-4d21-a505-67ec72d334e2-c000.snappy.parquet) in the source table at version 4. This is currently not supported. If you'd like to ignore updates, set the option 'ignoreChanges' to 'true'. If you would like the data update to be reflected, please restart this query with a fresh checkpoint directory.
Dependencies:
--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2
--packages io.delta:delta-core_2.12:0.7.0
The update query works when the datasets are not streamed but hardcoded.
Am I doing something wrong here?
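For what it's worth, here is a minimal sketch of the two remedies the error message itself proposes, applied to the console query above: set ignoreChanges on the Delta streaming source (which may re-emit rows for rewritten files), or point the query at a fresh checkpoint directory. The option name comes straight from the error text; the paths reuse those in the code, and the checkpoint name below is only an illustrative placeholder.

val os = spark.readStream
  .format("delta")
  .option("ignoreChanges", "true")                 // skip files rewritten by the MERGE instead of failing
  .load(s"spark-warehouse/$studentProfile")
  .writeStream
  .format("console")
  .outputMode(OutputMode.Append())
  .option("truncate", value = false)
  .option("checkpointLocation", "retrieved-fresh") // a new checkpoint directory if updates should be reflected
  .start()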

Performance issue with a UDF: is there a better way to solve the transformation? The database write is getting stuck

Input table:
userid | data
123234 | {"type":1,"actionData":{"actionType":"Category","id":"1233232","title":"BOSS","config":{"type":"X"}}}
And I need an output table like this:
userid | action
123234 | {"type":"Category","template":{"data":"{"title":"BOSS"}" },"additionalInfo":{"type":1,"config":{"type":"X"} } }
Scala Spark. It is getting stuck during the database write that uses the UDF.
I am running:
bin/spark-shell --master local[*] --packages com.datastax.spark:spark-cassandra-connector_2.11:2.5.0 --driver-memory 200g
I need a better way to solve it.
object testDataMigration extends Serializable {
def main(cassandra: String): Unit = {
implicit val spark: SparkSession =
SparkSession
.builder()
.appName("UserLookupMigration")
.config("spark.master", "local[*]")
.config("spark.cassandra.connection.host",cassandra)
.config("spark.cassandra.output.batch.size.rows", "10000")
.config("spark.cassandra.read.timeoutMS","60000")
.getOrCreate()
val res = time(migrateData());
Console.println("Time taken to execute script", res._1);
spark.stop();
}
def migrateData()(implicit spark: SparkSession): Unit = {
val file = new File("validation_count.txt" )
val print_Writer = new PrintWriter(file)
//Reading data from user_feed table
val userFeedData = spark.read.format("org.apache.spark.sql.cassandra")
.option("keyspace", "sunbird").option("table", "TABLE1").load();
print_Writer.write("User Feed Table records:"+ userFeedData.count() );
//Persisting user feed data into memory
userFeedData.persist()
val userFeedWithNewUUID = userFeedData
.withColumn("newId",expr("uuid()"))
.withColumn("action", myColUpdate(userFeedData("data"),
userFeedData("createdby"), userFeedData("category")))
userFeedWithNewUUID.persist()
val userFeedV2Format = userFeedWithNewUUID.select(
col("newId"),col("category"),col("createdby"),
col("createdon"),col("action"),col("expireon"),
col("priority"),col("status"),col("updatedby"),
col("updatedon"),col("userid"))
.withColumnRenamed("newId","id")
.withColumn("version",lit("v2").cast(StringType))
//Persist v2 format data to memory
userFeedV2Format.persist()
print_Writer.write("User Feed V2 Format records:"+ userFeedV2Format.count() );
userFeedV2Format.write.format("org.apache.spark.sql.cassandra")
.option("keyspace", "sunbird_notifications")
.option("table", "TABLE2")
.mode(SaveMode.Append).save();
//Remove from memory
userFeedV2Format.unpersist()
userFeedData.unpersist()
print_Writer.close()
}
def myColUpdate= udf((data: String, createdby: String, category: String)=> {
val jsonMap = parse(data).values.asInstanceOf[Map[String, Object]]
val actionDataMap = new HashMap[String, Object]
val additionalInfo = new HashMap[String,Object]
val dataTemplate = new HashMap[String,String]
val templateMap = new HashMap[String,Object]
val createdByMap = new HashMap[String,Object]
createdByMap("type")="System"
createdByMap("id")=createdby
var actionType: String = null
for((k,v)<-jsonMap){
if(k == "actionData"){
val actionMap = v.asInstanceOf[Map[String,Object]]
if(actionMap.contains("actionType")){
actionType = actionMap("actionType").asInstanceOf[String]
}
for((k1,v1)<-actionMap){
if(k1 == "title" || k1 == "description"){
dataTemplate(k1)=v1.asInstanceOf[String]
}else{
additionalInfo(k1)=v1
}
}
}else{
additionalInfo(k)=v
}
}
val mapper = new ObjectMapper()
mapper.registerModule(DefaultScalaModule)
templateMap("data")=mapper.writeValueAsString(dataTemplate)
templateMap("ver")="4.4.0"
templateMap("type")="JSON"
actionDataMap("type")=actionType
actionDataMap("category")=category.asInstanceOf[String]
actionDataMap("createdBy")=createdByMap;
actionDataMap("template") =templateMap;
actionDataMap("additionalInfo")=additionalInfo
mapper.writeValueAsString(actionDataMap)
})
}
It is getting stuck; TABLE1 has 40 million rows.
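One hedged optimization sketch for the UDF above (the helper name here is illustrative): it builds a new Jackson ObjectMapper for every row, which is expensive across 40 million rows. Creating the mapper once per JVM in a holder object and reusing it inside the UDF avoids that cost.

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule

// Hypothetical helper: one ObjectMapper per executor JVM instead of one per row.
object JsonMapperHolder extends Serializable {
  lazy val mapper: ObjectMapper = {
    val m = new ObjectMapper()
    m.registerModule(DefaultScalaModule)
    m
  }
}

// Inside myColUpdate, replace the per-row
//   val mapper = new ObjectMapper(); mapper.registerModule(DefaultScalaModule)
// with
//   val mapper = JsonMapperHolder.mapper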

How to fix 'No symbol could be loaded from org.apache.hbase.classification.InterfaceAudience'?

I'm trying to prepare a DataFrame to be stored in HFile format on HBase using Apache Spark. I'm using Spark 2.1.0, Scala 2.11 and HBase 1.1.2.
Here is my code:
val df = createDataframeFromRow(Row("mlk", "kpo", "opi"), "a b c")
val cols = df.columns.sorted
val colsorteddf = df.select(cols.map(x => col(x)): _*)
val valcols = cols.filterNot(x => x.equals("U_ID"))
So far so good; I only sort the columns of my DataFrame.
val pdd = colsorteddf.map(row => {
(row(0).toString, (row(1).toString, row(2).toString))
})
val tdd = pdd.flatMap(x => {
val rowKey = PLong.INSTANCE.toBytes(x._1)
for(i <- 0 until valcols.length - 1) yield {
val colname = valcols(i).toString
val colvalue = x._2.productElement(i).toString
val colfam = "data"
(rowKey, (colfam, colname, colvalue))
}
})
After this, I transform each row into the key-value format (rowKey, (colfam, colname, colvalue)).
Now here's where the problem happens: I try to map each row of tdd into a pair of (ImmutableBytesWritable, KeyValue).
import org.apache.hadoop.hbase.KeyValue
val output = tdd.map(x => {
val rowKey: Array[Byte] = x._1
val immutableRowKey = new ImmutableBytesWritable(rowKey)
val colfam = x._2._1
val colname = x._2._2
val colvalue = x._2._3
val kv = new KeyValue(
rowKey,
colfam.getBytes(),
colname.getBytes(),
Bytes.toBytes(colvalue.toString)
)
(immutableRowKey, kv)
})
This produces the following stack trace:
java.lang.AssertionError: assertion failed: no symbol could be loaded from interface org.apache.hadoop.hbase.classification.InterfaceAudience$Public in object InterfaceAudience with name Public and classloader scala.reflect.internal.util.ScalaClassLoader$URLClassLoader#3269cbb7
at scala.reflect.runtime.JavaMirrors$JavaMirror.scala$reflect$runtime$JavaMirrors$JavaMirror$$classToScala1(JavaMirrors.scala:1021)
at scala.reflect.runtime.JavaMirrors$JavaMirror$$anonfun$classToScala$1.apply(JavaMirrors.scala:980)
at scala.reflect.runtime.JavaMirrors$JavaMirror$$anonfun$classToScala$1.apply(JavaMirrors.scala:980)
at scala.reflect.runtime.JavaMirrors$JavaMirror$$anonfun$toScala$1.apply(JavaMirrors.scala:97)
at scala.reflect.runtime.TwoWayCaches$TwoWayCache$$anonfun$toScala$1.apply(TwoWayCaches.scala:38)
at scala.reflect.runtime.Gil$class.gilSynchronized(Gil.scala:19)
at scala.reflect.runtime.JavaUniverse.gilSynchronized(JavaUniverse.scala:16)
at scala.reflect.runtime.TwoWayCaches$TwoWayCache.toScala(TwoWayCaches.scala:33)
at scala.reflect.runtime.JavaMirrors$JavaMirror.toScala(JavaMirrors.scala:95)
at scala.reflect.runtime.JavaMirrors$JavaMirror.classToScala(JavaMirrors.scala:980)
at scala.reflect.runtime.JavaMirrors$JavaMirror$JavaAnnotationProxy.<init>(JavaMirrors.scala:163)
at scala.reflect.runtime.JavaMirrors$JavaMirror$JavaAnnotationProxy$.apply(JavaMirrors.scala:162)
at scala.reflect.runtime.JavaMirrors$JavaMirror$JavaAnnotationProxy$.apply(JavaMirrors.scala:162)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:186)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:186)
at scala.reflect.runtime.JavaMirrors$JavaMirror.scala$reflect$runtime$JavaMirrors$JavaMirror$$copyAnnotations(JavaMirrors.scala:683)
at scala.reflect.runtime.JavaMirrors$JavaMirror$FromJavaClassCompleter.load(JavaMirrors.scala:733)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anonfun$typeParams$1.apply(SynchronizedSymbols.scala:140)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anonfun$typeParams$1.apply(SynchronizedSymbols.scala:133)
at scala.reflect.runtime.Gil$class.gilSynchronized(Gil.scala:19)
at scala.reflect.runtime.JavaUniverse.gilSynchronized(JavaUniverse.scala:16)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$class.gilSynchronizedIfNotThreadsafe(SynchronizedSymbols.scala:123)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$8.gilSynchronizedIfNotThreadsafe(SynchronizedSymbols.scala:168)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$class.typeParams(SynchronizedSymbols.scala:132)
at scala.reflect.runtime.SynchronizedSymbols$SynchronizedSymbol$$anon$8.typeParams(SynchronizedSymbols.scala:168)
at scala.reflect.internal.Types$NoArgsTypeRef.typeParams(Types.scala:1926)
at scala.reflect.internal.Types$NoArgsTypeRef.isHigherKinded(Types.scala:1925)
at scala.reflect.internal.transform.UnCurry$class.scala$reflect$internal$transform$UnCurry$$expandAlias(UnCurry.scala:22)
at scala.reflect.internal.transform.UnCurry$$anon$2.apply(UnCurry.scala:26)
at scala.reflect.internal.transform.UnCurry$$anon$2.apply(UnCurry.scala:24)
at scala.collection.immutable.List.loop$1(List.scala:173)
at scala.collection.immutable.List.mapConserve(List.scala:189)
at scala.reflect.internal.tpe.TypeMaps$TypeMap.mapOver(TypeMaps.scala:115)
at scala.reflect.internal.transform.UnCurry$$anon$2.apply(UnCurry.scala:46)
at scala.reflect.internal.transform.Transforms$class.transformedType(Transforms.scala:43)
at scala.reflect.internal.SymbolTable.transformedType(SymbolTable.scala:16)
at scala.reflect.internal.Types$TypeApiImpl.erasure(Types.scala:225)
at scala.
It seems like a Scala reflection issue. Has anyone run into the same problem? If so, how did you overcome it?
PS: I'm running this code through spark-shell.
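A hedged workaround sketch: Dataset.map needs an implicit Encoder for (ImmutableBytesWritable, KeyValue), and deriving one makes Scala reflection inspect the HBase classes, which is where the InterfaceAudience assertion fails. Dropping to the RDD API before the final map avoids encoders entirely, and an RDD of (ImmutableBytesWritable, KeyValue) is what the HFile bulk-load path expects anyway.

import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes

val output = tdd.rdd.map { x =>
  val rowKey: Array[Byte] = x._1
  val (colfam, colname, colvalue) = x._2
  val kv = new KeyValue(rowKey, colfam.getBytes(), colname.getBytes(), Bytes.toBytes(colvalue))
  (new ImmutableBytesWritable(rowKey), kv)
}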

Spark Streaming Scala performance drastically slow

I have the following code:
case class event(imei: String, date: String, gpsdt: String,dt: String,id: String)
case class historyevent(imei: String, date: String, gpsdt: String)
object kafkatesting {
def main(args: Array[String]) {
val clients = new RedisClientPool("192.168.0.40", 6379)
val conf = new SparkConf()
.setAppName("KafkaReceiver")
.set("spark.cassandra.connection.host", "192.168.0.40")
.set("spark.cassandra.connection.keep_alive_ms", "20000")
.set("spark.executor.memory", "3g")
.set("spark.driver.memory", "4g")
.set("spark.submit.deployMode", "cluster")
.set("spark.executor.instances", "4")
.set("spark.executor.cores", "3")
.set("spark.streaming.backpressure.enabled", "true")
.set("spark.streaming.backpressure.initialRate", "100")
.set("spark.streaming.kafka.maxRatePerPartition", "7")
val sc = SparkContext.getOrCreate(conf)
val ssc = new StreamingContext(sc, Seconds(10))
val sqlContext = new SQLContext(sc)
val kafkaParams = Map[String, String](
"bootstrap.servers" -> "192.168.0.113:9092",
"group.id" -> "test-group-aditya",
"auto.offset.reset" -> "largest")
val topics = Set("random")
val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
kafkaStream.foreachRDD { rdd =>
val updatedRDD = rdd.map(a =>
{
implicit val formats = DefaultFormats
val jValue = parse(a._2)
val fleetrecord = jValue.extract[historyevent]
val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())
event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)
})
.collect()
updatedRDD.foreach(f =>
{
clients.withClient {
client =>
{
val value = f.imei + " , " + f.gpsdt
val zscore = Calendar.getInstance().getTimeInMillis
val key = new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime())
val dt = new SimpleDateFormat("HH:mm:ss").format(Calendar.getInstance().getTime())
val q1 = "00:00:00"
val q2 = "06:00:00"
val q3 = "12:00:00"
val q4 = "18:00:00"
val quater = if (dt > q1 && dt < q2) {
System.out.println(dt + " lies in quarter 1");
" -> 1"
} else if (dt > q2 && dt < q3) {
System.out.println(dt + " lies in quarter 2");
" -> 2"
} else if (dt > q3 && dt < q4) {
System.out.println(dt + " lies in quarter 3");
" -> 3"
} else {
System.out.println(dt + " lies in quarter 4");
" -> 4"
}
client.zadd(key + quater, zscore, value)
println(f.toString())
}
}
})
val collection = sc.parallelize(updatedRDD)
collection.saveToCassandra("db", "table", SomeColumns("imei", "date", "gpsdt","dt","id"))
}
ssc.start()
ssc.awaitTermination()
}
}
I'm using this code to insert data from Kafka into Cassandra and Redis, but I'm facing the following issues:
1) The application creates a long queue of active batches while the previous batch is still being processed, so I want the next batch to start only once the previous batch has finished executing.
2) I have a four-node cluster processing each batch, but it takes around 30-40 seconds to execute 700 records.
Is my code optimized, or do I need to rework it for better performance?
Yes, you can do all of your processing inside mapPartitions. There are APIs from DataStax that allow you to save the DStream directly. Here is how you can do it for C*:
val partitionedDstream = kafkaStream.repartition(5) //change this value as per your data and spark cluster
//Now instead of iterating each RDD work on each partition.
val eventsStream: DStream[event] = partitionedDstream.mapPartitions(x => {
val lst = scala.collection.mutable.ListBuffer[event]()
while (x.hasNext) {
val a = x.next()
implicit val formats = DefaultFormats
val jValue = parse(a._2)
val fleetrecord = jValue.extract[historyevent]
val hash = fleetrecord.imei + fleetrecord.date + fleetrecord.gpsdt
val md5Hash = DigestUtils.md5Hex(hash).toUpperCase()
val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(Calendar.getInstance().getTime())
lst += event(fleetrecord.imei, fleetrecord.date, fleetrecord.gpsdt, now, md5Hash)
}
lst.toList.iterator
})
eventsStream.cache() //because you are using same Dstream for C* and Redis
//instead of collecting each RDD save whole Dstream at once
import com.datastax.spark.connector.streaming._
eventsStream.saveToCassandra("db", "table", SomeColumns("imei", "date", "gpsdt", "dt", "id"))
Also, Cassandra accepts a timestamp as a Long value, so you can change that part of your code as below:
val now = System.currentTimeMillis()
//also change your case class to take `Long` instead of `String`
case class event(imei: String, date: String, gpsdt: String, dt: Long, id: String)
You can make a similar change for Redis as well, writing from each partition instead of collecting to the driver; see the sketch below.
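A hedged sketch of the Redis side, assuming the same RedisClientPool as in the question and leaving out the per-quarter key suffix for brevity: create the pool inside each partition so it lives on the executors, rather than collecting every batch to the driver.

eventsStream.foreachRDD { rdd =>
  rdd.foreachPartition { partition =>
    // One pool per partition/executor; a driver-side pool captured in the closure does not serialize well.
    val clients = new RedisClientPool("192.168.0.40", 6379)
    partition.foreach { f =>
      clients.withClient { client =>
        val value = f.imei + " , " + f.gpsdt
        val zscore = Calendar.getInstance().getTimeInMillis
        val key = new SimpleDateFormat("yyyy-MM-dd").format(Calendar.getInstance().getTime())
        client.zadd(key, zscore, value)
      }
    }
  }
}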

Saving Spark DataFrames into a database via SparkSQL works in "local[*]" but not in YARN mode

I process a set of files using Spark. The results, after conversion to a Spark DataFrame, should be saved to a database. The following code works when Spark runs in "local[*]" mode, but when I run it on a cluster in YARN mode, processing ends without errors (except for some errors at the very beginning) and the database remains empty.
import java.sql.{Connection, DriverManager, Timestamp, SQLException}
import java.util.Properties
import org.apache.spark.sql.SparkSession
import scala.collection.JavaConverters._
import java.util.Calendar
import scala.collection.mutable.ListBuffer
import com.qbeats.cortex.library.{PartialDateTime, TimeExtractor}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._
object CommoncrawlExtractor extends App {
var driver: String = null
var connectionString: String = null
var helper: Helper = null
var sc: SparkContext = null
var pte = sc.broadcast(new TimeExtractor)
def uncertainty = 60 * 60 * 12
case class SectionData(warcinfoID: String, recordID: String, sectionName: Int,
timestamp: Timestamp, uncertainty: Int, wordsets: Array[Array[String]])
case class Word(word: String)
case class Wordset(section_id: Int, wordset: Seq[Int])
def dropFirst(iterator: Iterator[String]): Iterator[String] = {
if (iterator.hasNext) {
iterator.next
}
iterator
}
def extractSentences(entity: String) = {
val result = ListBuffer[(String, String, Int, Timestamp, Int, Array[Array[String]])]()
val warcinfoIDPattern = """WARC-Warcinfo-ID: <urn:uuid:(.+)>""".r
val warcinfoID = warcinfoIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
val recordIDPattern = """WARC-Record-ID: <urn:uuid:(.+)>""".r
val recordID = recordIDPattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
val requestTimePattern = """WARC-Date: (.+)""".r
val requestTimeString = requestTimePattern.findFirstMatchIn(entity).map(_ group 1).getOrElse("")
val requestTimeFormat = new java.text.SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'")
val requestTime = requestTimeFormat.parse(requestTimeString)
var cal: Calendar = Calendar.getInstance()
cal.setTime(requestTime)
val referenceDate1 = new PartialDateTime(cal, null)
val contentPattern = """(?s)\r\nHTTP/1\.. 200(.+?)(\r\n){2,}(.+)WARC/1.0\r\nWARC-Type: metadata""".r
val contentString = contentPattern.findFirstMatchIn(entity).map(_ group 3).getOrElse("")
try {
val de = pte.value.extractTimes(contentString)
if (de.getEntries != null) {
for (entry <- de.getEntries.asScala) {
val pdt = entry.resolve(12 * 3600, referenceDate1)
if (pdt != null) {
val sectionWordsets = entry.getSentences.asScala.map(x => x.getTokens.asScala.toArray[String]).toArray
val sectionData = (
warcinfoID, recordID, entry.getId,
new Timestamp(pdt.secondsSinceEpoch * 1000), pdt.uncertaintyInterval.toInt, sectionWordsets
)
result += sectionData
}
}
}
} catch {
case e: Exception => println("\n" + "-" * 100 + "\n" + entity)
}
result
}
def initDB() = {
driver = "org.postgresql.Driver"
connectionString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl?user=postgres&password=postgres"
Class.forName(driver)
}
def prepareDB() = {
var conn: Connection = null
try {
conn = DriverManager.getConnection(connectionString)
val statement = conn.createStatement()
val tableResultSet = statement.executeQuery(
"""
|SELECT table_name
| FROM information_schema.tables
| WHERE table_schema='public'
| AND table_type='BASE TABLE';
""".stripMargin)
val tablesToDelete = ListBuffer[String]()
while (tableResultSet.next()) {
tableResultSet.getString("table_name") match {
case "warcinfo" => tablesToDelete.append("warcinfo")
case "record" => tablesToDelete.append("record")
case "section" => tablesToDelete.append("section")
case "word" => tablesToDelete.append("word")
case "wordset" => tablesToDelete.append("wordset")
case _ =>
}
}
for (tableName <- tablesToDelete) statement.executeUpdate("DROP TABLE " + tableName + ";")
val storedProcedureResultSet = statement.executeQuery(
"""
|SELECT proname, prosrc
|FROM pg_catalog.pg_namespace n
|JOIN pg_catalog.pg_proc p
|ON pronamespace = n.oid
|WHERE nspname = 'public';
""".stripMargin)
val storedProcedureDeletions = ListBuffer[String]()
while (storedProcedureResultSet.next()) {
storedProcedureResultSet.getString("proname") match {
case "update_word_ids" =>
storedProcedureDeletions.append("DROP FUNCTION update_word_ids();")
case _ =>
}
}
statement.executeUpdate("DROP TRIGGER IF EXISTS update_word_ids_trigger ON wordset_occurrence;")
for (storedProcedureDeletion <- storedProcedureDeletions) statement.executeUpdate(storedProcedureDeletion)
statement.executeUpdate(
"""
|CREATE TABLE warcinfo (
| warcinfo_id serial PRIMARY KEY,
| batch_name varchar NOT NULL,
| warcinfo_uuid char(36) NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE record (
| record_id serial PRIMARY KEY,
| record_uuid char(36) NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE section (
| section_id serial PRIMARY KEY,
| record_id integer NOT NULL,
| section_name integer NOT NULL,
| timestamp timestamp NOT NULL,
| uncertainty integer NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE word (
| word_id serial PRIMARY KEY,
| word varchar NOT NULL
|);
""".stripMargin)
statement.executeUpdate(
"""
|CREATE TABLE wordset (
| section_id integer NOT NULL,
| wordset integer ARRAY
|);
""".stripMargin)
} catch {
case e: SQLException => println("exception caught: " + e)
} finally {
if (conn != null) conn.close()
}
}
def processFile(fileNames: Array[String], accessKeyId: String = "", secretAccessKey: String = ""): Unit = {
val delimiter = "WARC/1.0\r\nWARC-Type: request\r\n"
pte = sc.broadcast(new TimeExtractor)
val spark = SparkSession
.builder()
.appName("CommoncrawlExtractor")
.getOrCreate()
import spark.implicits._
val connString = "jdbc:postgresql://lv-ws10.lviv:5432/commoncrawl"
val prop = new Properties()
prop.put("user", "postgres")
prop.put("password", "postgres")
val entities = sc.
textFile(fileNames.mkString(",")).
mapPartitions(dropFirst).
map(delimiter + _).
flatMap(extractSentences).
map(x => SectionData(x._1, x._2, x._3, x._4, x._5, x._6)).toDF().
cache()
val warcinfo = entities.select("warcinfoID").distinct().
withColumnRenamed("warcinfoID", "warcinfo_uuid").
withColumn("batch_name", lit("June 2016, batch 1"))
val warcinfoWriter = warcinfo.write.mode("append")
println("Saving warcinfo.")
println(Calendar.getInstance().getTime)
warcinfoWriter.jdbc(connString, "warcinfo", prop)
println(Calendar.getInstance().getTime)
val record = entities.select("recordID").distinct().
withColumnRenamed("recordID", "record_uuid")
val recordWriter = record.write.mode("append")
println("Saving records.")
println(Calendar.getInstance().getTime)
recordWriter.jdbc(connString, "record", prop)
println(Calendar.getInstance().getTime)
val recordFull = spark.read.
format("jdbc").
options(Map("url" -> connString, "dbtable" -> "public.record", "user" -> "postgres", "password" -> "postgres")).
load().cache()
val section = entities.
join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
select("record_id", "sectionName", "timestamp", "uncertainty").distinct().
withColumnRenamed("sectionName", "section_name")
val sectionWriter = section.write.mode("append")
println("Saving sections.")
println(Calendar.getInstance().getTime)
sectionWriter.jdbc(connString, "section", prop)
println(Calendar.getInstance().getTime)
val sectionFull = spark.read.
format("jdbc").
options(Map("url" -> connString, "dbtable" -> "public.section", "user" -> "postgres", "password" -> "postgres")).
load()
val word = entities.
select("wordsets").
flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").flatten).
distinct().
map(Word(_))
val wordWriter = word.write.mode("append")
wordWriter.jdbc(connString, "word", prop)
val wordFull = spark.read.
format("jdbc").
options(Map("url" -> connString, "dbtable" -> "public.word", "user" -> "postgres", "password" -> "postgres")).
load().
map(row => (row.getAs[String]("word"), row.getAs[Int]("word_id"))).
collect().
toMap
val wordsetTemp = entities.
join(recordFull, entities.col("recordID").equalTo(recordFull("record_uuid"))).
withColumnRenamed("sectionName", "section_name")
val wordset = wordsetTemp.
join(sectionFull, Seq("record_id", "section_name")).
select("section_id", "wordsets").
flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").map(x => Wordset(r.getAs[Int]("section_id"), x.map(wordFull))))
val wordsetWriter = wordset.write.mode("append")
println("Saving wordsets.")
println(Calendar.getInstance().getTime)
wordsetWriter.jdbc(connString, "wordset", prop)
println(Calendar.getInstance().getTime)
// entities.saveAsTextFile(helper.outputDirectory + "xyz")
sc.stop
}
override def main(args: Array[String]): Unit = {
if (args.length >= 2) {
initDB()
prepareDB()
helper = new Helper
val files =
if (args(0).startsWith("hdfs://")) helper.getHDFSFiles(args(0)).slice(0, args(3).toInt)
else helper.getLocalFiles(args(0))
val appName = "CommoncrawlExtractor"
val conf = new SparkConf().setAppName(appName)
if (args(0).startsWith("hdfs://")) {
conf.set("spark.executor.instances", args(1))
conf.set("spark.executor.cores", args(2))
} else conf.setMaster(args(1))
sc = new SparkContext(conf)
val delimiter = "WARC/1.0\r\nWARC-Type: request"
sc.hadoopConfiguration.set("textinputformat.record.delimiter", delimiter)
processFile(files)
}
}
}
I copied postgresql-9.4.1209.jre7.jar to /home/user/Programs/libs on every machine in the cluster and used the following command (run from Spark's directory):
./bin/spark-submit --master yarn --deploy-mode client --driver-class-path /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --jars /home/user/Programs/libs/postgresql-9.4.1209.jre7.jar --conf "spark.driver.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" --conf "spark.executor.extraClassPath=/home/user/Programs/libs/postgresql-9.4.1209.jre7.jar" spark-cortex-fat.jar hdfs://LV-WS10.lviv:9000/commoncrawl 2 4 8
Please suggest how I can make it work on the cluster.
ADDED LATER:
I discovered that these lines
val warcinfo = entities.select("warcinfoID").
withColumnRenamed("warcinfoID", "warcinfo_uuid").
withColumn("batch_name", lit("June 2016, batch 1"))
val warcinfoWriter = warcinfo.write.mode("append")
println("Saving warcinfo.")
println(Calendar.getInstance().getTime)
warcinfoWriter.jdbc(connString, "warcinfo", prop)
println(Calendar.getInstance().getTime)
lead to this exception:
16/09/01 17:31:51 WARN scheduler.TaskSetManager: Lost task 0.1 in stage 1.0 (TID 5, LV-WS09): org.apache.spark.storage.BlockFetchException: Failed to fetch block after 1 fetch failures. Most recent failure cause:
at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:565)
at org.apache.spark.storage.BlockManager.getRemoteValues(BlockManager.scala:522)
at org.apache.spark.storage.BlockManager.get(BlockManager.scala:609)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:661)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:330)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:281)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:319)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:283)
at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:96)
at org.apache.spark.rdd.CoalescedRDD$$anonfun$compute$1.apply(CoalescedRDD.scala:95)
at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:434)
at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:440)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1203)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1203)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1325)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1211)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1190)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70)
at org.apache.spark.scheduler.Task.run(Task.scala:85)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.SparkException: Exception thrown in awaitResult:
at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:194)
at org.apache.spark.network.BlockTransferService.fetchBlockSync(BlockTransferService.scala:104)
at org.apache.spark.storage.BlockManager.getRemoteBytes(BlockManager.scala:554)
... 31 more
Caused by: java.io.IOException: Failed to connect to ubuntu-cluster-4/192.168.100.139:36378
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:228)
at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:179)
at org.apache.spark.network.netty.NettyBlockTransferService$$anon$1.createAndStart(NettyBlockTransferService.scala:96)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.fetchAllOutstanding(RetryingBlockFetcher.java:140)
at org.apache.spark.network.shuffle.RetryingBlockFetcher.access$200(RetryingBlockFetcher.java:43)
at org.apache.spark.network.shuffle.RetryingBlockFetcher$1.run(RetryingBlockFetcher.java:170)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
... 3 more
Caused by: java.net.ConnectException: Connection refused: ubuntu-cluster-4/192.168.100.139:36378
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:717)
at io.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:224)
at io.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:289)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:528)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
... 1 more
However, some records do get stored in the database.
What would you suggest?
ADDED LATER:
I looked at the YARN logs on the node that stopped responding, but they weren't helpful.
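A hedged aside on the code in this question (not necessarily the cause of the empty database): wordFull is collected to the driver and then captured in the flatMap closure that builds wordset, so the whole word map is serialized into every task. Broadcasting it once keeps task sizes small; the sketch below reuses the names from the code above.

val wordLookup = sc.broadcast(wordFull)

val wordset = wordsetTemp.
  join(sectionFull, Seq("record_id", "section_name")).
  select("section_id", "wordsets").
  flatMap(r => r.getAs[Seq[Seq[String]]]("wordsets").map(x => Wordset(r.getAs[Int]("section_id"), x.map(wordLookup.value))))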