I need to parse text with Spark.
The text file looks like this:
Struct{event=x,key=x,cas=x,content={"recordData":{JSON}}
I used my UDF, but the Spark executor gets OOM-killed and the task fails. How can I improve performance?
import java.util
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.functions.udf

case class RecordData(event: String, key: String, cas: String, content: String)

def getKafkaMessageHeader = udf((value: String) => {
  val segments: Array[String] = value.split("content=", 2)
  val output: util.Map[String, String] = new util.HashMap[String, String]
  var tmp: String = null
  if (segments.length == 2) {
    tmp = StringUtils.substringBetween(segments(0), "cas=", ",")
    output.put("cas", StringUtils.substring(tmp, 0, 16))
    output.put("event", StringUtils.substringBetween(segments(0), "event=", ","))
    output.put("key", StringUtils.substringBetween(segments(0), "key=", ","))
    output.put("content", StringUtils.chop(segments(1)))
  } else if (segments.length == 1) {
    tmp = StringUtils.substringBetween(segments(0), "cas=", "}")
    output.put("cas", StringUtils.substring(tmp, 0, 16))
    output.put("event", StringUtils.substringBetween(segments(0), "event=", ","))
    output.put("key", StringUtils.substringBetween(segments(0), "key=", ","))
    output.put("content", null)
  } else {
    output.put("cas", null)
    output.put("event", null)
    output.put("key", null)
    output.put("content", null)
  }
  // Return a case class so the UDF produces a struct column.
  RecordData(
    event = output.get("event"),
    key = output.get("key"),
    cas = output.get("cas"),
    content = output.get("content"))
})
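If the UDF itself is the bottleneck, one option is to skip it entirely and let Spark's built-in functions do the extraction. This is a minimal sketch, not your exact pipeline: it assumes the raw text sits in a DataFrame df in a string column named "value" (both names are assumptions), and it mimics the chop of the trailing "}" with a regex.

import org.apache.spark.sql.functions.{col, regexp_extract}

// Extract each field with regexp_extract; no per-row HashMap is allocated
// and Catalyst can code-generate the whole projection.
val parsed = df
  .withColumn("event",   regexp_extract(col("value"), "event=([^,]*)", 1))
  .withColumn("key",     regexp_extract(col("value"), "key=([^,]*)", 1))
  .withColumn("cas",     regexp_extract(col("value"), "cas=([^,}]*)", 1).substr(1, 16))
  .withColumn("content", regexp_extract(col("value"), "content=(.*)\\}", 1))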
I define a val like this:
val config = Config(args)
val product_type = config.product_type
Then I pass product_type as "AA", and my code is this:
val scores = df.mapPartitions(iterator => {
val inputStream =
if(product_type == "AA" ) {
getClass().getClassLoader().getResourceAsStream("my_aa.hdf5")
}
else {
getClass().getClassLoader().getResourceAsStream("my_bb.hdf5")
}
val multiLayerNetwork: MultiLayerNetwork = KerasModelImport.importKerasSequentialModelAndWeights(inputStream, false)
val wrapped: ParallelInference = new ParallelInference.Builder(multiLayerNetwork).build()
val res = iterator.map(row => {
wrapped.output(row).toDoubleVector
})
res
})
But my inputStream ends up pointing to "my_bb.hdf5", which is not correct; that value comes from the else branch. So why can't my product_type variable be read inside mapPartitions?
I print my product_type value right before this code and checked it; it is "AA".
It seems to happen because I get this variable from an argument passed to spark-submit.sh, and it cannot be read from inside mapPartitions.
It works like this:
val scores =
if (product_type == "AA") {
df.mapPartitions(iterator => {
val inputStream = getClass().getClassLoader().getResourceAsStream("AA.hdf5")
val multiLayerNetwork: MultiLayerNetwork = KerasModelImport.importKerasSequentialModelAndWeights(inputStream, false)
val wrapped: ParallelInference = new ParallelInference.Builder(multiLayerNetwork).build()
val res = iterator.map(row => {
wrapped.output(row).toDoubleVector
})
res
})
} else {
df.mapPartitions(iterator => {
val inputStream = getClass().getClassLoader().getResourceAsStream("BB.hdf5")
val multiLayerNetwork: MultiLayerNetwork = KerasModelImport.importKerasSequentialModelAndWeights(inputStream, false)
val wrapped: ParallelInference = new ParallelInference.Builder(multiLayerNetwork).build()
val res = iterator.map(row => {
wrapped.output(row).toDoubleVector
})
res
})
}
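If you want to avoid duplicating the mapPartitions body, here is a minimal sketch of an alternative. It assumes the underlying issue is that config / product_type is not usable inside the executor closure, so it resolves the resource name on the driver first and lets the closure capture only a plain String.

// Decide on the driver; the closure then captures just a serializable String.
val modelResource = if (product_type == "AA") "my_aa.hdf5" else "my_bb.hdf5"

val scores = df.mapPartitions { iterator =>
  val inputStream = getClass.getClassLoader.getResourceAsStream(modelResource)
  val multiLayerNetwork: MultiLayerNetwork =
    KerasModelImport.importKerasSequentialModelAndWeights(inputStream, false)
  val wrapped: ParallelInference = new ParallelInference.Builder(multiLayerNetwork).build()
  iterator.map(row => wrapped.output(row).toDoubleVector)
}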
I am trying to read data from MySQL, but it is throwing a NullPointerException. I am not sure what the reason is.
code in main.scala
object main extends App {
val dt = args.lift(0)
if (dt.isEmpty || !PairingbatchUtil.validatePartitionDate(dt.get)) {
throw new Exception("Partition date is mandatory or enter valid format 'yyyy-MM-dd'")
}
var mailProperties:Properties = new Properties
var templateMappingData: Map[String, Object] = Map(
"job" -> "Load merchant count Data from hdfs to mongo",
"jobProcessedDate" -> dt.get,
"batch" -> "Pairing Batch")
val startTime = System.currentTimeMillis()
try {
val conf = new SparkConf().setAppName("read_from_mysql") //.setMaster("local")
conf.set("spark.sql.warehouse.dir", "/user/local/warehouse/")
conf.set("hive.exec.dynamic.partition", "true")
conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
conf.set("spark.mongodb.input.uri", "mongodb://127.0.0.1/db.table_name")
conf.set("spark.mongodb.output.uri", "mongodb://127.0.0.1/db.table_name")
val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
val schemaName = "/user/local/warehouse/"
val aid = "1000"
val resultPath = "/usr/local/proad" + "/" + dt.get
val dbDataPartitionsMap = Map("aid" -> aid, "dt" -> dt.get)
spark.sql("set aid=" + aid)
spark.sql("set dt=" + dt.get)
val configs = spark.sparkContext.getConf.getAll
configs.foreach(i => println(i))
val registerBaseTablesMap = Map(
"DAILY_COUNT" -> ("SELECT * FROM " + schemaName + ".table_name WHERE aid = '${aid}' and dt ='${dt}'"),
"DAILY_COUNT_FINAL" -> ("SELECT * FROM " + schemaName + ".second_table_name WHERE aid = '${aid}' and dt ='${dt}'"))
val parentDF = PairingbatchUtil.readDataFromHive(registerBaseTablesMap.get("DAILY_COUNT").get, spark)
val finalMerchantAffiliateDailyCountDF = Processor.process(parentDF, dbDataPartitionsMap, spark)
}
code in Processor.scala
object Processor {
case class MerchantDailyCount( _id: String, date: Date, totalClicks: String, totalLinks: String, shopUrl: String, shopUUID: String, shopName: String, publisherId: String)
def process(parentDF: DataFrame, partitionsMap: Map[String, String], spark: SparkSession): DataFrame = {
val schemaString = "_id date total_clicks total_links shop_url shop_uuid shop_name publisher_id"
val fields = schemaString.split(" ")
.map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
var finalDF = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
parentDF.foreach(row => {
if (parentDF == null || row.getAs("publisher_id") == null || StringUtils.isBlank(row.getAs("shop_uuid"))) {
} else {
val shopUUID = row.getAs("shop_uuid").toString
val currentDate = row.getAs("cur_date").toString
val date = PairingbatchUtil.parseDate(currentDate, Constants.DATE_FORMAT_YYYY_MM_DD, Constants.TAIWAN_TIMEZONE)
val publisherId = row.getAs("publisher_id").toString
val totalClicks = row.getAs("total_clicks").toString
val totalLinks = row.getAs("total_links").toString
val (shopUrl, shopName) = PairingbatchUtil.setShopUrlInfo(shopUUID, "com.mysql.jdbc.Driver", "user_mame", "password", s"""select shop_url, shop_name from db.table_name where shop_uuid ='$shopUUID'""", "shopUrl", spark)
val id = PairingbatchUtil.isNeedToSet(spark, shopUrl, publisherId, date)
val merchantDailyCount = MerchantDailyCount(id, date, totalClicks, totalLinks, shopUrl, shopUUID, shopName, publisherId)
import spark.implicits._
val merchantCountDF = Seq(merchantDailyCount).toDF()
finalDF = finalDF.union(merchantCountDF)
}
})
finalDF
}
}
code in PairingbatchUtil.scala:
def setShopUrlInfo(shopUUID: String, driverClass: String, user: String, pass: String, query: String, url: String, sparkSession: SparkSession)={
val merchantDetailsDF = sparkSession.read //line no 139
.format("jdbc")
.option("url", url)
.option("driver", driverClass)
.option("dbtable", s"( $query ) t")
.option("user",user)
.option("password", pass)
.load()
if (merchantDetailsDF.count() == 0) {
("INVALID SHOP URL","INVALID SHOP NAME")
}else {
(merchantDetailsDF.select(col = "shop_url").first().getAs("shop_url"),merchantDetailsDF.select(col = "shop_name").first().getAs("shop_name"))
}
}
I expect the output of the query to be:
+--------------+---------+
| shop_url|shop_name|
+--------------+---------+
| parimal | roy |
+--------------+---------+
but the actual output is:
19/07/04 14:48:50 ERROR executor.Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NullPointerException
at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:117)
at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:115)
at org.apache.spark.sql.DataFrameReader.<init>(DataFrameReader.scala:549)
at org.apache.spark.sql.SparkSession.read(SparkSession.scala:613)
at com.rakuten.affiliate.order.pairing.batch.util.PairingbatchUtil$.setShopUrlInfo(PairingbatchUtil.scala:139)
at com.rakuten.affiliate.order.pairing.batch.Processors.MechantAffDailyCountProcessor$$anonfun$process$1.apply(MechantAffDailyCountProcessor.scala:40)
at com.rakuten.affiliate.order.pairing.batch.Processors.MechantAffDailyCountProcessor$$anonfun$process$1.apply(MechantAffDailyCountProcessor.scala:30)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Are you using Spark 2.1?
In that case I think you might have a problem with your configuration, as you can see in the source at line 117:
https://github.com/apache/spark/blob/branch-2.1/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
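Not part of the original answer, but worth noting: the stack trace shows sparkSession.read being called inside parentDF.foreach, which runs on the executors, where the session's state is not initialized. Below is a sketch of one way to restructure, assuming the per-shop lookup can be expressed as a join; jdbcUrl, user and pass are placeholders, and the column names are taken from the question.

// Read the MySQL table once on the driver side, then join, instead of
// issuing one JDBC read per row inside foreach.
val shopDF = spark.read
  .format("jdbc")
  .option("url", jdbcUrl)                    // placeholder, e.g. "jdbc:mysql://host:3306/db"
  .option("driver", "com.mysql.jdbc.Driver")
  .option("dbtable", "db.table_name")
  .option("user", user)
  .option("password", pass)
  .load()
  .select("shop_uuid", "shop_url", "shop_name")

val joinedDF = parentDF.join(shopDF, Seq("shop_uuid"), "left")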
I have created a DataFrame from the sequence below.
val df = sc.parallelize(Seq((100,23,9.50),
(100,23,9.51),
(100,24,9.52),
(100,25,9.54),
(100,23,9.55),
(101,21,8.51),
(101,23,8.52),
(101,24,8.55),
(101,20,8.56))).toDF("id", "temp","time")
I wanted to update the DF by adding a few more rows where data is missing for some time values, so I iterated over the DF with mapPartitions to add the new rows.
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row, Column}
@transient val w = org.apache.spark.sql.expressions.Window.partitionBy("id").orderBy("time")
val leadDf = df.withColumn("time_diff", ((lead("time", 1).over(w) - df("time")).cast("Float")*100).cast("int"))
Dataframe iteration goes here:
val result = leadDf.rdd.mapPartitions(itr =>
new Iterator[Row] {
var prevRow = null: Row
var prevDone = true
var firstRow = true
var outputRow: Row = null: Row
var counter = 0
var currRecord = null :Row
var currRow: Row = if (itr.hasNext) {currRecord = itr.next; currRecord } else null
prevRow = currRow
override def hasNext: Boolean = {
if (!prevDone) {
prevRow = incrementValue(prevRow,2)
outputRow = prevRow
counter = counter -1
if(counter == 0) {
prevDone = true
}
true
} else if (itr.hasNext) {
prevRow = currRow
if(counter == 0 && prevRow.getAs[Int](3) != 1 && !isNullValue(prevRow,3 )){
outputRow = prevRow
counter = prevRow.getAs[Int](3) - 1
prevDone = false
}else if(counter > 0) {
counter = counter -1
prevDone = false
}
else {
outputRow = currRow
}
//if(counter == 0){
currRow = itr.next
true
} else if (currRow != null) {
outputRow = currRow
currRow =null
true
} else {
false
}
}
override def next(): Row = outputRow
})
val newDf = spark.createDataFrame(result,leadDf.schema)
After this, I can see 12 records in the DataFrame, but I get only 10 records in the physical table created from the temp table registered on the "newDf" DataFrame.
newDf.registerTempTable("test")
spark.sql("create table newtest as select * from test")
scala> newDf.count
res14: Long = 12
scala> spark.sql("select * from newtest").count
res15: Long = 10
The same code works fine in Spark 1.6, and the final table count matches the DataFrame record count.
Can someone explain why this is happening, and is there any solution or workaround?
I found a workaround: call the repartition method on the DataFrame newly created from the RDD[Row].
val newDf = spark.createDataFrame(result,leadDf.schema).repartition(result.getNumPartitions)
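For completeness, a quick sanity check built from the snippets above (a sketch; createOrReplaceTempView is the Spark 2 replacement for the deprecated registerTempTable):

val newDf = spark.createDataFrame(result, leadDf.schema).repartition(result.getNumPartitions)
newDf.createOrReplaceTempView("test")
spark.sql("create table newtest as select * from test")
// With the workaround applied, both counts should report 12.
println(newDf.count())
println(spark.sql("select * from newtest").count())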
I have this code:
val rdd = sc.textFile("sample.log")
val splitRDD = rdd.map(r => StringUtils.splitPreserveAllTokens(r, "\\|"))
val rdd2 = splitRDD.filter(...).map(row => createRow(row, fieldsMap))

sqlContext.createDataFrame(rdd2, structType).save(
  "org.apache.phoenix.spark", SaveMode.Overwrite, Map("table" -> table, "zkUrl" -> zkUrl))
def createRow(row: Array[String], fieldsMap: ListMap[Int, FieldConfig]): Row = {
//add additional index for invalidValues
val arrSize = fieldsMap.size + 1
val arr = new Array[Any](arrSize)
var invalidValues = ""
for ((k, v) <- fieldsMap) {
val valid = ...
var value : Any = null
if (valid) {
value = row(k)
// if (v.code == "SOURCE_NAME") --> 5th column in the row
// sourceNameCount = row(k).split(",").size
} else {
invalidValues += v.code + " : " + row(k) + " | "
}
arr(k) = value
}
arr(arrSize - 1) = invalidValues
Row.fromSeq(arr.toSeq)
}
fieldsMap contains the mapping of the input columns: (index, FieldConfig), where the FieldConfig class contains the "code" and "dataType" values.
TOPIC -> (0, v.code = "TOPIC", v.dataType = "String")
GROUP -> (1, v.code = "GROUP")
SOURCE_NAME1,SOURCE_NAME2,SOURCE_NAME3 -> (4, v.code = "SOURCE_NAME")
This is the sample.log:
TOPIC|GROUP|TIMESTAMP|STATUS|SOURCE_NAME1,SOURCE_NAME2,SOURCE_NAME3|
SOURCE_TYPE1,SOURCE_TYPE2,SOURCE_TYPE3|SOURCE_COUNT1,SOURCE_COUNT2,SOURCE_COUNT3|
DEST_NAME1,DEST_NAME2,DEST_NAME3|DEST_TYPE1,DEST_TYPE2,DEST_TYPE3|
DEST_COUNT1,DEST_COUNT2,DEST_COUNT3|
The goal is to split the input (sample.log) based on the number of source_name(s). In the example above, the output will have 3 rows:
TOPIC|GROUP|TIMESTAMP|STATUS|SOURCE_NAME1|SOURCE_TYPE1|SOURCE_COUNT1|
|DEST_NAME1|DEST_TYPE1|DEST_COUNT1|
TOPIC|GROUP|TIMESTAMP|STATUS|SOURCE_NAME2|SOURCE_TYPE2|SOURCE_COUNT2|
DEST_NAME2|DEST_TYPE2|DEST_COUNT2|
TOPIC|GROUP|TIMESTAMP|STATUS|SOURCE_NAME3|SOURCE_TYPE3|SOURCE_COUNT3|
|DEST_NAME3|DEST_TYPE3|DEST_COUNT3|
This is the new code I am working on (still using createRow defined above):
val rdd2 = splitRDD.filter(...).flatMap(row => {
val srcName = row(4).split(",")
val srcType = row(5).split(",")
val srcCount = row(6).split(",")
val destName = row(7).split(",")
val destType = row(8).split(",")
val destCount = row(9).split(",")
var newRDD: ArrayBuffer[Row] = new ArrayBuffer[Row]()
//if (srcName != null) {
println("\n\nsrcName.size: " + srcName.size + "\n\n")
for (i <- 0 to srcName.size - 1) {
// missing column: destType can sometimes be null
val splittedRow: Array[String] = Row.fromSeq(Seq((row(0), row(1), row(2), row(3),
srcName(i), srcType(i), srcCount(i), destName(i), "", destCount(i)))).toSeq.toArray[String]
newRDD = newRDD ++ Seq(createRow(splittedRow, fieldsMap))
}
//}
Seq(Row.fromSeq(Seq(newRDD)))
})
Since I am getting an error converting my splittedRow to an Array[String]
(the ".toSeq.toArray[String]" part):
error: type arguments [String] do not conform to method toArray's type parameter bounds [B >: Any]
I decided to update my splittedRow to:
val rowArr: Array[String] = new Array[String](10)
for (j <- 0 to 3) {
rowArr(j) = row(j)
}
rowArr(4) = srcName(i)
rowArr(5) = row(5).split(",")(i)
rowArr(6) = row(6).split(",")(i)
rowArr(7) = row(7).split(",")(i)
rowArr(8) = row(8).split(",")(i)
rowArr(9) = row(9).split(",")(i)
val splittedRow = rowArr
You could use a flatMap operation instead of a map operation to return multiple rows. Consequently, your createRow would be refactored to createRows(row: Array[String], fieldsMap: ListMap[Int, FieldConfig]): Seq[Row], as sketched below.
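Here is a rough sketch of that refactor, not your exact code: it assumes the comma-separated columns keep the positions from sample.log (4 = SOURCE_NAME, 5 = SOURCE_TYPE, 6 = SOURCE_COUNT, 7 = DEST_NAME, 8 = DEST_TYPE, 9 = DEST_COUNT) and reuses your existing createRow and FieldConfig.

import scala.collection.immutable.ListMap
import org.apache.spark.sql.Row

def createRows(row: Array[String], fieldsMap: ListMap[Int, FieldConfig]): Seq[Row] = {
  val srcNames = row(4).split(",")
  (0 until srcNames.length).map { i =>
    val perSource = row.clone()
    // Take the i-th element of every comma-separated column; null if missing
    // (covers the "destType can sometimes be null" case).
    Seq(4, 5, 6, 7, 8, 9).foreach { colIdx =>
      val parts = Option(row(colIdx)).map(_.split(",")).getOrElse(Array.empty[String])
      perSource(colIdx) = if (i < parts.length) parts(i) else null
    }
    createRow(perSource, fieldsMap)
  }
}

// usage: each input line fans out into one Row per source name
val rdd2 = splitRDD.filter(...).flatMap(row => createRows(row, fieldsMap))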
I am new to Scala. When I try to run the PageRank example program, it shows the following error:
Exception in thread "main" java.lang.NumberFormatException: For input string: "5"
    at scala.collection.immutable.StringLike$class.parseBoolean(StringLike.scala:240)
    at scala.collection.immutable.StringLike$class.toBoolean(StringLike.scala:228)
    at scala.collection.immutable.StringOps.toBoolean(StringOps.scala:31)
    at spark.bagel.examples.WikipediaPageRank$.main(WikipediaPageRank.scala:30)
    at spark.bagel.examples.WikipediaPageRank.main(WikipediaPageRank.scala)
import spark._
import spark.SparkContext._
import spark.bagel._
import spark.bagel.Bagel._
import scala.xml.{XML,NodeSeq}
object WikipediaPageRank {
def main(args: Array[String]) {
if (args.length < 5) {
System.err.println("Usage: WikipediaPageRank <inputFile> <threshold> <numPartitions> <host> <usePartitioner>")
System.exit(-1)
}
System.setProperty("spark.serializer", "spark.KryoSerializer")
System.setProperty("spark.kryo.registrator", classOf[PRKryoRegistrator].getName)
val inputFile = args(0)
val threshold = args(1).toDouble
val numPartitions = args(2).toInt
val host = args(3)
val usePartitioner = args(4).toBoolean
val sc = new SparkContext(host, "WikipediaPageRank")
// Parse the Wikipedia page data into a graph
val input = sc.textFile(inputFile)
println("Counting vertices...")
val numVertices = input.count()
println("Done counting vertices.")
println("Parsing input file...")
var vertices = input.map(line => {
val fields = line.split("\t")
val (title, body) = (fields(1), fields(3).replace("\\n", "\n"))
val links =
if (body == "\\N")
NodeSeq.Empty
else
try {
XML.loadString(body) \\ "link" \ "target"
} catch {
case e: org.xml.sax.SAXParseException =>
System.err.println("Article \""+title+"\" has malformed XML in body:\n"+body)
NodeSeq.Empty
}
val outEdges = links.map(link => new String(link.text)).toArray
val id = new String(title)
(id, new PRVertex(1.0 / numVertices, outEdges))
})
if (usePartitioner)
vertices = vertices.partitionBy(new HashPartitioner(sc.defaultParallelism)).cache
else
vertices = vertices.cache
println("Done parsing input file.")
// Do the computation
val epsilon = 0.01 / numVertices
val messages = sc.parallelize(Array[(String, PRMessage)]())
val utils = new PageRankUtils
val result =
Bagel.run(
sc, vertices, messages, combiner = new PRCombiner(),
numPartitions = numPartitions)(
utils.computeWithCombiner(numVertices, epsilon))
// Print the result
System.err.println("Articles with PageRank >= "+threshold+":")
val top =
(result
.filter { case (id, vertex) => vertex.value >= threshold }
.map { case (id, vertex) => "%s\t%s\n".format(id, vertex.value) }
.collect.mkString)
println(top)
}
}
Please help me in solving the error.
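Not part of the original thread, but for reference: the trace ends in StringOps.toBoolean, which corresponds to val usePartitioner = args(4).toBoolean, and in this Scala version parseBoolean accepts only "true"/"false", throwing NumberFormatException for anything else. The error is therefore consistent with "5" being passed as the fifth command-line argument (an assumption about how the job was submitted).

// Minimal illustration of the failing parse (assumes the fifth argument was "5").
"true".toBoolean // returns true
"5".toBoolean    // throws NumberFormatException: For input string: "5"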