Hi, I am trying to generate the output of the Salt Examples without using Docker, as described in their documentation. I found the Scala code that generates the output, which is Main.scala. I modified Main.scala to suit my setup:
package BinExTest

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.Row

import software.uncharted.salt.core.projection.numeric._
import software.uncharted.salt.core.generation.request._
import software.uncharted.salt.core.generation.Series
import software.uncharted.salt.core.generation.TileGenerator
import software.uncharted.salt.core.generation.output.SeriesData
import software.uncharted.salt.core.analytic.numeric._

import java.io._

import scala.util.parsing.json.JSONObject

object Main {

  // Defines the tile size in both x and y bin dimensions
  val tileSize = 256

  // Defines the output layer name
  val layerName = "pickups"

  // Encodes each Double bin value of a tile as a little-endian 64-bit long,
  // returning the resulting byte array
  def createByteBuffer(tile: SeriesData[(Int, Int, Int), (Int, Int), Double, (Double, Double)]): Array[Byte] = {
    val byteArray = new Array[Byte](tileSize * tileSize * 8)
    var j = 0
    tile.bins.foreach(b => {
      val data = java.lang.Double.doubleToLongBits(b)
      for (i <- 0 to 7) {
        byteArray(j) = ((data >> (i * 8)) & 0xff).asInstanceOf[Byte]
        j += 1
      }
    })
    byteArray
  }

  def main(args: Array[String]): Unit = {
    val jarFile = "/home/kesava/Studies/BinExTest/BinExTest.jar"
    val inputPath = "/home/kesava/Downloads/taxi_micro.csv"
    val outputPath = "/home/kesava/SoftWares/salt/salt-examples/bin-example/Output"

    val conf = new SparkConf().setAppName("salt-bin-example").setJars(Array(jarFile))
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    sqlContext.read.format("com.databricks.spark.csv")
      .option("header", "true")
      .option("inferSchema", "true")
      .load(s"file://$inputPath")
      .registerTempTable("taxi_micro")

    // Construct an RDD of Rows containing only the fields we need. Cache the result
    val input = sqlContext.sql("select pickup_lon, pickup_lat from taxi_micro")
      .rdd.cache()

    // Given an input row, return pickup longitude, latitude as a tuple
    val pickupExtractor = (r: Row) => {
      if (r.isNullAt(0) || r.isNullAt(1)) {
        None
      } else {
        Some((r.getDouble(0), r.getDouble(1)))
      }
    }

    // Tile Generator object, which houses the generation logic
    val gen = TileGenerator(sc)

    // Break levels into batches. Process several higher levels at once because the
    // number of tile outputs is quite low. Lower levels done individually due to high tile counts.
    val levelBatches = List(List(0, 1, 2, 3, 4, 5, 6, 7, 8), List(9, 10, 11), List(12), List(13), List(14))

    // Iterate over sets of levels to generate.
    val levelMeta = levelBatches.map(level => {
      println("------------------------------")
      println(s"Generating level $level")
      println("------------------------------")

      // Construct the definition of the tiling jobs: pickups
      val pickups = new Series((tileSize - 1, tileSize - 1),
        pickupExtractor,
        new MercatorProjection(level),
        (r: Row) => Some(1),
        CountAggregator,
        Some(MinMaxAggregator))

      // Create a request for all tiles on these levels, generate
      val request = new TileLevelRequest(level, (coord: (Int, Int, Int)) => coord._1)
      val rdd = gen.generate(input, pickups, request)

      // Translate RDD of Tiles to RDD of (coordinate, byte array), collect to master for serialization
      val output = rdd
        .map(s => pickups(s).get)
        .map(tile => {
          // Return tuples of tile coordinate, byte array
          (tile.coords, createByteBuffer(tile))
        })
        .collect()

      // Save byte files to local filesystem
      output.foreach(tile => {
        val coord = tile._1
        val byteArray = tile._2
        val limit = (1 << coord._1) - 1
        // Use standard TMS path structure and file naming
        val file = new File(s"$outputPath/$layerName/${coord._1}/${coord._2}/${limit - coord._3}.bins")
        file.getParentFile.mkdirs()
        val output = new FileOutputStream(file)
        output.write(byteArray)
        output.close()
      })

      // Create map from each level to min / max values.
      rdd
        .map(s => pickups(s).get)
        .map(t => (t.coords._1.toString, t.tileMeta.get))
        .reduceByKey((l, r) => {
          (Math.min(l._1, r._1), Math.max(l._2, r._2))
        })
        .mapValues(minMax => {
          JSONObject(Map(
            "min" -> minMax._1,
            "max" -> minMax._2
          ))
        })
        .collect()
        .toMap
    })

    // Flatten array of maps into a single map
    val levelInfoJSON = JSONObject(levelMeta.reduce(_ ++ _)).toString()

    // Save level metadata to filesystem
    val pw = new PrintWriter(s"$outputPath/$layerName/meta.json")
    pw.write(levelInfoJSON)
    pw.close()
  }
}
I created a separate folder for this Scala file, with a subfolder named lib containing the required jars, and compiled it with scalac as follows:
scalac -cp "lib/salt.jar:lib/spark.jar" Main.scala
This ran successfully and generated class files under a folder named BinExTest.
Now, the project's build.gradle had the following lines of code, from which I identified the command that generates the output dataset:
task run(overwrite: true, type: Exec, dependsOn: [assemble]) {
  executable = 'spark-submit'
  args = ["--class", "software.uncharted.salt.examples.bin.Main",
          "/opt/salt/build/libs/salt-bin-example-${version}.jar",
          "/opt/data/taxi_one_day.csv", "/opt/output"]
}
Based on this, I constructed the following command:
spark-submit --class BinExTest.Main lib/salt.jar
When I do this, I get the following error:
java.lang.ClassNotFoundException: Main.BinExTest
    at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
    at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
    at java.security.AccessController.doPrivileged(Native Method)
    at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
    at java.lang.Class.forName0(Native Method)
    at java.lang.Class.forName(Class.java:278)
    at org.apache.spark.util.Utils$.classForName(Utils.scala:174)
    at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:689)
    at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
    at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Can somebody help me out with this? I am completely new to this and came this far just by exploration.
[Update 1]
Following YoYo's suggestion, I ran:
spark-submit --class BinExTest.Main --jars "BinExTest.jar" "lib/salt.jar"
That made the ClassNotFoundException go away, but generated a new error, as follows:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 3.0 failed 1 times, most recent failure: Lost task 1.0 in stage 3.0 (TID 6, localhost): java.lang.NoSuchMethodError: scala.runtime.IntRef.create(I)Lscala/runtime/IntRef;
    at BinExTest.Main$.createByteBuffer(Main.scala:29)
    at BinExTest.Main$$anonfun$2$$anonfun$6.apply(Main.scala:101)
    at BinExTest.Main$$anonfun$2$$anonfun$6.apply(Main.scala:99)
    at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
    at scala.collection.Iterator$class.foreach(Iterator.scala:727)
    at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
    at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
    at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
    at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
    at scala.collection.AbstractIterator.to(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
    at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
    at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
    at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:927)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1$$anonfun$12.apply(RDD.scala:927)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
    at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1858)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
Any idea what's going on?
[Update 2]
Building Spark from source with Scala 2.11 support solved my previous issue. However, I got a new error:
16/05/10 18:39:15 ERROR TaskSetManager: Task 0 in stage 2.0 failed 1 times; aborting job
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 2.0 failed 1 times, most recent failure: Lost task 0.0 in stage 2.0 (TID 3, localhost): java.lang.NoClassDefFoundError: scala/collection/GenTraversableOnce$class
    at software.uncharted.salt.core.util.SparseArray.<init>(SparseArray.scala:37)
    at software.uncharted.salt.core.util.SparseArray.<init>(SparseArray.scala:57)
    at software.uncharted.salt.core.generation.rdd.RDDSeriesWrapper.makeBins(RDDTileGenerator.scala:224)
    at software.uncharted.salt.core.generation.rdd.RDDTileGeneratorCombiner.createCombiner(RDDTileGenerator.scala:128)
    at software.uncharted.salt.core.generation.rdd.RDDTileGenerator$$anonfun$3.apply(RDDTileGenerator.scala:100)
    at software.uncharted.salt.core.generation.rdd.RDDTileGenerator$$anonfun$3.apply(RDDTileGenerator.scala:100)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:187)
    at org.apache.spark.util.collection.ExternalSorter$$anonfun$5.apply(ExternalSorter.scala:186)
    at org.apache.spark.util.collection.AppendOnlyMap.changeValue(AppendOnlyMap.scala:148)
    at org.apache.spark.util.collection.SizeTrackingAppendOnlyMap.changeValue(SizeTrackingAppendOnlyMap.scala:32)
    at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:192)
    at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:64)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassNotFoundException: scala.collection.GenTraversableOnce$class
    at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
    at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
    at java.security.AccessController.doPrivileged(Native Method)
    at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
Is this because Scala 2.11 does not have the mentioned class?
[Final Update]
Adding the Scala 2.10 library to the spark-submit jars did the trick:
spark-submit --class "BinExTest.Main" --jars "BinExTest.jar,lib/scala210.jar" "lib/salt.jar"
For a Spark job to run, it needs to replicate its code across the different nodes that make up your Spark cluster. It does that by literally copying the jar file over to the other nodes.
That means you need to make sure your class files are packaged in a .jar file. In my typical solutions, I build an uber jar that packages the class files and the dependent jar files together in a single .jar file. For that I use the Maven Shade plugin. That doesn't have to be your solution, but you should at least build a .jar file out of your generated classes.
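As an illustration, here is a minimal sbt-assembly sketch that builds such an uber jar (sbt-assembly is an sbt alternative to the Maven Shade plugin; the version numbers and dependency coordinates below are assumptions, so adjust them to your setup):

// project/plugins.sbt
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")

// build.sbt
name := "BinExTest"
scalaVersion := "2.10.4"
// "provided" keeps Spark itself out of the uber jar, since spark-submit supplies it at runtime
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "1.6.0" % "provided",
  "org.apache.spark" %% "spark-sql" % "1.6.0" % "provided"
)

Running `sbt assembly` then produces a single jar under target/scala-2.10/ that you can hand to spark-submit.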
To manually provide additional jar files, you will need to add them using the --jars option, which expects a comma-delimited list.
Update 1
Actually, even for me there is a lot of confusion about all the available options, specifically those relating to jar files: how they are distributed, and how they modify the classpath in Spark. See another topic I just posted.
Update 2
The second part of your question is already answered on another thread.
Related
I'm running a simple Spark project on an EMR YARN cluster to:
read a textfile on S3 into an RDD[String]
define a schema and convert that RDD into a DF
I am doing a mapPartitions on the RDD to convert that RDD[String] into an RDD[Row].
My problem: I get a java.lang.NullPointerException and I can't figure out what the cause is.
The stacktrace lists these 2 line numbers in the source code:
the line of rdd1.mapPartitions
within the anonymous function, the line with the match case that matches the regular expression
Here's the stacktrace excerpt:
Caused by: java.lang.NullPointerException
at packageA.Herewego$$anonfun$3.apply(Herewego.scala:107)
at packageA.Herewego$$anonfun$3.apply(Herewego.scala:88)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:801)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:337)
at org.apache.spark.rdd.RDD$$anonfun$7.apply(RDD.scala:335)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1165)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1091)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1156)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:882)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:402)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:408)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
I've tried:
The error occurs when running in YARN cluster mode, and not in local mode (in my IDE). This made me think that something isn't defined on the executor. I moved the createrow function def into the anonymous function def, but it didn't work.
Here's the code block:
import java.util.Date
import java.sql.Timestamp
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.storage.StorageLevel.MEMORY_ONLY

// rx is a regular expression with three capture groups (ip, date, time), defined elsewhere

val rdd4: RDD[Row] = rdd1.mapPartitions((it: Iterator[String]) => {
  // Build a Row of (ip, timestamp) from the captured fields
  def createrow(a: List[String]): Row = {
    val format = new java.text.SimpleDateFormat("dd/MMM/yyyy HH:mm:ss Z")
    val re1: Row = Row.apply(a.head)
    val d: Date = format.parse(a.tail.mkString(" "))
    val t = new Timestamp(d.getTime)
    val re2: Row = Row.apply(t)
    Row.merge(re1, re2)
  }
  var output: List[Row] = List()
  while (it.hasNext) {
    val data: String = it.next()
    val res = data match {
      case rx(ipadd, date, time) => createrow(List(ipadd, date, time))
      case _ => createrow(List("0.0.0.0", "00/Jan/0000", "00:00:00 0"))
    }
    output = output :+ res
  }
  output.toIterator
}).persist(MEMORY_ONLY)
// Collect and Persist the RDD in Memory
val tmp = rdd4.collect()
Do I need to broadcast any variables or functions used within the mapPartitions?
Any pointers in the right direction will be much appreciated.
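(As a reference for the broadcast question: this is a minimal, hypothetical sketch of broadcasting a value and reading it inside mapPartitions; the names are illustrative, not from the code above.)

// assuming an existing SparkContext named sc and an RDD[String] named rddOfLines
val defaultIp = sc.broadcast("0.0.0.0")
val tagged = rddOfLines.mapPartitions { it =>
  // the broadcast value is read on the executor via .value
  it.map(s => s"${defaultIp.value} $s")
}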
I have a Hive table partitioned by timestamp on top of Parquet files with Snappy compression.
Basically, the paths look like:
s3:/bucketname/project/flowtime=0/
s3:/bucketname/project/flowtime=1/
s3:/bucketname/project/flowtime=2/
...
I have detected some inconsistency in this table. The problem is that one field is LongType in some Parquet schemas and StringType in others, so running queries throws a ClassCastException.
So what I am trying to do now is to read all my Parquet files and check their schemas so that I can recreate them. I want to map each filename to the schema of the associated Parquet file, so that I have:
filename                        | schema
s3:/bucketname/project/flowtime | StructType(StructField(Id,StringType,true),
                                |            StructField(Date,StringType,true))
So I tried to use Spark with Scala and the input_file_name function from org.apache.spark.sql.functions, which I wrap in a UDF. That part works fine:
import org.apache.spark.sql.functions.{input_file_name, udf}

val filename = (path: String) => path
val filenameUDF = udf(filename)

val df = sqlContext.parquetFile("s3a://bucketname/").select(filenameUDF(input_file_name())).toDF()

df.map(lines => (lines.toString, sqlContext.read.parquet(lines.toString.replace("[", "").replace("]", "")).schema.toString))
This is meant to give an RDD[(String, String)].
Only it seems that the part that reads the Parquet within my map throws a NullPointerException:
ERROR scheduler.TaskSetManager: Task 0 in stage 14.0 failed 4 times; aborting job
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 14.0 failed 4 times, most recent failure: Lost task 0.3 in stage 14.0 (TID 35, CONFIDENTIAL-SERVER-NAME, executor 13): java.lang.NullPointerException
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:32)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:32)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:312)
at scala.collection.Iterator$class.foreach(Iterator.scala:727)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1157)
at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:48)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:103)
at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:47)
at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:273)
at scala.collection.AbstractIterator.to(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:265)
at scala.collection.AbstractIterator.toBuffer(Iterator.scala:1157)
at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:252)
at scala.collection.AbstractIterator.toArray(Iterator.scala:1157)
at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$28.apply(RDD.scala:1328)
at org.apache.spark.rdd.RDD$$anonfun$take$1$$anonfun$28.apply(RDD.scala:1328)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1888)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:242)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
If you have any idea why reading the Parquet does not seem to work inside the map, please let me know why; both parts of the pair I want to create (the filename AND the schema) seem to work fine on their own, but joining them does not.
Also, if you have better ideas for resolving the inconsistency among my Parquet files that corrupts my Hive table, I am open to them, because I don't see another choice than to work it this way: Parquet files are immutable, and changing the Hive metadata does not change the embedded Parquet metadata in each file.
Thank you for your attention.
Renaud
Let me suggest another approach: get the folder list and loop over your bucket listing.
First, you can read and store the S3 folder paths using listStatus,
then loop over each path.
import java.net.URI
import org.apache.hadoop.fs._
import org.apache.hadoop.conf._
import java.io._

val file = new File("/home/.../fileName.txt")
val path = "s3:/bucketname/project/"
val fileSystem = FileSystem.get(URI.create(path), new Configuration())
val folders = fileSystem.listStatus(new Path(path))

val bw = new BufferedWriter(new FileWriter(file))
for (folder <- folders) {
  // split("/")(6) extracts the partition directory name (e.g. flowtime=0) from the full path
  bw.write(folder.getPath.toString().split("/")(6) + " => " +
    spark.read.parquet(folder.getPath.toString()).select("myColum").schema.toString() + "\n")
}
bw.close
Hope it will help you.
Regards,
Steven
I have two columns in a Spark SQL DataFrame with each entry in either column as an array of strings.
val ngramDataFrame = Seq(
  (Seq("curious", "bought", "20"), Seq("iwa", "was", "asj"))
).toDF("filtered_words", "ngrams_array")
I want to merge the arrays in each row to make a single array in a new column. My code is as follows:
def concat_array(firstarray: Array[String],
                 secondarray: Array[String]): Array[String] = {
  (firstarray ++ secondarray).toArray
}

val concatUDF = udf(concat_array _)
val concatFrame = ngramDataFrame.withColumn("full_array", concatUDF($"filtered_words", $"ngrams_array"))
I can successfully use the concat_array function on two arrays. However, when I run the above code, I get the following exception:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 16.0 failed 1 times, most recent failure: Lost task 0.0 in stage 16.0 (TID 12, localhost): org.apache.spark.SparkException: Failed to execute user defined function(anonfun$1: (array<string>, array<string>) => array<string>)
    at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown Source)
    at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$8$$anon$1.hasNext(WholeStageCodegenExec.scala:370)
    at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:389)
    at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47)
    at org.apache.spark.scheduler.Task.run(Task.scala:86)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassCastException: scala.collection.mutable.WrappedArray$ofRef cannot be cast to [Ljava.lang.String;
    at $line80.$read$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$iw$$anonfun$1.apply(<console>:76)
    ... 13 more
Driver stacktrace:
In Spark 2.4 or later you can use concat (if you want to keep duplicates):
ngramDataFrame.withColumn(
"full_array", concat($"filtered_words", $"ngrams_array")
).show
+--------------------+---------------+--------------------+
| filtered_words| ngrams_array| full_array|
+--------------------+---------------+--------------------+
|[curious, bought,...|[iwa, was, asj]|[curious, bought,...|
+--------------------+---------------+--------------------+
or array_union (if you want to drop duplicates):
ngramDataFrame.withColumn(
"full_array",
array_union($"filtered_words", $"ngrams_array")
)
These can also be composed from other higher-order functions, for example
ngramDataFrame.withColumn(
"full_array",
flatten(array($"filtered_words", $"ngrams_array"))
)
with duplicates, and
ngramDataFrame.withColumn(
"full_array",
array_distinct(flatten(array($"filtered_words", $"ngrams_array")))
)
without.
On a side note, you shouldn't use WrappedArray when working with ArrayType columns. Instead, you should expect the guaranteed interface, which is Seq. So the udf should use a function with the following signature:
(Seq[String], Seq[String]) => Seq[String]
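For example, a minimal sketch of the udf rewritten against Seq, reusing the column names from the question:

import org.apache.spark.sql.functions.udf

// Seq is the guaranteed interface for ArrayType columns, so this works
// regardless of the concrete runtime collection type
val concatSeqUDF = udf((first: Seq[String], second: Seq[String]) => first ++ second)

ngramDataFrame.withColumn("full_array", concatSeqUDF($"filtered_words", $"ngrams_array"))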
Please refer to SQL Programming Guide for details.
Arjun, there is an error in the udf you created. When you pass array-type columns, the data type is not Array[String] but WrappedArray[String]. Below I am pasting the modified udf along with the output.
import scala.collection.mutable
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}

// sparkConf is assumed to be a SparkConf defined earlier
val SparkCtxt = new SparkContext(sparkConf)
val sqlContext = new SQLContext(SparkCtxt)
import sqlContext.implicits._

val temp = SparkCtxt.parallelize(Seq(Row(Array("String1", "String2"), Array("String3", "String4"))))
val df = sqlContext.createDataFrame(temp,
  StructType(List(
    StructField("Col1", ArrayType(StringType), true),
    StructField("Col2", ArrayType(StringType), true)
  )))

def concat_array(firstarray: mutable.WrappedArray[String],
                 secondarray: mutable.WrappedArray[String]): mutable.WrappedArray[String] = {
  firstarray ++ secondarray
}

val concatUDF = udf(concat_array _)
val df2 = df.withColumn("udftest", concatUDF(df.col("Col1"), df.col("Col2")))

df2.select("udftest").foreach(each => {
  println("***********")
  println(each(0))
})
df2.show(true)
OUTPUT:
+------------------+------------------+--------------------+
| Col1| Col2| udftest|
+------------------+------------------+--------------------+
|[String1, String2]|[String3, String4]|[String1, String2...|
+------------------+------------------+--------------------+
WrappedArray(String1, String2, String3, String4)
Here is a sample file:
Department,Designation,costToCompany,State
Sales,Trainee,12000,UP
Sales,Lead,32000,AP
Sales,Lead,32000,LA
Sales,Lead,32000,TN
Sales,Lead,32000,AP
Sales,Lead,32000,TN
Sales,Lead,32000,LA
Sales,Lead,32000,LA
Marketing,Associate,18000,TN
Marketing,Associate,18000,TN
HR,Manager,58000,TN
Produce output as CSV:
Group by department, designation, and state
With additional columns for sum(costToCompany) and the total employee count
The result should look like:
Dept,Desg,state,empCount,totalCost
Sales,Lead,AP,2,64000
Sales,Lead,LA,3,96000
Sales,Lead,TN,2,64000
Following is my solution, and writing to file is resulting in an error. What am I doing wrong here?
Step #1: Load file
val file = sc.textFile("data/sales.txt")
Step #2: Create a case class to represent the data
scala> case class emp(Dept:String, Desg:String, totalCost:Double, State:String)
defined class emp
Step #3: Split data and create RDD of emp object
scala> val fileSplit = file.map(_.split(","))
scala> val data = fileSplit.map(x => emp(x(0), x(1), x(2).toDouble, x(3)))
Step #4: Turn the data into key/value pairs with key = (dept, desg, state) and value = (1, totalCost)
scala> val keyVals = data.map(x => ((x.Dept,x.Desg,x.State),(1,x.totalCost)))
Step #5: Aggregate using reduceByKey, as we want a summation of the total number of employees as well as the cost
scala> val results = keyVals.reduceByKey{(a,b) => (a._1+b._1, a._2+b._2)} //(a.count+ b.count, a.cost+b.cost)
results: org.apache.spark.rdd.RDD[((String, String, String), (Int, Double))] = ShuffledRDD[41] at reduceByKey at <console>:55
Step #6: save the results
scala> results.repartition(1).saveAsTextFile("data/result")
Error
17/08/16 22:16:59 ERROR executor.Executor: Exception in task 0.0 in stage 20.0 (TID 23)
java.lang.NumberFormatException: For input string: "costToCompany"
at sun.misc.FloatingDecimal.readJavaFormatString(FloatingDecimal.java:1250)
at java.lang.Double.parseDouble(Double.java:540)
at scala.collection.immutable.StringLike$class.toDouble(StringLike.scala:232)
at scala.collection.immutable.StringOps.toDouble(StringOps.scala:31)
at $line85.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:51)
at $line85.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:51)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:194)
at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:64)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:242)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
17/08/16 22:16:59 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 20.0 (TID 23, localhost, executor driver): java.lang.NumberFormatException: For input string: "costToCompany"
Update 1
I forgot to remove the header. Updated code is below. Save is throwing a different error now; I also need to put the header back into the output file.
scala> val file = sc.textFile("data/sales.txt")
scala> val fileSplit = file.map(_.split(","))
scala> val header = fileSplit.first()
scala> val noHeaderData = fileSplit.filter(_(0) != header(0))
scala> case class emp(Dept:String, Desg:String, totalCost:Double, State:String)
scala> val data = noHeaderData.map(x => emp(x(0), x(1), x(2).toDouble, x(3)))
scala> val keyVals = data.map(x => ((x.Dept,x.Desg,x.State),(1,x.totalCost)))
scala> val results = keyVals.reduceByKey{(a,b) => (a._1+b._1, a._2+b._2)}
scala> val resultSpecific = results.map(x => (x._1._1, x._1._2, x._1._3, x._2._1, x._2._2))
scala> resultSpecific.repartition(1).saveASTextFile("data/specific")
<console>:64: error: value saveASTextFile is not a member of org.apache.spark.rdd.RDD[(String, String, String, Int, Double)]
resultSpecific.repartition(1).saveASTextFile("data/specific")
To answer your question as well as the comments:
It would be easier to use DataFrames in this case. Since your file is in CSV format, you can load and save the data the following way; you then do not need to concern yourself with splitting the rows in the file or with taking care of the header (both when loading and saving).
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

val spark = SparkSession.builder.getOrCreate()
import spark.implicits._

val df = spark.read
  .format("com.databricks.spark.csv")
  .option("header", "true") // reading the headers
  .load("csv/file/path")
The DataFrame column names will then be the same as the headers in the file. Instead of reduceByKey(), you can use the DataFrame's groupBy() and agg():
val res = df.groupBy($"Department", $"Designation", $"State")
.agg(count($"costToCompany").alias("empCount"), sum($"costToCompany").alias("totalCost"))
Then save it:
res.coalesce(1)
.write.format("com.databricks.spark.csv")
.option("header", "true")
.save("results.csv")
When you try to cast costToCompany to Double, the header string "costToCompany" won't cast; that's why the job fails when the action fires. Just drop the first record (the header) from the file and it will work. You could also do this kind of operation with a DataFrame, which would be easier.
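For instance, a minimal sketch of dropping the header record before parsing (assuming `file` is the RDD created with sc.textFile above):

// Grab the first line (the header) and filter it out before casting fields
val header = file.first()
val dataOnly = file.filter(line => line != header)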
The error is straightforward, and it says:

<console>:64: error: value saveASTextFile is not a member of org.apache.spark.rdd.RDD[(String, String, String, Int, Double)]
       resultSpecific.repartition(1).saveASTextFile("data/specific")
In fact, there is no method called saveASTextFile(...); it is saveAsTextFile(...). You have a case error in your method name.
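With the casing fixed, the call is simply:

resultSpecific.repartition(1).saveAsTextFile("data/specific")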
I have a problem writing data to Mongo after reading and mapping the data.
This is the script I use to run the program.
I am using Spark 1.4.0, Scala 2.11.7, and Mongo 2.6.10:
#!/usr/bin/env bash
SPARK_PATH="/Users/username/spark-1.4.0-bin-hadoop2.6/bin/spark-submit"
CLASS_NAME="com.knx.conversion.ScalaWordCount"
CLUSTER='local[2]'
JARS="/Users/username/spark-1.4.0-bin-hadoop2.6/lib/mongo-hadoop-core-1.4.0.jar,/Users/username/spark-1.4.0-bin-hadoop2.6/lib/mongo-java-driver-3.0.3.jar"
JAR="/Users/username/AggragateConversionFunnel/target/scala-2.11/aggragateconversionfunnel_2.11-1.0.jar"
PROJECT_PATH="/Users/username/AggragateConversionFunnel"
cd ${PROJECT_PATH} && sbt package
${SPARK_PATH} --class ${CLASS_NAME} --master ${CLUSTER} --jars ${JARS} $JAR
And here is the main program. I just copied it from [here][1] and changed the input and output collections:
package com.knx.conversion

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.hadoop.conf.Configuration
import org.bson.BSONObject
import org.bson.BasicBSONObject

object ScalaWordCount {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "Scala Word Count")

    val config = new Configuration()
    config.set("mongo.input.uri", "mongodb://127.0.0.1:27017/first-week.interactions")
    config.set("mongo.output.uri", "mongodb://127.0.0.1:27017/visit_06_2015.output")

    val mongoRDD = sc.newAPIHadoopRDD(config, classOf[com.mongodb.hadoop.MongoInputFormat], classOf[Object], classOf[BSONObject])

    // Input contains tuples of (ObjectId, BSONObject)
    // Output contains tuples of (null, BSONObject) - ObjectId will be generated by Mongo driver if null
    val countsRDD = mongoRDD.flatMap(arg => {
      val str = arg._2.get("referer").toString
      str.split("h")
    })
      .map(word => (word, 1))
      .reduceByKey((a, b) => a + b)

    countsRDD.foreach(println)

    val saveRDD = countsRDD.map((tuple) => {
      val bson = new BasicBSONObject()
      bson.put("word", tuple._1)
      bson.put("count", tuple._2.toString)
      (null, bson)
    })

    // Only MongoOutputFormat and config are relevant
    saveRDD.saveAsNewAPIHadoopFile("file:///bogus", classOf[Any], classOf[Any], classOf[com.mongodb.hadoop.MongoOutputFormat[Any, Any]], config)
  }
}
When I run it, I get this error:
15/07/24 15:53:03 INFO DAGScheduler: Job 0 finished: foreach at ScalaWordCount.scala:39, took 1.111442 s
Exception in thread "main" java.lang.NoSuchMethodError: scala.Predef$.$conforms()Lscala/Predef$$less$colon$less;
at com.knx.conversion.ScalaWordCount$.main(ScalaWordCount.scala:48)
at com.knx.conversion.ScalaWordCount.main(ScalaWordCount.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:664)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:169)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:192)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:111)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
15/07/24 15:53:03 INFO SparkContext: Invoking stop() from shutdown hook
I just don't know why or how this happened.
[1]: https://github.com/plaa/mongo-spark/blob/master/src/main/scala/ScalaWordCount.scala
This issue is caused by the Scala version I am currently using not matching the Scala version Spark was built with.
I am using Scala 2.11.7 to compile and package the jar, but Spark 1.4.1 is built against Scala 2.10.4.
I found the answer here.
The issue was then solved by switching the Scala version to 2.10.4.
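For anyone hitting the same mismatch, a minimal build.sbt sketch that pins the Scala version to the one the Spark distribution was built with (the Spark coordinates here are assumptions, so adjust them to your installation):

// Must match the Scala version of your Spark build
scalaVersion := "2.10.4"

// "provided" because spark-submit supplies Spark at runtime
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.0" % "provided"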