How to add a current_timestamp() column to a streaming DataFrame? - Scala
I'm using Spark 2.4.3 and Scala.
I'm fetching messages from a streaming Kafka source; each message has the following structure:
{"message": "Jan 7 17:53:48 PA-850.abc.com 1,2020/01/07 17:53:41,001801012404,TRAFFIC,drop,2304,2020/01/07 17:53:41,10.7.26.51,10.8.3.11,0.0.0.0,0.0.0.0,interzone-default,,,not-applicable,vsys1,SERVER-VLAN,VPN,ethernet1/6.18,,test-1,2020/01/07 17:53:41,0,1,45194,514,0,0,0x0,udp,deny,588,588,0,1,2020/01/07 17:53:45,0,any,0,35067255521,0x8000000000000000,10.0.0.0-10.255.255.255,10.0.0.0-10.255.255.255,0,1,0,policy-deny,0,0,0,0,,PA-850,from-policy,,,0,,0,,N/A,0,0,0,0,b804eab2-f240-467a-be97-6f8c382afd4c,0","source_host": "10.7.26.51"}
My goal is to add a new column holding the current timestamp to every row of the streaming data, and then insert all of these rows into a Cassandra table.
package devices

import configurations._
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{col, from_json, lower, split}
import org.apache.spark.sql.cassandra._
import scala.collection.mutable.{ListBuffer, Map}
import scala.io.Source
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, TimestampType}
import org.apache.spark.sql.functions.to_timestamp
import org.apache.spark.sql.functions.unix_timestamp

object PA {
  def main(args: Array[String]): Unit = {
    val spark = SparkBuilder.spark

    // Streaming source: JSON-wrapped syslog messages read from Kafka.
    val df = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", configHelper.kafka_host)
      .option("subscribe", configHelper.log_topic)
      .option("startingOffsets", "earliest")
      .option("multiLine", true)
      .option("includeTimestamp", true)
      .load()
    df.printSchema()

    // Reads the os_mapping_file CSV and maps each (lowercased) device name to its (lowercased) OS version.
    def getDeviceNameOSMapping(): Map[String, String] = {
      val osmapping = scala.collection.mutable.Map[String, String]()
      val bufferedSource = Source.fromFile(configHelper.os_mapping_file)
      for (line <- bufferedSource.getLines) {
        val cols = line.split(",").map(_.trim)
        osmapping += (cols(0).toLowerCase() -> cols(1).toLowerCase())
      }
      bufferedSource.close
      return osmapping
    }

    val deviceOSMapping = spark.sparkContext.broadcast(getDeviceNameOSMapping())
    val debug = true

    // Extract the JSON payload from the Kafka value and flatten it into its fields.
    val msg = df.selectExpr("CAST(value AS STRING)")
      .withColumn("value", lower(col("value")))
      .select(from_json(col("value"), cefFormat.cef_json).as("data"))
      .select("data.*")

    import spark.sqlContext.implicits._

    val newDF = msg.withColumn("created", lit(current_timestamp()))

    msg.writeStream
      .foreachBatch { (batchDF, _) =>
        // Drop fluentd housekeeping lines before parsing.
        val syslogDF = batchDF.filter(!$"message".contains("invalid syslog message:"))
          .filter(!$"message".contains("fluentd worker is now stopping worker=0"))
          .filter(!$"message".contains("fluentd worker is now running worker=0"))
        val syslogRDD = syslogDF.rdd.map(r => {
          r.getString(0)
        }).map(x => {
          parseSysLog(x)
        })
          .filter(x => deviceOSMapping.value.contains(x._1))

        try {
          val threat_9_0_DF = spark.sqlContext.createDataFrame(
            syslogRDD.filter(x => deviceOSMapping.value(x._1).equals("9.0") & x._2.equals("threat"))
              .map(x => Row.fromSeq(x._3)),
            formatPA.threat_9_0)
          if (debug)
            threat_9_0_DF.show(true)
          threat_9_0_DF.write
            .cassandraFormat(configHelper.cassandra_table_syslog, configHelper.cassandra_keyspace)
            .mode("append")
            .save
          println("threat_9_0_DF saved")
        }
        catch {
          case e: Exception => {
            println(e.getMessage)
          }
        }

        try {
          val traffic_9_0_DF = spark.sqlContext.createDataFrame(
            syslogRDD.filter(x => deviceOSMapping.value(x._1).equals("9.0") & x._2.equals("traffic"))
              .map(x => Row.fromSeq(x._3)),
            formatPA.traffic_9_0)
          if (debug)
            traffic_9_0_DF.show(true)
          traffic_9_0_DF.write
            .cassandraFormat(configHelper.cassandra_table_syslog, configHelper.cassandra_keyspace)
            .mode("append")
            .save
          println("traffic_9_0_DF saved")
        }
        catch {
          case e: Exception => {
            println(e.getMessage)
          }
        }
      }.start().awaitTermination()

    // Splits a raw syslog line into (device_name, traffic_type, all field values).
    // For the lowercased sample message this returns ("pa-850", "traffic",
    // List("jan 7 17:53:48", "pa-850.abc.com", "1", <remaining comma-separated fields>)).
    def parseSysLog(msg: String): (String, String, List[String]) = {
      //println("PRINTING MESSAGES")
      //println(msg)
      val splitmsg = msg.split(",")
      val traffic_type = splitmsg(3)
      val temp = splitmsg(0).split(" ")
      val date_time = temp.dropRight(2).mkString(" ")
      val domain_name = temp(temp.size - 2)
      val future_use1 = temp(temp.size - 1)
      val device_name = domain_name.split("\\.")(0)
      var result = new ListBuffer[String]()
      //result+=temp2
      result += date_time
      result += domain_name
      result += future_use1
      result = result ++ splitmsg.slice(1, splitmsg.size).toList
      (device_name, traffic_type, result.toList)
    }
  }
}
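The cefFormat object referenced above is not included in the question, so its exact definition is an assumption; based on the JSON wrapper shown at the top and the later use of data.* and $"message", it is presumably something close to this sketch. (configHelper, which holds the Kafka and Cassandra settings, is likewise not shown.)

package configurations

import org.apache.spark.sql.types.{StringType, StructType}

// Hypothetical stand-in for the schema passed to from_json above; only the two
// fields visible in the sample JSON are assumed here.
object cefFormat {
  val cef_json = new StructType()
    .add("message", StringType)
    .add("source_host", StringType)
}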
package configurations

import org.apache.spark.sql.types.{StringType, StructType, TimestampType, DateType}

object formatPA {

  val threat_9_0 = new StructType()
    .add("date_time", StringType)
    .add("log_source", StringType)
    .add("future_use1", StringType)
    .add("received_time", StringType)
    .add("serial_number", StringType)
    .add("traffic_type", StringType)
    .add("threat_content_type", StringType)
    .add("future_use2", StringType)
    .add("generated_time", StringType)
    .add("src_ip", StringType)
    .add("dst_ip", StringType)
    .add("src_nat", StringType)
    .add("dst_nat", StringType)
    .add("rule_name", StringType)
    .add("src_user", StringType)
    .add("dst_user", StringType)
    .add("app", StringType)
    .add("vsys", StringType)
    .add("src_zone", StringType)
    .add("dst_zone", StringType)
    .add("igr_int", StringType)
    .add("egr_int", StringType)
    .add("log_fw_profile", StringType)
    .add("future_use3", StringType)
    .add("session_id", StringType)
    .add("repeat_count", StringType)
    .add("src_port", StringType)
    .add("dst_port", StringType)
    .add("src_nat_port", StringType)
    .add("dst_nat_port", StringType)
    .add("flags", StringType)
    .add("protocol", StringType)
    .add("action", StringType)
    .add("miscellaneous", StringType)
    .add("threat_id", StringType)
    .add("category", StringType)
    .add("severity", StringType)
    .add("direction", StringType)
    .add("seq_num", StringType)
    .add("act_flag", StringType)
    .add("src_geo_location", StringType)
    .add("dst_geo_location", StringType)
    .add("future_use4", StringType)
    .add("content_type", StringType)
    .add("pcap_id", StringType)
    .add("file_digest", StringType)
    .add("apt_cloud", StringType)
    .add("url_index", StringType)
    .add("user_agent", StringType)
    .add("file_type", StringType)
    .add("x_forwarded_for", StringType)
    .add("referer", StringType)
    .add("sender", StringType)
    .add("subject", StringType)
    .add("recipient", StringType)
    .add("report_id", StringType)
    .add("dghl1", StringType)
    .add("dghl2", StringType)
    .add("dghl3", StringType)
    .add("dghl4", StringType)
    .add("vsys_name", StringType)
    .add("device_name", StringType)
    .add("future_use5", StringType)
    .add("src_vm_uuid", StringType)
    .add("dst_vm_uuid", StringType)
    .add("http_method", StringType)
    .add("tunnel_id_imsi", StringType)
    .add("monitor_tag_imei", StringType)
    .add("parent_session_id", StringType)
    .add("parent_start_time", StringType)
    .add("tunnel_type", StringType)
    .add("threat_category", StringType)
    .add("content_version", StringType)
    .add("future_use6", StringType)
    .add("sctp_association_id", StringType)
    .add("payload_protocol_id", StringType)
    .add("http_headers", StringType)
    .add("url_category_list", StringType)
    .add("uuid_for_rule", StringType)
    .add("http_2_connection", StringType)
    .add("created", TimestampType)

  val traffic_9_0 = new StructType()
    .add("date_time", StringType)
    .add("log_source", StringType)
    .add("future_use1", StringType)
    .add("received_time", StringType)
    .add("serial_number", StringType)
    .add("traffic_type", StringType)
    .add("threat_content_type", StringType)
    .add("future_use2", StringType)
    .add("generated_time", StringType)
    .add("src_ip", StringType)
    .add("dst_ip", StringType)
    .add("src_nat", StringType)
    .add("dst_nat", StringType)
    .add("rule_name", StringType)
    .add("src_user", StringType)
    .add("dst_user", StringType)
    .add("app", StringType)
    .add("vsys", StringType)
    .add("src_zone", StringType)
    .add("dst_zone", StringType)
    .add("igr_int", StringType)
    .add("egr_int", StringType)
    .add("log_fw_profile", StringType)
    .add("future_use3", StringType)
    .add("session_id", StringType)
    .add("repeat_count", StringType)
    .add("src_port", StringType)
    .add("dst_port", StringType)
    .add("src_nat_port", StringType)
    .add("dst_nat_port", StringType)
    .add("flags", StringType)
    .add("protocol", StringType)
    .add("action", StringType)
    .add("bytes", StringType)
    .add("bytes_sent", StringType)
    .add("bytes_received", StringType)
    .add("packets", StringType)
    .add("start_time", StringType)
    .add("end_time", StringType)
    .add("category", StringType)
    .add("future_use4", StringType)
    .add("seq_num", StringType)
    .add("act_flag", StringType)
    .add("src_geo_location", StringType)
    .add("dst_geo_location", StringType)
    .add("future_use5", StringType)
    .add("packet_sent", StringType)
    .add("packet_received", StringType)
    .add("session_end_reason", StringType)
    .add("dghl1", StringType)
    .add("dghl2", StringType)
    .add("dghl3", StringType)
    .add("dghl4", StringType)
    .add("vsys_name", StringType)
    .add("device_name", StringType)
    .add("action_source", StringType)
    .add("src_vm_uuid", StringType)
    .add("dst_vm_uuid", StringType)
    .add("tunnel_id_imsi", StringType)
    .add("monitor_tag_imei", StringType)
    .add("parent_session_id", StringType)
    .add("parent_start_time", StringType)
    .add("tunnel_type", StringType)
    .add("sctp_association_id", StringType)
    .add("sctp_chunks", StringType)
    .add("sctp_chunks_sent", StringType)
    .add("sctp_chunks_received", StringType)
    .add("uuid_for_rule", StringType)
    .add("http_2_connection", StringType)
    .add("created", TimestampType)
}
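One thing worth noting about these two schemas: both end with a created TimestampType column, while the List built in parseSysLog only ever contains the raw syslog fields. The declared widths can be checked with a throwaway sketch like the following (spark-shell, not part of the job):

// The encoder error shown below complains about field index 69, which is exactly
// where "created" sits in traffic_9_0 (70 columns, 0-based index 69).
println(formatPA.threat_9_0.fields.length)   // 81
println(formatPA.traffic_9_0.fields.length)  // 70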
The output for the above code is as follows:
+---------+----------+-----------+-------------+-------------+------------+-------------------+-----------+--------------+------+------+-------+-------+---------+--------+--------+---+----+--------+--------+-------+-------+--------------+-----------+----------+------------+--------+--------+------------+------------+-----+--------+------+-------------+---------+--------+--------+---------+-------+--------+----------------+----------------+-----------+------------+-------+-----------+---------+---------+----------+---------+---------------+-------+------+-------+---------+---------+-----+-----+-----+-----+---------+-----------+-----------+-----------+-----------+-----------+--------------+----------------+-----------------+-----------------+-----------+---------------+---------------+-----------+-------------------+-------------------+------------+-----------------+-------------+-----------------+-------+
|date_time|log_source|future_use1|received_time|serial_number|traffic_type|threat_content_type|future_use2|generated_time|src_ip|dst_ip|src_nat|dst_nat|rule_name|src_user|dst_user|app|vsys|src_zone|dst_zone|igr_int|egr_int|log_fw_profile|future_use3|session_id|repeat_count|src_port|dst_port|src_nat_port|dst_nat_port|flags|protocol|action|miscellaneous|threat_id|category|severity|direction|seq_num|act_flag|src_geo_location|dst_geo_location|future_use4|content_type|pcap_id|file_digest|apt_cloud|url_index|user_agent|file_type|x_forwarded_for|referer|sender|subject|recipient|report_id|dghl1|dghl2|dghl3|dghl4|vsys_name|device_name|future_use5|src_vm_uuid|dst_vm_uuid|http_method|tunnel_id_imsi|monitor_tag_imei|parent_session_id|parent_start_time|tunnel_type|threat_category|content_version|future_use6|sctp_association_id|payload_protocol_id|http_headers|url_category_list|uuid_for_rule|http_2_connection|created|
+---------+----------+-----------+-------------+-------------+------------+-------------------+-----------+--------------+------+------+-------+-------+---------+--------+--------+---+----+--------+--------+-------+-------+--------------+-----------+----------+------------+--------+--------+------------+------------+-----+--------+------+-------------+---------+--------+--------+---------+-------+--------+----------------+----------------+-----------+------------+-------+-----------+---------+---------+----------+---------+---------------+-------+------+-------+---------+---------+-----+-----+-----+-----+---------+-----------+-----------+-----------+-----------+-----------+--------------+----------------+-----------------+-----------------+-----------+---------------+---------------+-----------+-------------------+-------------------+------------+-----------------+-------------+-----------------+-------+
+---------+----------+-----------+-------------+-------------+------------+-------------------+-----------+--------------+------+------+-------+-------+---------+--------+--------+---+----+--------+--------+-------+-------+--------------+-----------+----------+------------+--------+--------+------------+------------+-----+--------+------+-------------+---------+--------+--------+---------+-------+--------+----------------+----------------+-----------+------------+-------+-----------+---------+---------+----------+---------+---------------+-------+------+-------+---------+---------+-----+-----+-----+-----+---------+-----------+-----------+-----------+-----------+-----------+--------------+----------------+-----------------+-----------------+-----------+---------------+---------------+-----------+-------------------+-------------------+------------+-----------------+-------------+-----------------+-------+
threat_9_0_DF saved
20/01/08 14:59:49 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2)
java.lang.RuntimeException: Error while encoding: java.lang.ArrayIndexOutOfBoundsException: 69
if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.sql.catalyst.util.DateTimeUtils$, TimestampType, fromJavaTimestamp, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 69, created), TimestampType), true, false) AS created#773
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:292)
at org.apache.spark.sql.SparkSession$$anonfun$4.apply(SparkSession.scala:593)
at org.apache.spark.sql.SparkSession$$anonfun$4.apply(SparkSession.scala:593)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:410)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$13$$anon$1.hasNext(WholeStageCodegenExec.scala:636)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:255)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$24.apply(RDD.scala:836)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 69
at org.apache.spark.sql.catalyst.expressions.GenericRow.get(rows.scala:174)
at org.apache.spark.sql.Row$class.isNullAt(Row.scala:191)
at org.apache.spark.sql.catalyst.expressions.GenericRow.isNullAt(rows.scala:166)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.writeFields_0_34$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:289)
... 25 more
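The generated-code fragment in the error is the key clue: the encoder fails while reading field 69, created, from a GenericRow, which means the Row produced by Row.fromSeq(x._3) carries fewer values than traffic_9_0 declares. A minimal, self-contained reproduction of the same failure follows; all names here are made up for illustration, and spark is the usual spark-shell session.

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}

val twoColSchema = new StructType()
  .add("message", StringType)
  .add("created", TimestampType)

// One value for a two-column schema: the encoder throws
// java.lang.ArrayIndexOutOfBoundsException: 1 when it tries to read "created".
val shortRows = spark.sparkContext.parallelize(Seq(Row("only one value")))
spark.createDataFrame(shortRows, twoColSchema).show()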
It looks like it does not matter that the messages are in JSON format, does it?
Let's then use a sample dataset of any schema and add a timestamp column.
val messages = spark.range(3)
scala> messages.printSchema
root
|-- id: long (nullable = false)
val withTs = messages.withColumn("timestamp", current_timestamp())
scala> withTs.printSchema
root
|-- id: long (nullable = false)
|-- timestamp: timestamp (nullable = false)
That gives you a dataset with a timestamp column.
The following line in your code should work, too (you don't need lit).
val xDF = thDF.withColumn("created", lit(current_timestamp())) //This does not get cast to TimestampType
What do you mean by "This does not get cast to TimestampType"? How did you verify that? Are you perhaps confusing Spark's TimestampType with Cassandra's timestamp type? The Spark Cassandra connector should handle that mapping.
Let's give that a try:
val litTs = spark.range(3).withColumn("ts", lit(current_timestamp))
scala> litTs.printSchema
root
|-- id: long (nullable = false)
|-- ts: timestamp (nullable = false)
import org.apache.spark.sql.types._
val dataType = litTs.schema("ts").dataType
assert(dataType.isInstanceOf[TimestampType])
scala> println(dataType)
TimestampType
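So the timestamp column itself is not the problem. If you want to keep the Row-based createDataFrame approach from the question, one way to reconcile it with the schemas is to stop declaring created in threat_9_0 and traffic_9_0 (so the schema width matches the 69 parsed values seen in the error) and add the column with current_timestamp() right before writing each batch. A sketch of that idea, assuming the created field has been removed from both StructTypes and that the Cassandra table has a matching created timestamp column; this is not the only possible fix, and the threat branch would need the same treatment once its parsed field count is confirmed.

// Inside foreachBatch, after building the batch DataFrame from syslogRDD:
val traffic_9_0_DF = spark.sqlContext.createDataFrame(
    syslogRDD.filter(x => deviceOSMapping.value(x._1).equals("9.0") & x._2.equals("traffic"))
      .map(x => Row.fromSeq(x._3)),
    formatPA.traffic_9_0)                        // schema now declares 69 columns, no "created"
  .withColumn("created", current_timestamp())    // added here, typed as TimestampType

traffic_9_0_DF.write
  .cassandraFormat(configHelper.cassandra_table_syslog, configHelper.cassandra_keyspace)
  .mode("append")
  .save()

Also note that the stream that is actually written in the question is msg.writeStream, while newDF (the frame that does carry the created column) is never used; whichever approach you take, the DataFrame that holds the timestamp has to be the one that reaches the Cassandra write.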
Related
Read a large zstd file with Spark (Scala)
Scala : Reading data from csv with columns have null values
org.apache.spark.sql.AnalysisException: Queries with streaming sources must be executed with writeStream.start() kafka
Spark: java.lang.IllegalArgumentException: requirement failed kmeans (mllib)
Spark DataFrame not respecting schema and considering everything as String