Error with spark Row.fromSeq for a text file - scala

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
/**
 * Reads the fixed-width text file in/fruits.txt and shows it as a DataFrame
 * with four nullable string columns: id, fruitName, isAvailable, unitPrice.
 */
object fixedLength {
def main(args:Array[String]) {
// Cuts one input line into four fields at fixed character offsets:
// [0,3), [3,13), [13,18) and [18,22), and wraps them in a Row.
// NOTE(review): String.substring(18,22) needs a line of at least 22 characters.
def getRow(x : String) : Row={
val columnArray = new Array[String](4)
columnArray(0)=x.substring(0,3)
columnArray(1)=x.substring(3,13)
columnArray(2)=x.substring(13,18)
columnArray(3)=x.substring(18,22)
Row.fromSeq(columnArray)
}
// Raise the threshold of the "org" logger to ERROR.
Logger.getLogger("org").setLevel(Level.ERROR)
// First entry point: a SparkSession with master "local".
val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()
// Second entry point: a SparkContext constructed separately from its own SparkConf (master "local[*]").
val conf = new SparkConf().setAppName("FixedLength").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true");
val sc = new SparkContext(conf)
// The RDD of lines is created through `sc`, not through `spark`.
val fruits = sc.textFile("in/fruits.txt")
// Schema: one nullable StringType field per comma-separated name.
val schemaString = "id,fruitName,isAvailable,unitPrice";
val fields = schemaString.split(",").map( field => StructField(field,StringType,nullable=true))
val schema = StructType(fields)
// NOTE(review): the DataFrame is built by `spark` from an RDD that belongs to `sc`,
// i.e. the two separately constructed entry points are mixed in this call.
val df = spark.createDataFrame(fruits.map { x => getRow(x)} , schema)
df.show() // Error
println("End of the program")
}
}
I'm getting error in the df.show() command.
My file content is
56 apple TRUE 0.56
45 pear FALSE1.34
34 raspberry TRUE 2.43
34 plum TRUE 1.31
53 cherry TRUE 1.4
23 orange FALSE2.34
56 persimmon FALSE23.2
ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.ClassCastException: org.apache.spark.util.SerializableConfiguration cannot be cast to [B
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:81)
Can you please help?

You are creating the RDD the old way, through a SparkContext built with new SparkContext(conf):
val conf = new SparkConf().setAppName("FixedLength").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true");
val sc = new SparkContext(conf)
val fruits = sc.textFile("in/fruits.txt")
whereas you are creating the DataFrame the new way, through a SparkSession:
val spark = SparkSession.builder().master("local").appName("ReadingCSV").getOrCreate()
val df = spark.createDataFrame(fruits.map { x => getRow(x)} , schema)
Ultimately, you are mixing an RDD created with the old SparkContext API with a DataFrame created by the new SparkSession, so two separate contexts are involved.
I would suggest that you use only one entry point.
I guess that is the reason for the issue.
Update
doing the following should work for you
/**
 * Splits one fixed-width line into the four string columns of the schema.
 *
 * Column offsets are [0,3), [3,13), [13,18) and [18,22). Offsets are clamped to the
 * length of the line, so a line shorter than 22 characters yields shortened or empty
 * trailing fields instead of a StringIndexOutOfBoundsException.
 *
 * @param x one line of the input file
 * @return a Row holding the four fields, in schema order
 */
def getRow(x : String) : Row={
  // (start, end) character offsets of each column; end is exclusive.
  val bounds = Seq((0, 3), (3, 13), (13, 18), (18, 22))
  // StringOps.slice clamps out-of-range indices, whereas String.substring throws.
  val columns = bounds.map { case (from, until) => x.slice(from, until) }
  Row.fromSeq(columns)
}
// Only log messages at ERROR level from the "org" logger hierarchy.
Logger.getLogger("org").setLevel(Level.ERROR)

// A single SparkSession is the only entry point; the RDD below is read
// through the SparkContext that this session already owns.
val spark = SparkSession
  .builder()
  .master("local")
  .appName("ReadingCSV")
  .getOrCreate()
val fruits = spark.sparkContext.textFile("in/fruits.txt")

// One nullable string column per comma-separated name.
val schemaString = "id,fruitName,isAvailable,unitPrice"
val fields = for (name <- schemaString.split(",")) yield StructField(name, StringType, nullable = true)
val schema = StructType(fields)

// Parse every line into a Row and attach the schema.
val df = spark.createDataFrame(fruits.map(line => getRow(line)), schema)

Related

How to convert RDD[org.apache.spark.sql.Row] to RDD[org.apache.spark.mllib.linalg.Vector]

I am trying to convert RDD[Row] to RDD[Vector] but it throws exception stating
java.lang.ClassCastException: org.apache.spark.ml.linalg.DenseVector cannot be cast to org.apache.spark.mllib.linalg.Vector
My code is
val spark = SparkSession.builder().master("local").getOrCreate()

// Ten ids with one uniform and two normally distributed random columns.
val df = spark
  .range(0, 10)
  .withColumn("uniform", rand(10L))
  .withColumn("normal1", randn(10L))
  .withColumn("normal2", randn(11L))

// Pack the three numeric columns into a single "features" vector column.
val assembler = new VectorAssembler()
  .setInputCols(Array("uniform", "normal1", "normal2"))
  .setOutputCol("features")
val dfVec = assembler.transform(df)

// Append one hand-made outlier row (id 10); note it is built with the mllib Vectors factory.
val outlier = spark.createDataFrame(Seq((10, org.apache.spark.mllib.linalg.Vectors.dense(3, 3, 3))))
val dfOutlier = dfVec.select("id", "features").union(outlier)
dfOutlier.show(false)

// Standardise "features" (centre and scale) into the "Scaled" column.
val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("Scaled")
  .setWithStd(true)
  .setWithMean(true)
val model = scaler.fit(dfOutlier).transform(dfOutlier)
model.show(false)

// NOTE(review): this cast targets the mllib Vector type; it is the cast named in the
// ClassCastException reported for this snippet.
val dfVecRdd = model
  .select("Scaled")
  .rdd
  .map(row => row.get(0).asInstanceOf[org.apache.spark.mllib.linalg.Vector])
When I perform action on dfVecRdd, exception is raised. How can I solve this?
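The exception message shows that the "Scaled" column holds org.apache.spark.ml.linalg vectors, so casting them to the mllib type fails. Try to remove this import (and the cast to it) from your code
org.apache.spark.mllib.linalg.Vector
and use the types of the ml package instead (cast to org.apache.spark.ml.linalg.Vector), starting with this import
import org.apache.spark.ml.linalg.Vectors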

HBASE SPARK Query with filter without load all the hbase

I have to query HBASE and then work with the data with spark and scala.
My problem is that my current solution loads ALL the data of my HBase table and only then filters it, which is inefficient because it takes too much memory. I would like to apply the filter directly while reading; how can I do that?
/**
 * Loads four cells of every row of an HBase table into a DataFrame and keeps
 * only the rows that belong to one gateway.
 *
 * @param table        name of the HBase table to read
 * @param gatewayINPUT value the "GatewayIMEA" column must equal
 * @param sparkContext context used to create the SQLContext and the HBase RDD
 * @return DataFrame with the columns GatewayIMEA, EventTime, ap and RSSI
 */
def HbaseSparkQuery(table: String, gatewayINPUT: String, sparkContext: SparkContext): DataFrame = {
  val sqlContext = new SQLContext(sparkContext)
  import sqlContext.implicits._

  val hbaseConf = HBaseConfiguration.create()
  hbaseConf.set("hbase.zookeeper.quorum", "localhost")
  hbaseConf.set("hbase.master", "localhost:60000")
  hbaseConf.set(TableInputFormat.INPUT_TABLE, table)

  val scanned = sparkContext.newAPIHadoopRDD(
    hbaseConf,
    classOf[TableInputFormat],
    classOf[ImmutableBytesWritable],
    classOf[Result])

  // Each HBase result becomes a 4-tuple of cell values decoded as strings.
  val cells = scanned.map { case (_, result) =>
    val read = (family: String, qualifier: String) =>
      Bytes.toString(result.getValue(Bytes.toBytes(family), Bytes.toBytes(qualifier)))
    (read("header", "gatewayIMEA"), read("header", "eventTime"), read("node", "imei"), read("measure", "rssi"))
  }

  // No scan filter is set on the configuration, so the equality filter
  // is applied on the DataFrame after the rows have been read.
  cells
    .toDF("GatewayIMEA", "EventTime", "ap", "RSSI")
    .filter($"GatewayIMEA" === gatewayINPUT)
}
As you can see in my code, I do the filter after the creation of the dataframe, after the loading of Hbase data ..
Thank you in advance for your answers
Here is the solution I found
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.filter._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil
/**
 * Reads an HBase table into a DataFrame with the row filter carried by the HBase
 * scan itself, so the equality test is part of the scan rather than a DataFrame
 * filter applied afterwards.
 *
 * Steps: configure the input table, build a Scan with a SingleColumnValueFilter,
 * serialise the Scan into the configuration, read it as an RDD and (optionally)
 * convert the RDD to a DataFrame.
 */
object HbaseConnector {
  // HBaseConfiguration lives in the root hbase package, which the client/filter
  // wildcard imports do not cover.
  import org.apache.hadoop.hbase.HBaseConfiguration

  def main(args: Array[String]): Unit = {
    // System.setProperty("hadoop.home.dir", "/usr/local/hadoop")
    val sparkConf = new SparkConf().setAppName("CoverageAlgPipeline").setMaster("local[*]")
    val sparkContext = new SparkContext(sparkConf)
    try {
      val sqlContext = new SQLContext(sparkContext)
      import sqlContext.implicits._

      val GatewayIMEA = "123"
      val TABLE_NAME = "TABLE"

      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", "localhost")
      conf.set("hbase.master", "localhost:60000")
      // TableInputFormat takes the table name from the configuration, so no separate
      // HBase Connection/Table is opened here (one that is never closed would leak).
      conf.set(TableInputFormat.INPUT_TABLE, TABLE_NAME)

      // Keep only rows whose header:gatewayIMEA cell equals GatewayIMEA.
      val scan = new Scan
      val GatewayIDFilter = new SingleColumnValueFilter(
        Bytes.toBytes("header"),
        Bytes.toBytes("gatewayIMEA"),
        CompareFilter.CompareOp.EQUAL,
        Bytes.toBytes(GatewayIMEA))
      scan.setFilter(GatewayIDFilter)
      // The filtered Scan travels to the input format as a string property.
      conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan))

      val hBaseRDD = sparkContext.newAPIHadoopRDD(
        conf,
        classOf[TableInputFormat],
        classOf[ImmutableBytesWritable],
        classOf[Result])

      // Decode four cells of every returned row as strings and name the columns.
      val DATAFRAME = hBaseRDD.map { case (_, result) =>
        (Bytes.toString(result.getValue(Bytes.toBytes("header"), Bytes.toBytes("gatewayIMEA"))),
          Bytes.toString(result.getValue(Bytes.toBytes("header"), Bytes.toBytes("eventTime"))),
          Bytes.toString(result.getValue(Bytes.toBytes("node"), Bytes.toBytes("imei"))),
          Bytes.toString(result.getValue(Bytes.toBytes("measure"), Bytes.toBytes("Measure"))))
      }.toDF("GatewayIMEA", "EventTime", "ap", "measure")

      DATAFRAME.show()
    } finally {
      // Release the local Spark resources even if the read fails.
      sparkContext.stop()
    }
  }
}
What is done here: set the input table, build the filter, attach it to the scan, read the scan result into an RDD, and then (optionally) transform the RDD into a DataFrame.
To do multiple filters :
// Rows whose header:eventTime cell compares GREATER than the given timestamp.
// NOTE(review): both sides are compared as the bytes of their string form —
// confirm eventTime is stored that way.
val timestampFilter = new SingleColumnValueFilter(
  Bytes.toBytes("header"),
  Bytes.toBytes("eventTime"),
  CompareFilter.CompareOp.GREATER,
  Bytes.toBytes(String.valueOf(dateOfDayTimestamp)))

// Rows whose header:gatewayIMEA cell equals the wanted gateway id.
val GatewayIDFilter = new SingleColumnValueFilter(
  Bytes.toBytes("header"),
  Bytes.toBytes("gatewayIMEA"),
  CompareFilter.CompareOp.EQUAL,
  Bytes.toBytes(String.valueOf(GatewayIMEA)))

// Both filters are handed to the scan as one FilterList.
val filters = new FilterList(GatewayIDFilter, timestampFilter)
scan.setFilter(filters)
You can also use a Spark-HBase connector with predicate pushdown, e.g. https://spark-packages.org/package/Huawei-Spark/Spark-SQL-on-HBase

nanoseconds truncated while inserting data into hive from Spark

While inserting data into a Hive TimestampType column from Spark, the nanoseconds are truncated. Does anyone have a solution for this? I have tried writing to both ORC and CSV formats on Hive.
CSV: it appeared as 2018-03-20T13:04:20.123Z
ORC: 2018-03-20 13:04:20.123456
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.StructField
import java.util.Date
import org.apache.spark.sql.Row
import java.math.{BigDecimal,MathContext,RoundingMode}
/**
 * Writes a single test row (an integer id, a timestamp and a decimal) to an ORC
 * folder, so that the stored timestamp and decimal values can be inspected.
 * @author Shefali.Nema
 * @since 1.0.0
 */
object testDateAndDecimal {
  def main(args: Array[String]): Unit = execute

  /** Builds the one-row DataFrame and appends it to the "testDecimal" ORC folder. */
  private def execute: Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("Test"))
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    // Sample values: the timestamp literal has nine fractional digits,
    // the decimal literal has ten.
    val dt = java.sql.Timestamp.valueOf("2018-03-20 13:04:20.123456789")
    val id = 1
    val price = new BigDecimal("1234567890.1234567899")
    System.out.println("\n###################################################price###################################" + price + "\n")
    System.out.println("\n###################################################dt###################################" + dt + "\n")

    // The column called "name" carries the timestamp; "price" is declared as DecimalType(18,8).
    val schema = StructType(Seq(
      StructField("id", IntegerType, true),
      StructField("name", TimestampType, true),
      StructField("price", DecimalType(18, 8), true)))

    // One Row -> one-element RDD -> DataFrame -> appended as ORC.
    val rdd = sc.makeRDD(List(Row.fromSeq(List(id, dt, price))))
    val hiveRowsDF = sqlContext.createDataFrame(rdd, schema)
    hiveRowsDF.write.mode(org.apache.spark.sql.SaveMode.Append).orc("testDecimal")
  }
}

How to convert matrix to DataFrame with scala/spark?

I have a matrix whose number of columns and rows is unknown.
One example Matrix is:
[5,1.3]
[1,5.2]
I want to convert it to a DataFrame; the column names can be arbitrary. How can I achieve this?
This is my expected result:
+-------------+----+
| _1 | _2 |
+-------------+----+
|5 |1.3 |
|1 |5.2 |
--------------------
I suggest you convert the matrix to an RDD and then convert the RDD to a DataFrame. It is not an elegant way, but it works fine in Spark 2.0.0.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.mllib.linalg._
import org.apache.spark.rdd.RDD
/**
 * Example: converts an MLlib matrix of any size into a DataFrame whose columns
 * are named _1 .. _n, one DataFrame row per matrix row.
 */
object mat2df {
  // Local imports keep this example self-contained.
  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

  /**
   * Turns a matrix into a DataFrame of doubles.
   *
   * The number of columns is taken from the matrix, so nothing about its shape
   * is hard-coded; a matrix with zero rows gives an empty DataFrame.
   *
   * @param spark session used to create the RDD and the DataFrame
   * @param m     matrix to convert
   * @return DataFrame with m.numCols non-nullable double columns named _1 .. _n
   */
  def toDataFrame(spark: SparkSession, m: Matrix): DataFrame = {
    // rowIter walks the matrix row by row, whatever its internal storage order.
    val rows = spark.sparkContext
      .parallelize(m.rowIter.toList)
      .map(v => Row.fromSeq(v.toArray.toSeq))
    val schema = StructType(
      (1 to m.numCols).map(i => StructField(s"_$i", DoubleType, nullable = false)))
    spark.createDataFrame(rows, schema)
  }

  def main(args: Array[String]): Unit = {
    // One SparkSession is the single entry point (no separate SparkContext).
    val spark = SparkSession.builder.appName("mat2df").master("local[1]").getOrCreate()
    try {
      // Values are given column by column: columns (5, 1) and (1.3, 5.2).
      val values = Array(5, 1, 1.3, 5.2)
      val mat = Matrices.dense(2, 2, values)
      val df = toDataFrame(spark, mat) // matrix -> rdd -> dataframe
      df.show()
    } finally {
      spark.stop()
    }
  }
}
/**
 * Converts an MLlib matrix into a DataFrame of doubles, one DataFrame row per
 * matrix row and one column per matrix column.
 *
 * @param sc            context used to distribute the matrix rows
 * @param matrix        matrix to convert
 * @param m_nodeColName prefix of the generated column names; column i is
 *                      named "<prefix>_<i>", with i starting at 0
 * @return DataFrame with matrix.numCols nullable double columns
 */
def matrixToDataFrame(sc:SparkContext, matrix:Matrix, m_nodeColName:String):DataFrame={
  // Session wrapping the caller's context. It gets its own name so that it does
  // not shadow the `sc` parameter, which is still needed for parallelize.
  val spark = SparkSession.builder().config(sc.getConf).getOrCreate()

  // Iterate over rows (not columns) so the DataFrame is not the transpose of the matrix.
  val rows = sc.parallelize(matrix.rowIter.toList).map(v => Row.fromSeq(v.toArray.toSeq))

  // One double column per matrix column.
  val schema = StructType(
    (0 until matrix.numCols).map(i => StructField(m_nodeColName + "_" + i, DoubleType, true)))

  spark.createDataFrame(rows, schema)
}

Creating a broadcast variable with SparkSession ? Spark 2.0

Is it possible to create broadcast variables with the sparkContext provided by SparkSession ? I keep getting an error under sc.broadcast , however in a different project when using the SparkContext from org.apache.spark.SparkContext I have no problems.
import org.apache.spark.sql.SparkSession
/**
 * Reads a GitHub archive JSON file, prints a few statistics about its push
 * events, then loads a set of employee names from a text file and tries to
 * broadcast it.
 */
object MyApp {
def main(args: Array[String]){
val spark = SparkSession.builder()
.appName("My App")
.master("local[*]")
.getOrCreate()
// NOTE(review): `sc` is bound to the value of the whole chained expression,
// i.e. to the result of setLogLevel("ERROR"), not to spark.sparkContext itself.
val sc = spark.sparkContext
.setLogLevel("ERROR")
val path = "C:\\Boxes\\github-archive\\2015-03-01-0.json"
val ghLog = spark.read.json(path)
// Keep only the events whose type is 'PushEvent'.
val pushes = ghLog.filter("type = 'PushEvent'")
pushes.printSchema()
println("All events: "+ ghLog.count)
println("Only pushes: "+pushes.count)
pushes.show(5)
// Number of pushes per actor login, then ordered by that count, descending.
val grouped = pushes.groupBy("actor.login").count()
grouped.show(5)
val ordered = grouped.orderBy(grouped("count").desc)
ordered.show(5)
import scala.io.Source.fromFile
val fileName= "ghEmployees.txt"
// Collect the trimmed lines of the file into a Set.
val employees = Set() ++ (
for {
line <- fromFile(fileName).getLines()
} yield line.trim
)
// This is the line the compiler rejects: broadcast is looked up on `sc` as bound above.
val bcEmployees = sc.broadcast(employees)
}
}
Or is it a problem of using a Set () instead of a Seq object ?
Thanks for any help
Edit:
I keep getting a "cannot resolve symbol broadcast" error message in IntelliJ.
After compiling I get this error:
Error:(47, 28) value broadcast is not a member of Unit
val bcEmployees = sc.broadcast(employees)
^
Your sc variable has type Unit because, according to the docs, setLogLevel has return type Unit. Do this instead:
// Bind the context on its own; the explicit type annotation makes the compiler
// check that `sc` really is a SparkContext.
val sc: SparkContext = spark.sparkContext
// Change the log level in a separate statement, so its result is not what gets assigned.
sc.setLogLevel("ERROR")
It is important to keep track of the types of your variables to catch errors earlier.