Scala Spark | IntelliJ compilation error on .read and .write - scala

I'm new to Scala and Spark. I'm trying to read a CSV file from S3 and write it into Cassandra. The code below gives compilation errors in IntelliJ.
I'm not sure what I am doing wrong. I appreciate your input in advance.
Scala: 2.11.12
Spark: 2.4.3
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType, IntegerType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.cassandra._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions.col
import com.datastax.spark.connector._
import org.apache.spark.sql.DataFrame
import org.apache.spark.rdd.RDD
object OrderAdjustmentLoad {
  def main(args: Array[String]) = {
    val awsAccessKeyId: String = args(0)
    val awsSecretAccessKey: String = args(1)
    val csvFilePath: String = args(2)
    val host: String = args(3)
    val username: String = args(4)
    val password: String = args(5)
    val keyspace: String = args(6)

    println("length args: " + args.length)

    val conf = new SparkConf(true)
      .set("fs.s3n.awsAccessKeyId", awsAccessKeyId)
      .set("fs.s3n.awsSecretAccessKey", awsSecretAccessKey)
      .set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
      .set("spark.cassandra.connection.host", host)
      .set("spark.cassandra.connection.port", "9042")
      .set("spark.cassandra.auth.username", username)
      .set("spark.cassandra.auth.password", password)

    val sc = new SparkContext(conf)

    val schemaHdr = StructType(
      StructField("a2z_name", StringType) ::
      StructField("a2z_key", StringType) ::
      StructField("a2z_id", IntegerType) :: Nil
    )

    val df = sc
      .read
      .format("csv")
      .option("header", "true")
      .option("delimiter", "\t")
      .option("quote", "\"")
      .schema(schemaHdr)
      .load("s3n://" + csvFilePath)

    println(df.count())

    val sqlContext = new SQLContext(sc)

    df.write
      .format("org.apache.spark.sql.cassandra")
      .option("keyspace", "poc_sparkjob")
      .option("table", "a2z")
      .mode(org.apache.spark.sql.SaveMode.Append)
      .save

    sc.stop()
  }
}
Errors
Cannot resolve symbol read
Cannot resolve symbol write
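For reference, a minimal sketch (not part of the original post) of where these methods actually live: .read is a member of SparkSession (or SQLContext), not SparkContext, and .write is a member of the resulting DataFrame, which is consistent with the unresolved-symbol errors above. The sketch reuses conf, schemaHdr and csvFilePath from the snippet above.
import org.apache.spark.sql.SparkSession

// Sketch only: build a SparkSession from the same conf and go through it instead of sc.
val spark = SparkSession.builder().config(conf).getOrCreate()

val df = spark.read                      // DataFrameReader comes from SparkSession
  .format("csv")
  .option("header", "true")
  .schema(schemaHdr)
  .load("s3n://" + csvFilePath)

df.write                                 // DataFrameWriter comes from the DataFrame
  .format("org.apache.spark.sql.cassandra")
  .option("keyspace", "poc_sparkjob")
  .option("table", "a2z")
  .mode(org.apache.spark.sql.SaveMode.Append)
  .save()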

Related

Cannot convert an RDD to Dataframe

I've converted a dataframe to an RDD:
val rows: RDD[Row] = df.orderBy($"Date").rdd
And now I'm trying to convert it back:
val df2 = spark.createDataFrame(rows)
But I'm getting an error:
Edit:
rows.toDF()
Also produces an error:
Cannot resolve symbol toDF
Even though I included this line earlier:
import spark.implicits._
Full code:
import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.util._
import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.rdd._
object Playground {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("Playground")
      .config("spark.master", "local")
      .getOrCreate()
    import spark.implicits._

    val sc = spark.sparkContext

    val df = spark.read.csv("D:/playground/mre.csv")
    df.show()

    val rows: RDD[Row] = df.orderBy($"Date").rdd
    val df2 = spark.createDataFrame(rows)
    rows.toDF()
  }
}
Your IDE is right, SparkSession.createDataFrame needs a second parameter: either a bean class or a schema.
This will fix your problem:
val df2 = spark.createDataFrame(rows, df.schema)
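As for the toDF error: spark.implicits._ only provides toDF for RDDs of tuples or case classes, for which an Encoder exists; there is no implicit Encoder for Row, so rows.toDF() cannot resolve. A minimal sketch of the typed alternative (column names and accessors are assumed for illustration):
// Sketch only: map each Row to a tuple, then toDF works via spark.implicits._
val typed = rows.map(r => (r.getString(0), r.getString(1)))
val df3 = typed.toDF("Date", "Value")   // hypothetical column names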

Unable to create multiple files using foreachBatch in spark (This Code Works Now)

I want to save files to multiple destinations using foreachBatch. The code runs fine, but foreachBatch isn't behaving the way I wanted.
Kindly help me with this if you have any clue.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql._
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.streaming._
import org.apache.spark.storage.StorageLevel
object multiDestination {

  val spark = SparkSession.builder()
    .master("local")
    .appName("Writing data to multiple destinations")
    .getOrCreate()

  def main(args: Array[String]): Unit = {

    val mySchema = StructType(Array(
      StructField("Id", IntegerType),
      StructField("Name", StringType)
    ))

    val askDF = spark
      .readStream
      .format("csv")
      .option("header", "true")
      .schema(mySchema)
      .load("/home/amulya/Desktop/csv/")

    //println(askDF.show())
    println(askDF.isStreaming)

    askDF.writeStream.foreachBatch { (askDF: DataFrame, batchId: Long) =>
      askDF.persist()
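The snippet above cuts off inside the foreachBatch body. For reference, a minimal sketch (not from the original post, with hypothetical output paths and checkpoint location) of the usual pattern for writing each micro-batch to several destinations:
// Sketch only: cache the micro-batch, write it to each sink, release it, then start the query.
val query = askDF.writeStream
  .foreachBatch { (batchDF: DataFrame, batchId: Long) =>
    batchDF.persist()
    batchDF.write.format("parquet").mode("append").save("/tmp/out/parquet")                 // hypothetical path
    batchDF.write.format("csv").option("header", "true").mode("append").save("/tmp/out/csv") // hypothetical path
    batchDF.unpersist()
    ()
  }
  .option("checkpointLocation", "/tmp/out/checkpoint")  // hypothetical path
  .start()

query.awaitTermination()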

HBASE SPARK Query with filter without load all the hbase

I have to query HBase and then work with the data in Spark and Scala.
My problem is that with my current solution I load ALL the data from my HBase table and then filter it. That is not efficient because it takes too much memory. I would like to apply the filter directly; how can I do that?
def HbaseSparkQuery(table: String, gatewayINPUT: String, sparkContext: SparkContext): DataFrame = {

  val sqlContext = new SQLContext(sparkContext)
  import sqlContext.implicits._

  val conf = HBaseConfiguration.create()
  val tableName = table

  conf.set("hbase.zookeeper.quorum", "localhost")
  conf.set("hbase.master", "localhost:60000")
  conf.set(TableInputFormat.INPUT_TABLE, tableName)

  val hBaseRDD = sparkContext.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

  val DATAFRAME = hBaseRDD.map(x => {
    (Bytes.toString(x._2.getValue(Bytes.toBytes("header"), Bytes.toBytes("gatewayIMEA"))),
      Bytes.toString(x._2.getValue(Bytes.toBytes("header"), Bytes.toBytes("eventTime"))),
      Bytes.toString(x._2.getValue(Bytes.toBytes("node"), Bytes.toBytes("imei"))),
      Bytes.toString(x._2.getValue(Bytes.toBytes("measure"), Bytes.toBytes("rssi"))))
  }).toDF()
    .withColumnRenamed("_1", "GatewayIMEA")
    .withColumnRenamed("_2", "EventTime")
    .withColumnRenamed("_3", "ap")
    .withColumnRenamed("_4", "RSSI")
    .filter($"GatewayIMEA" === gatewayINPUT)

  DATAFRAME
}
As you can see in my code, I apply the filter after the DataFrame is created, i.e. after all the HBase data has been loaded.
Thank you in advance for your answers.
Here is the solution I found
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.filter._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil
object HbaseConnector {

  def main(args: Array[String]): Unit = {

    // System.setProperty("hadoop.home.dir", "/usr/local/hadoop")
    val sparkConf = new SparkConf().setAppName("CoverageAlgPipeline").setMaster("local[*]")
    val sparkContext = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sparkContext)
    import sqlContext.implicits._

    val spark = org.apache.spark.sql.SparkSession.builder
      .master("local")
      .appName("Coverage Algorithm")
      .getOrCreate

    val GatewayIMEA = "123"
    val TABLE_NAME = "TABLE"

    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "localhost")
    conf.set("hbase.master", "localhost:60000")
    conf.set(TableInputFormat.INPUT_TABLE, TABLE_NAME)

    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf(TABLE_NAME))

    val scan = new Scan

    val GatewayIDFilter = new SingleColumnValueFilter(Bytes.toBytes("header"), Bytes.toBytes("gatewayIMEA"), CompareFilter.CompareOp.EQUAL, Bytes.toBytes(String.valueOf(GatewayIMEA)))
    scan.setFilter(GatewayIDFilter)

    conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan))

    val hBaseRDD = sparkContext.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

    val DATAFRAME = hBaseRDD.map(x => {
      (Bytes.toString(x._2.getValue(Bytes.toBytes("header"), Bytes.toBytes("gatewayIMEA"))),
        Bytes.toString(x._2.getValue(Bytes.toBytes("header"), Bytes.toBytes("eventTime"))),
        Bytes.toString(x._2.getValue(Bytes.toBytes("node"), Bytes.toBytes("imei"))),
        Bytes.toString(x._2.getValue(Bytes.toBytes("measure"), Bytes.toBytes("Measure"))))
    }).toDF()
      .withColumnRenamed("_1", "GatewayIMEA")
      .withColumnRenamed("_2", "EventTime")
      .withColumnRenamed("_3", "ap")
      .withColumnRenamed("_4", "measure")

    DATAFRAME.show()
  }
}
What is done here is: set the input table, set the filter, run the scan with the filter, load the scan results into an RDD, and then transform the RDD into a DataFrame (optional).
To apply multiple filters:
val timestampFilter = new SingleColumnValueFilter(Bytes.toBytes("header"), Bytes.toBytes("eventTime"), CompareFilter.CompareOp.GREATER, Bytes.toBytes(String.valueOf(dateOfDayTimestamp)))
val GatewayIDFilter = new SingleColumnValueFilter(Bytes.toBytes("header"), Bytes.toBytes("gatewayIMEA"), CompareFilter.CompareOp.EQUAL, Bytes.toBytes(String.valueOf(GatewayIMEA)))
val filters = new FilterList(GatewayIDFilter, timestampFilter)
scan.setFilter(filters)
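A small sketch (not from the original answer) in case OR semantics are wanted instead of the default AND (FilterList combines filters with Operator.MUST_PASS_ALL unless told otherwise):
// Sketch only: combine the two filters with OR instead of the default AND.
val anyOf = new FilterList(FilterList.Operator.MUST_PASS_ONE)
anyOf.addFilter(GatewayIDFilter)
anyOf.addFilter(timestampFilter)
scan.setFilter(anyOf)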
You can also use a Spark-HBase connector with predicate pushdown, e.g. https://spark-packages.org/package/Huawei-Spark/Spark-SQL-on-HBase

Spark Scala Cassandra CSV insert into cassandra

Here is the code below:
Scala Version: 2.11.
Spark Version: 2.0.2.6
Cassandra Version: cqlsh 5.0.1 | Cassandra 3.11.0.1855 | DSE 5.1.3 | CQL spec 3.4.4 | Native protocol v4
I am trying to read from a CSV file and write to a Cassandra table. I am new to Scala and Spark. Please correct me where I am going wrong.
import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}
import com.datastax
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import com.datastax.spark.connector._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.apache.spark.sql._
import com.datastax.spark.connector.UDTValue
import com.datastax.spark.connector.mapper.DefaultColumnMapper
object dataframeset {

  def main(args: Array[String]): Unit = {

    // Cassandra Part
    val conf = new SparkConf().setAppName("Sample1").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")

    val rdd1 = sc.cassandraTable("tdata", "map")
    rdd1.collect().foreach(println)

    // Scala Read CSV Part
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    val spark1 = org.apache.spark.sql.SparkSession
      .builder()
      .master("local")
      .appName("Spark SQL basic example")
      .getOrCreate()

    val df = spark1.read.format("csv")
      .option("header", "true")
      .option("inferschema", "true")
      .load("/Users/tom/Desktop/del2.csv")

    import spark1.implicits._
    df.printSchema()

    val dfprev = df.select(col = "Year", "Measure").filter("Category = 'Prevention'")
    // dfprev.collect().foreach(println)

    val a = dfprev.select("YEAR")
    val b = dfprev.select("Measure")

    val collection = sc.parallelize(Seq(a, b))
    collection.saveToCassandra("tdata", "map", SomeColumns("sno", "name"))

    spark1.stop()
  }
}
Error:
Exception in thread "main" java.lang.IllegalArgumentException: Multiple constructors with the same number of parameters not allowed.
Cassandra Table
cqlsh:tdata> desc map
CREATE TABLE tdata.map (
    sno int PRIMARY KEY,
    name text
);
I know I am missing something, especially when trying to write the entire DataFrame into Cassandra in one shot, but I don't know what needs to be done either.
Thanks
tom
You can directly write a DataFrame (Dataset[Row] in Spark 2.x) to Cassandra.
You will have to set the Cassandra host, username and password (if authentication is enabled) in the Spark conf to connect to Cassandra, using something like:
val conf = new SparkConf(true)
  .set("spark.cassandra.connection.host", "CASSANDRA_HOST")
  .set("spark.cassandra.auth.username", "CASSANDRA_USERNAME")
  .set("spark.cassandra.auth.password", "CASSANDRA_PASSWORD")
OR
val spark1 = org.apache.spark.sql.SparkSession
  .builder()
  .master("local")
  .config("spark.cassandra.connection.host", "CASSANDRA_HOST")
  .config("spark.cassandra.auth.username", "CASSANDRA_USERNAME")
  .config("spark.cassandra.auth.password", "CASSANDRA_PASSWORD")
  .appName("Spark SQL basic example")
  .getOrCreate()

val dfprev = df.filter("Category = 'Prevention'").select(col("Year").as("yearAdded"), col("Measure").as("Recording"))

dfprev.write
  .format("org.apache.spark.sql.cassandra")
  .options(Map("table" -> "map", "keyspace" -> "tdata"))
  .save()
Dataframe in spark-cassandra-connector
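As a follow-up usage note, a minimal sketch (table and keyspace taken from the question) of reading the same Cassandra table back as a DataFrame through the connector:
// Sketch only: read the Cassandra table back through the DataFrame API.
val mapDF = spark1.read
  .format("org.apache.spark.sql.cassandra")
  .options(Map("table" -> "map", "keyspace" -> "tdata"))
  .load()
mapDF.show()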

Spark 2.0 - Convert DataFrame to DataSet

I want to load my data and do some basic linear regression on it. So first, I need to use VectorAssembler to produce my features column. However, when I use assembler.transform(df), df is a DataFrame, and it expects a DataSet. I tried df.toDS, but it gives value toDS is not a member of org.apache.spark.sql.DataFrame. Indeed, it is a member of org.apache.spark.sql.DatasetHolder.
What am I getting wrong here?
package main.scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.DatasetHolder
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
object Analyzer {
  def main(args: Array[String]) {

    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val df = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "false")
      .option("delimiter", "\t")
      .option("parserLib", "UNIVOCITY")
      .option("inferSchema", "true")
      .load("data/snap/*")

    val assembler = new VectorAssembler()
      .setInputCols(Array("own", "want", "wish", "trade", "comment"))
      .setOutputCol("features")

    val df1 = assembler.transform(df)

    val formula = new RFormula().setFormula("rank ~ own + want + wish + trade + comment")
      .setFeaturesCol("features")
      .setLabelCol("rank")
  }
}
Apparently the problem was that I was still using the Spark 1.6 style with SQLContext. I changed to SparkSession, and transform() was then able to accept the DataFrame directly.
package main.scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.Dataset
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
object Analyzer {
  def main(args: Array[String]) {

    val spark = SparkSession.builder().getOrCreate()
    import spark.implicits._

    val df = spark.read
      .format("com.databricks.spark.csv")
      .option("header", "false")
      .option("delimiter", "\t")
      .option("parserLib", "UNIVOCITY")
      .option("inferSchema", "true")
      .load("data/snap/*")

    df.show()

    val assembler = new VectorAssembler()
      .setInputCols(Array("own", "want", "wish", "trade", "comment"))
      .setOutputCol("features")

    val df1 = assembler.transform(df)
  }
}
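As a follow-up sketch (not from the original answer, and assuming the label column rank exists as in the question's RFormula), the assembled features can then be fed to LinearRegression; transform accepts the DataFrame because in Spark 2.x DataFrame is simply an alias for Dataset[Row]:
// Sketch only: fit a basic linear regression on the assembled features.
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("rank")          // assumed label column from the question's formula

val model = lr.fit(df1)
model.transform(df1).select("rank", "prediction").show()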