Following is a simple word count Spark app using the DataFrame API and the corresponding unit tests using spark-testing-base. It works if I use the following:
def toWords(linesDf: DataFrame) = {
  linesDf
    .select(linesDf("line"),
      explode(split(linesDf("line"), WhitespaceRegex)).as("word"))
}
But it doesn't work if I use the $ method to reference the columns, as shown below:
def toWords(linesDf: DataFrame) = {
  import spark.implicits._
  linesDf
    .select($"line",
      explode(split($"line", WhitespaceRegex)).as("word"))
}
Error
java.lang.NullPointerException was thrown.
java.lang.NullPointerException
at com.aravind.oss.eg.wordcount.spark.WordCountDFApp$.toWords(WordCountDFApp.scala:42)
at com.aravind.oss.eg.wordcount.spark.WordCountDFAppTestSpec2$$anonfun$1.apply$mcV$sp(WordCountDFAppTestSpec2.scala:32)
at com.aravind.oss.eg.wordcount.spark.WordCountDFAppTestSpec2$$anonfun$1.apply(WordCountDFAppTestSpec2.scala:17)
at com.aravind.oss.eg.wordcount.spark.WordCountDFAppTestSpec2$$anonfun$1.apply(WordCountDFAppTestSpec2.scala:17)
at org.scalatest.OutcomeOf$class.outcomeOf(OutcomeOf.scala:85)
at org.scalatest.OutcomeOf$.outcomeOf(OutcomeOf.scala:104)
Spark App
object WordCountDFApp extends App with Logging {
  logInfo("WordCount with Dataframe API")

  val paths = getPaths(args)
  val cluster = getClusterCfg(args)

  if (paths.size > 1) {
    logInfo("More than one file to process")
  }
  logInfo("Path(s): " + paths)
  logInfo("Cluster: " + cluster)

  val spark = getSparkSession("WordCountDFApp", cluster)

  val linesDf: DataFrame = spark.read
    .textFile(paths: _*)
    .toDF("line") //Dataset[Row]
  logInfo("DataFrame before splitting line")
  linesDf.show(false)

  import spark.implicits._
  import org.apache.spark.sql.functions._

  val wordsDf = toWords(linesDf)
  logInfo("Inferred schema")
  wordsDf.printSchema()
  logInfo("DataFrame after splitting the line into words")
  wordsDf.show(false)

  countWords(wordsDf).show(false)

  def toWords(linesDf: DataFrame) = {
    linesDf
      .select(linesDf("line"),
        explode(split(linesDf("line"), WhitespaceRegex)).as("word"))
  }
}
Test
class WordCountDFAppTestSpec2 extends FlatSpec with DataFrameSuiteBase {

  val input: Seq[String] = Seq(
    ("one"),
    ("two"),
    (""),
    ("three Three")
  )

  "toWords" should "split the file into words" in {
    val sqlCtx = sqlContext
    import sqlCtx.implicits._

    val sourceDf = input.toDF("line")
    // sourceDf.show(false)

    val expectedDF = Seq(
      ("one", "one"),
      ("two", "two"),
      ("", ""),
      ("three Three", "three"),
      ("three Three", "Three")
    ).toDF("line", "word")
    // expectedDF.show(false)

    val actualDF = WordCountDFApp.toWords(sourceDf)
    // actualDF.show(false)

    assertDataFrameEquals(actualDF, expectedDF)
  }
}
The main problem is that the implicits are not backed by a live session at runtime: because WordCountDFApp extends App, its spark val is still null when the test calls toWords directly, so the $ syntax pulled in by import spark.implicits._ throws a NullPointerException. You need to add this line:
import linesDf.sparkSession.implicits._
in your method, e.g.:
def toWords(linesDf: DataFrame) = {
  import linesDf.sparkSession.implicits._
  linesDf
    .select($"line",
      explode(split(linesDf("line"), WhitespaceRegex)).as("word"))
}
and that will fix the problem.
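As a side note, here is a workaround sketch that is not part of the answer above: the col function from org.apache.spark.sql.functions references columns by name without needing any implicits inside the method, reusing the question's WhitespaceRegex:

    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.functions.{col, explode, split}

    def toWords(linesDf: DataFrame): DataFrame = {
      // col("line") resolves the column without $ or linesDf("line")
      linesDf.select(
        col("line"),
        explode(split(col("line"), WhitespaceRegex)).as("word"))
    }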
You should import sqlContext.implicits to access the $ (dollar sign) syntax in your code:
import spark.sqlContext.implicits._
So your full imports look like this:
import spark.implicits._
import spark.sqlContext.implicits._
import org.apache.spark.sql.functions._
Related
The path given to the text file is correct, but I am still getting the error "Input path does not exist: file:/C:/Users/cmpil/Downloads/hunger_games.txt". Why is this happening?
import org.apache.spark.sql._
import org.apache.log4j._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object WordCountDataSet {
  case class Book(value: String)

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark = SparkSession
      .builder()
      .appName("WordCount")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    //Another way of doing it
    val bookRDD = spark.sparkContext.textFile("C:/Users/cmpil/Downloads/hunger_games.txt")
    val wordsRDD = bookRDD.flatMap(x => x.split("\\W+"))
    val wordsDS = wordsRDD.toDS()

    val lowercaseWordsDS = wordsDS.select(lower($"value").alias("word"))
    val wordCountsDS = lowercaseWordsDS.groupBy("word").count()
    val wordCountsSortedDS = wordCountsDS.sort("count")
    wordCountsSortedDS.show(wordCountsSortedDS.count().toInt)
  }
}
On Windows you have to use '\\' in place of '/'.
Try using "C:\\Users\\cmpil\\Downloads\\hunger_games.txt".
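For instance, the read line from the code above would become (a sketch of the suggested change, only the path separators differ):

    val bookRDD = spark.sparkContext.textFile("C:\\Users\\cmpil\\Downloads\\hunger_games.txt")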
I am trying to stream Twitter data using Apache Spark and I want to save it as a CSV file into HDFS. I understand that I have to convert it to a DataFrame, but I am not able to do so.
Here is my full code:
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.twitter.TwitterUtils
//import com.google.gson.Gson
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
//import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
//import org.apache.spark.sql.functions._
import sentimentAnalysis.sentimentScore

case class twitterCaseClass(userID: String = "", user: String = "", createdAt: String = "", text: String = "", sentimentType: String = "")

object twitterStream {
  //private val gson = new Gson()

  def main(args: Array[String]) {
    //Twitter API
    Logger.getLogger("org").setLevel(Level.ERROR)
    System.setProperty("twitter4j.oauth.consumerKey", "#######")
    System.setProperty("twitter4j.oauth.consumerSecret", "#######")
    System.setProperty("twitter4j.oauth.accessToken", "#######")
    System.setProperty("twitter4j.oauth.accessTokenSecret", "#######")

    val spark = SparkSession.builder().appName("twitterStream").master("local[*]").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    val streamContext = new StreamingContext(sc, Seconds(5))
    import spark.implicits._

    val filters = Array("Singapore")
    val filtered = TwitterUtils.createStream(streamContext, None, filters)
    val englishTweets = filtered.filter(_.getLang() == "en")
    englishTweets.print()

    val tweets = englishTweets.map { col =>
      (
        "userID" -> col.getId,
        "user" -> col.getUser.getScreenName,
        "createdAt" -> col.getCreatedAt.toInstant.toString,
        "text" -> col.getText.toLowerCase.split(" ").filter(_.matches("^[a-zA-Z0-9 ]+$")).fold("")((a, b) => a + " " + b).trim,
        "sentimentType" -> sentimentScore(col.getText).toString
      )
    }

    //val tweets = englishTweets.map(gson.toJson(_))
    //tweets.saveAsTextFiles("hdfs://localhost:9000/usr/sparkApp/test/")

    streamContext.start()
    streamContext.awaitTermination()
  }
}
I am not sure where I went wrong. Another way to go about this is to use a case class. Is there a good example I can follow?
Update
The result of the map function that is saved into HDFS looks like this:
((userID,1345940003533312000),(user,rei_yang),(createdAt,2021-01-04T03:47:57Z),(text,just posted a photo singapore),(sentimentType,NEUTRAL))
Is there a way to convert it into a DataFrame?
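One possible approach, sketched here rather than taken from the post: map each status into the twitterCaseClass defined above and convert every micro-batch to a DataFrame inside foreachRDD, then append it as CSV. Here tweetRows is just an illustrative name, spark.implicits._ must be in scope for toDF, and the HDFS path is the one from the commented-out line.

    val tweetRows = englishTweets.map { status =>
      twitterCaseClass(
        userID = status.getId.toString,
        user = status.getUser.getScreenName,
        createdAt = status.getCreatedAt.toInstant.toString,
        text = status.getText,
        sentimentType = sentimentScore(status.getText).toString
      )
    }

    tweetRows.foreachRDD { rdd =>
      if (!rdd.isEmpty()) {
        val df = rdd.toDF() // columns follow the case class fields
        df.write
          .mode("append")
          .option("header", "true")
          .csv("hdfs://localhost:9000/usr/sparkApp/test/")
      }
    }

Because every field of the case class is already a String, the resulting DataFrame writes cleanly as CSV.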
I have a utility function written in Scala to read Parquet files from an S3 bucket. Could someone help me write unit test cases for this?
Below is the function which needs to be tested.
def readParquetFile(spark: SparkSession,
                    locationPath: String): DataFrame = {
  spark.read
    .parquet(locationPath)
}
So far I have created a SparkSession whose master is local:
import org.apache.spark.sql.SparkSession

trait SparkSessionTestWrapper {
  lazy val spark: SparkSession = {
    SparkSession.builder().master("local").appName("Test App").getOrCreate()
  }
}
I am stuck with testing the function. Here is the code where I am stuck. The question is: should I create a real Parquet file and load it to see if the DataFrame is getting created, or is there a mocking framework to test this?
import com.github.mrpowers.spark.fast.tests.DataFrameComparer
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.scalatest.FunSpec

class ReadAndWriteSpec extends FunSpec with DataFrameComparer with SparkSessionTestWrapper {

  import spark.implicits._

  it("reads a parquet file and creates a dataframe") {

  }
}
Edit:
Based on the inputs from the comments, I came up with the code below, but I am still not able to understand how this can be leveraged.
I am using https://github.com/findify/s3mock
class ReadAndWriteSpec extends FunSpec with DataFrameComparer with SparkSessionTestWrapper {

  import spark.implicits._

  it("reads a parquet file and creates a dataframe") {
    val api = S3Mock(port = 8001, dir = "/tmp/s3")
    api.start

    val endpoint = new EndpointConfiguration("http://localhost:8001", "us-west-2")
    val client = AmazonS3ClientBuilder
      .standard
      .withPathStyleAccessEnabled(true)
      .withEndpointConfiguration(endpoint)
      .withCredentials(new AWSStaticCredentialsProvider(new AnonymousAWSCredentials()))
      .build

    /** Use it as usual. */
    client.createBucket("foo")
    client.putObject("foo", "bar", "baz")
    val url = client.getUrl("foo", "bar")
    println(url.getFile())

    val df = ReadAndWrite.readParquetFile(spark, url.getPath())
    df.printSchema()
  }
}
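For what it is worth, here is a sketch of how the mock could be leveraged, under the assumptions that hadoop-aws (the s3a filesystem) is on the test classpath and that the bucket and fixture names below are only illustrative: point Spark's s3a configuration at the local endpoint, write a small Parquet fixture into the mock bucket, and read it back through the function under test.

    // spark.implicits._ is already imported at the top of the spec
    val hadoopConf = spark.sparkContext.hadoopConfiguration
    hadoopConf.set("fs.s3a.endpoint", "http://localhost:8001")
    hadoopConf.set("fs.s3a.path.style.access", "true")
    hadoopConf.set("fs.s3a.connection.ssl.enabled", "false")
    hadoopConf.set("fs.s3a.access.key", "dummy") // s3mock does not check credentials
    hadoopConf.set("fs.s3a.secret.key", "dummy")

    client.createBucket("foo")

    // write a tiny fixture through s3a, then read it back with the utility under test
    Seq(("Michael", 29)).toDF("name", "age").write.parquet("s3a://foo/people")

    val df = ReadAndWrite.readParquetFile(spark, "s3a://foo/people")
    assert(df.count() == 1)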
I figured it out and kept it simple. I was able to complete some basic test cases.
Here is my solution. I hope this will help someone.
import org.apache.spark.sql
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.scalatest.{BeforeAndAfterEach, FunSuite}
import loaders.ReadAndWrite

class ReadAndWriteTestSpec extends FunSuite with BeforeAndAfterEach {

  private val master = "local"
  private val appName = "ReadAndWrite-Test"
  var spark: SparkSession = _

  override def beforeEach(): Unit = {
    spark = new sql.SparkSession.Builder().appName(appName).master(master).getOrCreate()
  }

  test("creating data frame from parquet file") {
    val sparkSession = spark
    import sparkSession.implicits._

    val peopleDF = spark.read.json("src/test/resources/people.json")
    peopleDF.write.mode(SaveMode.Overwrite).parquet("src/test/resources/people.parquet")

    val df = ReadAndWrite.readParquetFile(sparkSession, "src/test/resources/people.parquet")
    df.printSchema()
  }

  test("creating data frame from text file") {
    val sparkSession = spark
    import sparkSession.implicits._

    val peopleDF = ReadAndWrite.readTextfileToDataSet(sparkSession, "src/test/resources/people.txt").map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()
    peopleDF.printSchema()
  }

  test("counts should match with number of records in a text file") {
    val sparkSession = spark
    import sparkSession.implicits._

    val peopleDF = ReadAndWrite.readTextfileToDataSet(sparkSession, "src/test/resources/people.txt").map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()
    peopleDF.printSchema()
    assert(peopleDF.count() == 3)
  }

  test("data should match with sample records in a text file") {
    val sparkSession = spark
    import sparkSession.implicits._

    val peopleDF = ReadAndWrite.readTextfileToDataSet(sparkSession, "src/test/resources/people.txt").map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()
    peopleDF.printSchema()
    assert(peopleDF.take(1)(0)(0).equals("Michael"))
  }

  test("Write a data frame as csv file") {
    val sparkSession = spark
    import sparkSession.implicits._

    val peopleDF = ReadAndWrite.readTextfileToDataSet(sparkSession, "src/test/resources/people.txt").map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()

    //header argument should be boolean to the user to avoid confusions
    ReadAndWrite.writeDataframeAsCSV(peopleDF, "src/test/resources/out.csv", java.time.Instant.now().toString, ",", "true")
  }

  override def afterEach(): Unit = {
    spark.stop()
  }
}

case class Person(name: String, age: Int)
I am trying out the frameless library for Scala and getting "No implicits found for parameters i0: TypedColumn.Exists". If you can help me resolve it, that would be awesome.
I am using spark 2.4.0 and frameless 0.8.0.
Following is my code:
import org.apache.spark.sql.SparkSession
import frameless.TypedDataset

object TestSpark {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("Spark Test")
      .getOrCreate

    import spark.implicits._

    val empDS = spark.read
      .option("header", true)
      .option("delimiter", ",")
      .csv("emp.csv")
      .as[Emp]

    val empTyDS = TypedDataset.create(empDS)
    import frameless.syntax._
    empTyDS.show(10, false).run

    val deptCol = empTyDS('dept) //Get the error here.
  }
}
The case class for the code is:
case class Emp(
  name: String,
  dept: String,
  manager: String,
  salary: String
)
This is to use evaluation (Eval or something similar) of an expression (a string) to write a DataFrame to a CSV file in Scala.
import org.apache.spark.sql.{SaveMode, SparkSession, SQLContext, Row, DataFrame, Column}
import scala.reflect.runtime.universe._
import scala.tools.reflect.ToolBox
import scala.reflect.runtime.currentMirror

val df = Seq(("a", "b", "c"), ("a1", "b1", "c1")).toDF("A", "B", "C")

val df_write = """df.coalesce(1).write.option("delimiter", "\u001F").csv("file:///var/tmp/test")"""

// This is one of my failed attempts - I have tried using the interpreter as well (code not shown here).
val toolbox = runtimeMirror(getClass.getClassLoader).mkToolBox()
toolbox.eval(toolbox.parse(df_write))
Errors are:
object coalesce is not a member of package df ....
Shiva, try the code below. The issue was that the object's variables were not in scope for the toolbox, and therefore it was unable to evaluate the expression.
package com.mansoor.test

import org.apache.spark.sql.{DataFrame, SparkSession}

object Driver extends App {

  def evalCode[T](code: String): T = {
    import scala.tools.reflect.ToolBox
    import scala.reflect.runtime.{currentMirror => m}
    val toolbox = m.mkToolBox()
    toolbox.eval(toolbox.parse(code)).asInstanceOf[T]
  }

  val sparkSession: SparkSession = SparkSession.builder().appName("Test")
    .master("local[2]")
    .getOrCreate()

  import sparkSession.implicits._

  val df: DataFrame = Seq(("a", "b", "c"), ("a1", "b1", "c1")).toDF("A", "B", "C")

  val df_write =
    s"""
       |import com.mansoor.test.Driver._
       |
       |df.coalesce(1).write.option("delimiter", "\u001F").csv("file:///var/tmp/test")
     """.stripMargin

  evalCode[Unit](df_write)

  sparkSession.sparkContext.stop()
}
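A brief usage sketch (hypothetical, placed inside the same Driver object) to show that the same trick works for expressions that return a value; it is the import of com.mansoor.test.Driver._ inside the evaluated string that brings df into the toolbox's scope:

    val rowCount = evalCode[Long](
      s"""
         |import com.mansoor.test.Driver._
         |
         |df.count()
       """.stripMargin)
    println(s"row count: $rowCount")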