Scala and Spark - Logistic Regression - NullPointerException - scala

Hello everyone!
I have a problem with my Scala and Spark code. I am trying to implement a logistic regression model. For this I had to implement two UDF functions to collect my features. The problem is that every time I call the dataframe.show() function, I get an error.
I thought that maybe I had null values in my dataframe, so I tried to call dataframe.na.drop() to eliminate any null values.
The problem still exists, and it says that it failed to execute user defined function(anonfun$3: (array, array) => int).
Here is my whole code:
val sc = spark.sparkContext
val data = sc.textFile("resources/data/training_set.txt").map(line =>{
val fields = line.split(" ")
(fields(0),fields(1), fields(2).toInt)
})
import spark.implicits._
val trainingDF = data.toDF("srcId","dstId", "label")
val infoRDD = spark.read.option("header","false").option("inferSchema","true").format("csv").load("resources/data/node_information.csv")
val infoDF = infoRDD.toDF("srcId","year","title","authors","jurnal","abstract")
println("Showing linksDF sample...")
trainingDF.show(5)
println("Rows of linksDF: ",trainingDF.count())
println("Showing infoDF sample...")
infoDF.show(2)
println("Rows of infoDF: ",infoDF.count())
println("Joining linksDF and infoDF...")
var joinedDF = trainingDF.as("a").join(infoDF.as("b"),$"a.srcId" === $"b.srcId")
println(joinedDF.count())
joinedDF = joinedDF.select($"a.srcId",$"a.dstId",$"a.label",$"b.year",$"b.title",$"b.authors",$"b.jurnal",$"b.abstract")
println("Renameming joinedDF...")
joinedDF = joinedDF
.withColumnRenamed("srcId","id_from")
.withColumnRenamed("dstId","id_to")
.withColumnRenamed("year","year_from")
.withColumnRenamed("title","title_from")
.withColumnRenamed("authors","authors_from")
.withColumnRenamed("jurnal","jurnal_from")
.withColumnRenamed("abstract","abstract_from")
var infoDfRenamed = joinedDF
.withColumnRenamed("id_from","id_from")
.withColumnRenamed("id_to","id_to")
.withColumnRenamed("year_from","year_to")
.withColumnRenamed("title_from","title_to")
.withColumnRenamed("authors_from","authors_to")
.withColumnRenamed("jurnal_from","jurnal_to")
.withColumnRenamed("abstract_from","abstract_to").select("id_to","year_to","title_to","authors_to","jurnal_to","jurnal_to")
var finalDF = joinedDF.as(("a")).join(infoDF.as("b"),$"a.id_to" === $"b.srcId")
finalDF = finalDF
.withColumnRenamed("year","year_to")
.withColumnRenamed("title","title_to")
.withColumnRenamed("authors","authors_to")
.withColumnRenamed("jurnal","jurnal_to")
.withColumnRenamed("abstract","abstract_to")
println("Dropping unused columns from joinedDF...")
finalDF = finalDF.drop("srcId")
println("Spliting title_from column into words...")
finalDF = finalDF.withColumn("title_from_words", functions.split(col("title_from"), "\\s+"))
println("Spliting title_to column into words...")
finalDF = finalDF.withColumn("title_to_words", functions.split(col("title_to"), "\\s+"))
println("Spliting authors_from column into words...")
finalDF = finalDF.withColumn("authors_from_words", functions.split(col("authors_from"), "\\s+"))
println("Spliting authors_to column into words...")
finalDF = finalDF.withColumn("authors_to_words", functions.split(col("authors_to"), "\\s+"))
println("Removing stopwords from title_from column...")
val remover = new StopWordsRemover().setInputCol("title_from_words").setOutputCol("title_from_words_f")
finalDF = remover.transform(finalDF)
println("Removing stopwords from title_to column...")
val remover2 = new StopWordsRemover().setInputCol("title_to_words").setOutputCol("title_to_words_f")
finalDF = remover2.transform(finalDF)
println("Removing stopwords from authors_from column...")
val remover3 = new StopWordsRemover().setInputCol("authors_from_words").setOutputCol("authors_from_words_f")
finalDF = remover3.transform(finalDF)
println("Removing stopwords from authors_to column...")
val remover4 = new StopWordsRemover().setInputCol("authors_to_words").setOutputCol("authors_to_words_f")
finalDF = remover4.transform(finalDF)
finalDF.count()
val udf_title_overlap=udf(findNumberCommonWordsTitle(_:Seq[String],_:Seq[String]))
val udf_authors_overlap = udf(findNumberCommonAuthors(_:Seq[String], _:Seq[String]))
println("Getting the number of common words between title_from and title_to columns using UDF function...")
finalDF = finalDF.withColumn("titles_intersection",udf_title_overlap(finalDF("title_from_words"),finalDF("title_to_words")))
println("Getting the number of common words between authors_from and authors_to columns using UDF function...")
finalDF = finalDF.withColumn("authors_intersection",udf_authors_overlap(finalDF("aut
hors_from_words"),finalDF("authors_to_words")))
finalDF.count()
finalDF = finalDF.withColumn("time_dist",abs($"year_from" -
$"year_to"))
println("Show schema of finalDF:\n")
finalDF.printSchema()
println("Dropping unused columns from finalDF...\n")
val finalCollsDF = finalDF.select("label","titles_intersection",
"authors_intersection", "time_dist")
println("Printing schema for finalDF...\n")
finalCollsDF.printSchema()
println("Creating features coll from finalDF using
VectorAssembler...\n")
val assembler = new VectorAssembler()
.setInputCols(Array("titles_intersection", "authors_intersection",
"time_dist"))
.setOutputCol("features")
val output = assembler.transform(finalCollsDF)
println("Printing final schema before trainning...\n")
output.printSchema()
output.na.drop()
println("Splitting dataset into trainingData and testData...\n")
val Array(trainingData, testData) = output.randomSplit(Array(0.6, 0.4))
val lr = new LogisticRegression()
.setFeaturesCol("features")
.setLabelCol("label")
.setPredictionCol("prediction")
.setRawPredictionCol("prediction_raw")
.setMaxIter(10)
val lr_model = lr.fit(trainingData)
val lr_results = lr_model.transform(testData)
val evaluator = new BinaryClassificationEvaluator()
.setRawPredictionCol("prediction_raw")
.setLabelCol("label")
.setMetricName("areaUnderPR")
println("RESULTS FOR LOGISTIC REGRESSION:")
println(evaluator.evaluate(lr_results))
}
The two UDF functions which I am using are the following:
def findNumberCommonWordsTitle(title_from: Seq[String], title_to: Seq[String]) = {
val intersection = title_from.intersect(title_to)
intersection.length
}
def findNumberCommonAuthors(author_from: Seq[String], author_to: Seq[String]) = {
val intersection = author_from.intersect(author_to)
intersection.length
}
I also searched for null values manually using a foreach statement, but that did not work either. How can I fix this problem? Maybe there is another problem that I cannot find.
Thanks

Spark allows values to be null in dataframes, so calling the UDF on columns of such a dataframe may result in a NullPointerException if, for example, one or both arrays are null.
You should check for null values in your UDF and handle them accordingly, or make sure the columns in the dataframe cannot be null by setting null values to empty arrays.
def findNumberCommonAuthors(author_from:Seq[String], author_to:Seq[String])={
if(author_from == null || author_to == null) 0
else author_from.intersect(author_to).length
}
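If you prefer to keep the UDFs unchanged, the nulls can also be removed on the dataframe side before the UDFs run. A minimal sketch (assuming Spark 2.2+ for typedLit, and targeting the columns your UDFs are actually called on):
import org.apache.spark.sql.functions.{coalesce, col, typedLit}

// Replace null word arrays with empty arrays so the UDFs never receive null input.
val emptyWords = typedLit(Seq.empty[String])

finalDF = finalDF
  .withColumn("title_from_words",   coalesce(col("title_from_words"),   emptyWords))
  .withColumn("title_to_words",     coalesce(col("title_to_words"),     emptyWords))
  .withColumn("authors_from_words", coalesce(col("authors_from_words"), emptyWords))
  .withColumn("authors_to_words",   coalesce(col("authors_to_words"),   emptyWords))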

Related

How to use scala SparkML model to do multiple predictions in loop

I just started learning Spark 3.3 with Scala to do some regressions.
I was able to create, fit and test a model, but I got stuck trying to predict different subsets of a dataframe by looping through one of its columns to filter the data.
This is my principal function (I'm testing everything here):
The last line is what I'm trying to achieve, but I'm getting a "Task not serializable" error.
def test() = {
val data_key = "./data/EAD_HILIC_PFP_Com.csv"
val df = spark.read
.option("multiLine", true)
.option("header", "true")
.option("inferSchema", "true")
.csv(data_key)
val df2 = df.withColumn("PRECURSORMZ", $"PRECURSORMZ".cast("double").as("PRECURSORMZ"))
// to get list of samples from data
val labelDF = df2.select("mix_label").distinct
val (lrModel, test) = createModel(df2, labelDF)
println(s"Coefficients: ${lrModel.coefficients}")
println(s"Intercept: ${lrModel.intercept}")
println(s"Root Mean Squared Error (RMSE) = ${lrModel.summary.rootMeanSquaredError}")
println(s"R^2 = ${lrModel.summary.r2}")
val predictions = lrModel.transform(test)
val rmse = new RegressionEvaluator()
.setLabelCol("PRECURSORMZ")
.setPredictionCol("prediction")
.setMetricName("rmse")
val r2 = new RegressionEvaluator()
.setLabelCol("PRECURSORMZ")
.setPredictionCol("prediction")
.setMetricName("r2")
println(s"Root Mean Squared Error (RMSE) on test data (${labelDF.head.get(0)}) = " + rmse.evaluate(predictions))
println(s"R^2 on test data (${labelDF.head.get(0)}) = " + r2.evaluate(predictions))
labelDF.foreach(label => process(df2, label, lrModel, rmse, r2))
}
The createModel function does what its name says: it creates and fits a linear regression model:
def createModel(df2: DataFrame, labelDF: DataFrame): (LinearRegressionModel, DataFrame) = {
val first = labelDF.head.get(0)
//istds for single sample (mix_label)
val istdsDF = df2
.filter('mix_label === first)
.select($"PRECURSORMZ", $"Average_mz")
df2.show()
istdsDF.show()
val assembler = new VectorAssembler()
.setInputCols(istdsDF.drop("msms", "mix_label").columns)
.setOutputCol("features")
val (train, test) = train_test_split(istdsDF, assembler)
val lr = new LinearRegression()
.setLabelCol("PRECURSORMZ")
.setFeaturesCol("features")
val lrModel = lr.fit(train)
(lrModel, test)
}
def train_test_split(data: DataFrame, assembler: VectorAssembler): (DataFrame, DataFrame) = {
val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 30)
(assembler.transform(train), assembler.transform(test))
}
Thanks for any help
EDIT 1: adding process function:
def process(otherDF: DataFrame, otherLabel: String, lrModel: LinearRegressionModel): Unit = {
val assembler = new VectorAssembler()
.setInputCols(otherDF.drop("msms", "mix_label").columns)
.setOutputCol("features")
val others = assembler.transform(otherDF)
others.show()
val rmse = new RegressionEvaluator()
.setLabelCol("PRECURSORMZ")
.setPredictionCol("prediction")
.setMetricName("rmse")
val r2 = new RegressionEvaluator()
.setLabelCol("PRECURSORMZ")
.setPredictionCol("prediction")
.setMetricName("r2")
val otherPreds = lrModel.transform(others)
println(s"Root Mean Squared Error (RMSE) on other data (with label '${otherLabel}') = " + rmse.evaluate(otherPreds))
println(s"R^2 on other data (with label '${otherLabel}') = " + r2.evaluate(otherPreds))
}
StackTrace here: https://pastebin.com/xhUpqvnx
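A minimal sketch of one common workaround (not a verified fix, and assuming the error comes from calling process, which captures lrModel and the evaluators, inside labelDF.foreach, and that mix_label is a string column): collect the small list of distinct labels to the driver and loop over it there.
// labelDF only holds the distinct mix_label values, so collecting it is cheap.
// Looping on the driver keeps lrModel and the evaluators out of a Spark closure,
// which avoids the "Task not serializable" error.
val labels: Array[String] = labelDF.collect().map(_.getString(0))

labels.foreach { label =>
  process(df2, label, lrModel) // matches the 3-argument process shown in EDIT 1
}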

not able to add a spark scala column and add a tuple data

Below is the data which needs to be populated into a dataframe:
val columnNames = Array("ID", "Name","Age")
val d1 = Array("QWER","TOM","28")
val d2 = Array( "SPSRT","BENJI","45")
val d1zip = columnNames.zip(d1)
val d2zip1 = columnNames.zip(d2)
val data = Array(d1zip, d2zip1)
import org.apache.spark.sql.{DataFrame}
import org.apache.spark.sql.functions._
var df = generateDF1( columnNames , data)
Function to generate the dataframe
def generateDF1(colnames : Array[String], data : Array[Array[(String, String)]]) :
DataFrame = {
import spark.implicits._
var initDF = spark.emptyDataFrame
for ( itemlist <- data ) {
for ( item <- itemlist) {
initDF = initDF.withColumn(item._1, lit(item._2))
}
}
initDF
}
}
I am not able to see any data being added to the dataframe.
If d1 and d2 were actual tuples (and not Arrays as in your code) you could:
val columnNames = Array("ID", "Name","Age")
val d1 =("QWER","TOM","28")
val d2 =("SPSRT","BENJI","45")
val data=Seq(d1,d2).toDF(columnNames:_*)
What you are doing is adding the same columns multiple times and each time setting all values to a constant.
withColumn adds a column to every row of a dataframe, but since you start from an empty dataframe there are no rows, so you don't actually add anything.
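If you want to keep the zipped Array[(columnName, value)] structure from the question, one possible sketch (assuming all values stay strings) is to build Rows plus an explicit schema instead of calling withColumn on an empty dataframe:
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

def generateDF1(spark: SparkSession, colnames: Array[String],
                data: Array[Array[(String, String)]]): DataFrame = {
  // One StructField per column name, everything typed as string.
  val schema = StructType(colnames.map(c => StructField(c, StringType, nullable = true)))
  // Each inner array is already in column order (columnNames.zip(...)), so keep only the values.
  val rows = data.map(itemList => Row.fromSeq(itemList.map(_._2)))
  spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)
}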

Use dataframes for Decision tree classifier in spark with string fields

I have managed to get my Decision Tree classifier to work with the RDD-based API, but now I am trying to switch to the DataFrame-based API in Spark.
I have a dataset like this (but with many more fields):
country, destination, duration, label
Belgium, France, 10, 0
Bosnia, USA, 120, 1
Germany, Spain, 30, 0
First I load my CSV file into a dataframe:
val data = session.read
.format("org.apache.spark.csv")
.option("header", "true")
.csv("/home/Datasets/data/dataset.csv")
Then I transform string columns into numerical columns
val stringColumns = Array("country", "destination")
val index_transformers = stringColumns.map(
cname => new StringIndexer()
.setInputCol(cname)
.setOutputCol(s"${cname}_index")
)
Then I assemble all my features into one single vector using VectorAssembler, like this:
val assembler = new VectorAssembler()
.setInputCols(Array("country_index", "destination_index", "duration_index"))
.setOutputCol("features")
I split my data into training and test sets:
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
Then I create my DecisionTree Classifier
val dt = new DecisionTreeClassifier()
.setLabelCol("label")
.setFeaturesCol("features")
Then I use a pipeline to make all the transformations
val pipeline = new Pipeline()
.setStages(Array(index_transformers, assembler, dt))
I train my model and use it for predictions:
val model = pipeline.fit(trainingData)
val predictions = model.transform(testData)
But I get some errors I don't understand.
When I run my code like that, I have this error:
[error] found : Array[org.apache.spark.ml.feature.StringIndexer]
[error] required: org.apache.spark.ml.PipelineStage
[error] .setStages(Array(index_transformers, assembler,dt))
So what I did was add a pipeline right after the index_transformers val, and right before the assembler val:
val index_pipeline = new Pipeline().setStages(index_transformers)
val index_model = index_pipeline.fit(data)
val df_indexed = index_model.transform(data)
and I used my new df_indexed dataframe as the training and testing set, and I removed index_transformers from my pipeline, keeping assembler and dt:
val Array(trainingData, testData) = df_indexed.randomSplit(Array(0.7, 0.3))
val pipeline = new Pipeline()
.setStages(Array(assembler,dt))
And I get this error:
Exception in thread "main" java.lang.IllegalArgumentException: Data type StringType is not supported.
It basically says I am using VectorAssembler on a String column, even though I told it to work on df_indexed, which now has numerical *_index columns; it doesn't seem to use them in the VectorAssembler, and I just don't understand why.
Thank you
EDIT
Now I have almost managed to get it to work:
val data = session.read
.format("org.apache.spark.csv")
.option("header", "true")
.csv("/home/hvfd8529/Datasets/dataOINIS/dataset.csv")
val stringColumns = Array("country_index", "destination_index", "duration_index")
val stringColumns_index = stringColumns.map(c => s"${c}_index")
val index_transformers = stringColumns.map(
cname => new StringIndexer()
.setInputCol(cname)
.setOutputCol(s"${cname}_index")
)
val assembler = new VectorAssembler()
.setInputCols(stringColumns_index)
.setOutputCol("features")
val labelIndexer = new StringIndexer()
.setInputCol("label")
.setOutputCol("indexedLabel")
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3))
// Train a DecisionTree model.
val dt = new DecisionTreeClassifier()
.setLabelCol("indexedLabel")
.setFeaturesCol("features")
.setImpurity("entropy")
.setMaxBins(1000)
.setMaxDepth(15)
// Convert indexed labels back to original labels.
val labelConverter = new IndexToString()
.setInputCol("prediction")
.setOutputCol("predictedLabel")
.setLabels(labelIndexer.labels())
val stages = index_transformers :+ assembler :+ labelIndexer :+ dt :+ labelConverter
val pipeline = new Pipeline()
.setStages(stages)
// Train model. This also runs the indexers.
val model = pipeline.fit(trainingData)
// Make predictions.
val predictions = model.transform(testData)
// Select example rows to display.
predictions.select("predictedLabel", "label", "indexedFeatures").show(5)
// Select (prediction, true label) and compute test error.
val evaluator = new MulticlassClassificationEvaluator()
.setLabelCol("indexedLabel")
.setPredictionCol("prediction")
.setMetricName("accuracy")
val accuracy = evaluator.evaluate(predictions)
println("accuracy = " + accuracy)
val treeModel = model.stages(2).asInstanceOf[DecisionTreeClassificationModel]
println("Learned classification tree model:\n" + treeModel.toDebugString)
except that now I have an error saying this:
value labels is not a member of org.apache.spark.ml.feature.StringIndexer
and I don't understand why, as I am following the examples in the Spark docs :/
Should be:
val pipeline = new Pipeline()
.setStages(index_transformers ++ Array(assembler, dt): Array[PipelineStage])
What I did for my first problem:
val stages = index_transformers :+ assembler :+ labelIndexer :+ rf :+ labelConverter
val pipeline = new Pipeline()
.setStages(stages)
For my second issue with the label, I needed to use .fit(data), like this:
val labelIndexer = new StringIndexer()
.setInputCol("label_fraude")
.setOutputCol("indexedLabel")
.fit(data)
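A brief sketch of why the .fit(data) call matters (assuming the Spark 2.x ML API): StringIndexer is only an Estimator; it is the fitted StringIndexerModel that exposes the labels needed by IndexToString:
// labelIndexer is now a StringIndexerModel (the fitted estimator), and only the model
// exposes .labels, which is what IndexToString needs.
val labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(labelIndexer.labels)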

converting textFile to dataFrame dynamically

I am trying to convert input from a text file to a dataframe using a schema file which is read at run time.
My input text file looks like this:
John,23
Charles,34
The schema file looks like this:
name:string
age:integer
This is what I tried:
object DynamicSchema {
def main(args: Array[String]) {
val inputFile = args(0)
val schemaFile = args(1)
val schemaLines = Source.fromFile(schemaFile, "UTF-8").getLines().map(_.split(":")).map(l => l(0) -> l(1)).toMap
val spark = SparkSession.builder()
.master("local[*]")
.appName("Dynamic Schema")
.getOrCreate()
import spark.implicits._
val input = spark.sparkContext.textFile(args(0))
val schema = spark.sparkContext.broadcast(schemaLines)
val nameToType = {
Seq(IntegerType,StringType)
.map(t => t.typeName -> t).toMap
}
println(nameToType)
val fields = schema.value
.map(field => StructField(field._1, nameToType(field._2), nullable = true)).toSeq
val schemaStruct = StructType(fields)
val rowRDD = input
.map(_.split(","))
.map(attributes => Row.fromSeq(attributes))
val peopleDF = spark.createDataFrame(rowRDD, schemaStruct)
peopleDF.printSchema()
// Creates a temporary view using the DataFrame
peopleDF.createOrReplaceTempView("people")
// SQL can be run over a temporary view created using DataFrames
val results = spark.sql("SELECT name FROM people")
results.show()
}
}
Though printSchema gives the desired result, results.show errors out. I think the age field actually needs to be converted using toInt. Is there a way to achieve this when the schema is only available at runtime?
Replace
val input = spark.sparkContext.textFile(args(0))
with
val input = spark.read.schema(schemaStruct).csv(args(0))
and move it after the schema definition.
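Putting the answer together, a minimal sketch of the rearranged flow (using the same schemaStruct built from the schema file and the inputFile from args(0)):
// Build the schema first, then let Spark parse and cast the CSV columns,
// so "age" arrives as an integer instead of a raw string.
val schemaStruct = StructType(fields)

val peopleDF = spark.read
  .schema(schemaStruct)
  .csv(inputFile)

peopleDF.printSchema()
peopleDF.createOrReplaceTempView("people")
spark.sql("SELECT name FROM people").show()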

How to use feature extraction with DStream in Apache Spark

I have data that arrives from Kafka through a DStream. I want to perform feature extraction on it in order to obtain some keywords.
I do not want to wait for the arrival of all the data (as it is intended to be a continuous stream that potentially never ends), so I hope to perform the extraction in chunks; it doesn't matter to me if the accuracy suffers a bit.
So far I have put together something like this:
def extractKeywords(stream: DStream[Data]): Unit = {
val spark: SparkSession = SparkSession.builder.getOrCreate
val streamWithWords: DStream[(Data, Seq[String])] = stream map extractWordsFromData
val streamWithFeatures: DStream[(Data, Array[String])] = streamWithWords transform extractFeatures(spark) _
val streamWithKeywords: DStream[DataWithKeywords] = streamWithFeatures map addKeywordsToData
streamWithFeatures.print()
}
def extractFeatures(spark: SparkSession)
(rdd: RDD[(Data, Seq[String])]): RDD[(Data, Array[String])] = {
val df = spark.createDataFrame(rdd).toDF("data", "words")
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(numOfFeatures)
val rawFeatures = hashingTF.transform(df)
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(rawFeatures)
val rescaledData = idfModel.transform(rawFeatures)
import spark.implicits._
rescaledData.select("data", "features").as[(Data, Array[String])].rdd
}
However, I received java.lang.IllegalStateException: Haven't seen any document yet. I am not surprised, as I was just trying to scrape things together, and I understand that since I am not waiting for the arrival of some data, the generated model might be empty when I try to use it on the data.
What would be the right approach for this problem?
I used the advice from the comments and split the procedure into two runs:
one that calculates the IDF model and saves it to a file
def trainFeatures(idfModelFile: File, rdd: RDD[(String, Seq[String])]) = {
val session: SparkSession = SparkSession.builder.getOrCreate
val wordsDf = session.createDataFrame(rdd).toDF("data", "words")
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
val featurizedDf = hashingTF.transform(wordsDf)
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedDf)
idfModel.write.save(idfModelFile.getAbsolutePath)
}
one that reads the IDF model from the file and simply runs it on all incoming data
val idfModel = IDFModel.load(idfModelFile.getAbsolutePath)
val documentDf = spark.createDataFrame(rdd).toDF("update", "document")
val tokenizer = new Tokenizer().setInputCol("document").setOutputCol("words")
val wordsDf = tokenizer.transform(documentDf)
val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
val featurizedDf = hashingTF.transform(wordsDf)
val extractor = idfModel.setInputCol("rawFeatures").setOutputCol("features")
val featuresDf = extractor.transform(featurizedDf)
featuresDf.select("update", "features")