GraphX not working properly in Spark / Scala

I am trying to create a GraphX object in Apache Spark/Scala but it doesn't seem to be working for some reason. I have attached an example input file; the actual program code is:
package SGraph

import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.sql._
import org.apache.log4j._
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx._

object GooglePlusGraph {

  /** Our main function where the action happens */
  def main(args: Array[String]) {

    // Set the log level to only print errors
    Logger.getLogger("org").setLevel(Level.ERROR)

    // Create a SparkContext using every core of the local machine
    val sc = new SparkContext("local[*]", "GooglePlusGraphX")

    val lines = sc.textFile("../Example.txt")

    val ratings = lines.map(x => x.toString().split(":")(0))

    val verts = ratings.map(line => (line.toLong, line))

    val edges = lines.flatMap(makeEdges)

    val default = "Nobody"
    val graph = Graph(verts, edges, default).cache()

    graph.degrees.join(verts).take(10).foreach(println)
  }

  def makeEdges(line: String): List[Edge[Int]] = {
    import scala.collection.mutable.ListBuffer
    var edges = new ListBuffer[Edge[Int]]()
    val fields = line.split(",").flatMap(a => a.split(":"))
    val origin = fields(0)
    for (x <- 1 to (fields.length - 1)) {
      // Our attribute field is unused, but in other graphs could
      // be used to keep track of physical distances etc.
      edges += Edge(origin.toLong, fields(x).toLong, 0)
    }
    return edges.toList
  }
}
The first error I get is the following:
16/12/19 01:28:33 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 3)
java.lang.NumberFormatException: For input string: "935750800736168978117"
Thanks for any help!

It's the same issue as in your other question:
Cannot convert string to a long in scala
The given number has 21 digits, which is beyond the maximum number of digits a Long can hold (19 digits).
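The thread does not include a fix, but since GraphX vertex IDs must be 64-bit Longs, one hedged workaround (an assumption, not taken from the answer) is to derive a Long ID deterministically from the original string, e.g. by hashing, and keep the original string as the vertex attribute. A sketch reusing lines and ratings from the code above:

// Sketch only: build 64-bit vertex IDs by hashing the original string IDs.
// Collisions are possible in principle, but unlikely with 64 bits.
import scala.util.hashing.MurmurHash3
import org.apache.spark.graphx.{Edge, Graph}

def toVertexId(id: String): Long =
  (MurmurHash3.stringHash(id, 0).toLong << 32) |
    (MurmurHash3.stringHash(id, 1).toLong & 0xffffffffL)

// Same shape as the original code, with the hash applied to every ID.
val verts = ratings.map(line => (toVertexId(line), line))

def makeEdges(line: String): List[Edge[Int]] = {
  val fields = line.split(",").flatMap(a => a.split(":"))
  val origin = fields(0)
  fields.drop(1).map(f => Edge(toVertexId(origin), toVertexId(f), 0)).toList
}

val edges = lines.flatMap(makeEdges)
val graph = Graph(verts, edges, "Nobody").cache()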

Related

KMeansModel.clusterCenters returns NULL

I am using AWS Glue to execute KMeans clustering on my dataset. I wish to find not only the cluster labels but also the cluster centers, and I am failing to find the latter.
In the code below, model.clusterCenters returns NULL. The KMeans clustering works fine and returns the cluster label, i.e. the clusterInstance variable.
import java.time.LocalDate
import java.time.format.DateTimeFormatter

import com.amazonaws.services.glue.util.JsonOptions
import com.amazonaws.services.glue.{DynamicFrame, GlueContext}
import org.apache.spark.SparkContext
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object Clustering {

  case class ObjectDay(realnumber: Double, bnumber: Double, blockednumber: Double,
                       creationdate: String, fname: String, uniqueid: Long, registrationdate: String,
                       plusnumber: Double, cvalue: Double, hvalue: Double)

  case class ClusterInfo(instance: Int, centers: String)

  def main(args: Array[String]): Unit = {

    val sc: SparkContext = new SparkContext()
    val glueContext: GlueContext = new GlueContext(sc)
    val spark: SparkSession = glueContext.getSparkSession
    import spark.implicits._

    // write your code here - start

    // Data Catalog: database and table name
    val dbName = "dbname"
    val tblName = "raw"
    val sqlText = "SELECT <columns removed> FROM viewname WHERE `creation_date` ="

    // S3 location for output
    val outputDir = "s3://blucket/path/"

    // Read data into a DynamicFrame using the Data Catalog metadata
    val rawDyf: DynamicFrame = glueContext.getCatalogSource(database = dbName, tableName = tblName).getDynamicFrame()

    // get only single day data with only numbers
    // Spark SQL on a Spark dataframe
    val numberDf = rawDyf.toDF()
    numberDf.createOrReplaceTempView("viewname")

    def getDataViaSql(runDate: LocalDate): RDD[ObjectDay] = {
      val data = spark.sql(s"${sqlText} '${runDate.toString}'")
      data.as[ObjectDay].rdd
    }

    def getDenseVector(rddnumbers: RDD[ObjectDay]): RDD[linalg.Vector] = {
      rddnumbers.map(s => Vectors.dense(Array(s.realnumber, s.bnumber, s.blockednumber))).cache()
    }

    def getClusters(numbers: RDD[linalg.Vector]): RDD[ClusterInfo] = {
      // Trains a k-means model
      val model: KMeansModel = KMeans.train(numbers, 2, 20)
      val centers: Array[linalg.Vector] = model.clusterCenters

      // put together unique_ids with cluster predictions
      val clusters: RDD[Int] = model.predict(numbers)

      clusters.map { clusterInstance =>
        ClusterInfo(clusterInstance.toInt, centers(clusterInstance).toJson)
      }
    }

    def combineDataAndClusterInstances(rddnumbers: RDD[ObjectDay], clusterCenters: RDD[ClusterInfo]): DataFrame = {
      val numbersWithCluster = rddnumbers.zip(clusterCenters)
      numbersWithCluster.map(
        x =>
          (x._1.realnumber, x._1.bnumber, x._1.blockednumber, x._1.creationdate, x._1.fname,
            x._1.uniqueid, x._1.registrationdate, x._1.plusnumber, x._1.cvalue, x._1.hvalue,
            x._2.instance, x._2.centers)
      )
      .toDF("realnumber", "bnumber", "blockednumber", "creationdate",
        "fname", "uniqueid", "registrationdate", "plusnumber", "cvalue", "hvalue",
        "clusterInstance", "clusterCenter")
    }

    def process(runDate: LocalDate): DataFrame = {
      val rddnumbers = getDataViaSql(runDate)
      val dense = getDenseVector(rddnumbers)
      val clusterCenters = getClusters(dense)
      combineDataAndClusterInstances(rddnumbers, clusterCenters)
    }

    val startdt = LocalDate.parse("2018-01-01", DateTimeFormatter.ofPattern("yyyy-MM-dd"))

    val dfByDates = (0 to 240)
      .map(days => startdt.plusDays(days))
      .map(process(_))

    val result = dfByDates.tail.fold(dfByDates.head)((accDF, newDF) => accDF.union(newDF))

    val output = DynamicFrame(result, glueContext).withName(name = "prediction")

    // write your code here - end

    glueContext.getSinkWithFormat(connectionType = "s3",
      options = JsonOptions(Map("path" -> outputDir)), format = "csv").writeDynamicFrame(output)
  }
}
I can successfully find the cluster centres using the Python sklearn library on the same data.
UPDATED: showing the complete Scala code which runs as a Glue job. Also, I am not getting any error while running the job; I just don't get any cluster centres.
What am I missing?
Nevermind, it is generating cluster centres.
I didn't see the S3 output files until now.
I was running a Glue Crawler and looking at the results in AWS Athena.
The crawler created a struct or array column datatype for the clusterCenter column, and Athena failed to parse and read the JSON stored as a string in the CSV output.
Sorry to bother.
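A hedged workaround (an assumption, not part of the original resolution): if the centre has to survive a CSV round-trip through the crawler and Athena, it may be simpler to store each coordinate as its own numeric column instead of a JSON string, so no nested type is ever inferred. A sketch reusing the KMeans setup from the question (the downstream toDF column names would need to be adjusted accordingly):

// Sketch only: each centre coordinate becomes a plain Double column,
// so the crawler never has to infer a struct/array type from JSON text in a CSV.
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg
import org.apache.spark.rdd.RDD

case class ClusterInfoFlat(instance: Int, c0: Double, c1: Double, c2: Double)

def getClustersFlat(numbers: RDD[linalg.Vector]): RDD[ClusterInfoFlat] = {
  val model = KMeans.train(numbers, 2, 20)
  val centers = model.clusterCenters
  model.predict(numbers).map { i =>
    val c = centers(i)
    ClusterInfoFlat(i, c(0), c(1), c(2))
  }
}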

Change datatype on scala Spark Streaming

In that course, in the Module 3 hands-on lab ... there's an example (Spark Fundamentals 1) that I'm using to learn Scala and Spark.
https://courses.cognitiveclass.ai/courses/course-v1:BigDataUniversity+BD0211EN+2016/courseware/14ec4166bc9b4a3a9592b7960f4a5401/b0c736193c834b01b3c1c5bd4ce2d8a8/
I tried to modify the Streaming part in order to calculate the moving average as the stream comes in. I haven't figured out how to do it, but right now I'm facing the problem that I don't know how to change the datatype.
import org.apache.log4j.Logger
import org.apache.log4j.Level
Logger.getLogger("org").setLevel(Level.OFF)
Logger.getLogger("akka").setLevel(Level.OFF)
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
val ssc = new StreamingContext(sc,Seconds(1))
val lines = ssc.socketTextStream("localhost",7777)
import scala.collection.mutable.Queue
var ints = Queue[Double]()
def movingAverage(values: Queue[Double], period: Int): List[Double] = {
  val first = (values take period).sum / period
  val subtract = values map (_ / period)
  val add = subtract drop period
  val addAndSubtract = add zip subtract map Function.tupled(_ - _)
  val res = (addAndSubtract.foldLeft(first :: List.fill(period - 1)(0.0)) {
    (acc, add) => (add + acc.head) :: acc
  }).reverse
  res
}
val pass = lines.map(_.split(",")).
map(pass=>(pass(7).toDouble))
pass.getClass
class org.apache.spark.streaming.dstream.MappedDStream
ints ++= List(pass).to[Queue]
Name: Compile Error
Message: <console>:41: error: type mismatch;
found : scala.collection.mutable.Queue[org.apache.spark.streaming.dstream.DStream[Double]]
required: scala.collection.TraversableOnce[Double]
ints ++= List(pass).to[Queue]
^
StackTrace:
val pass2 = movingAverage(ints,2)
pass2.print()
ints.dequeue
ssc.start()
ssc.awaitTermination()
How do I get the streaming data from pass into ints as a queue of Doubles?
After a lot of asking around, this is what finally worked:
val p1 = new scala.collection.mutable.Queue[Double]

pass.foreachRDD( rdd => {
  for (item <- rdd.collect().toArray) {
    p1 += item
    println(item + " - " + movingAverage(p1, 2).last)
  }
})
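As a hedged alternative (not from the thread, reusing pass and the streaming imports from the question): Spark Streaming can keep the window itself instead of collecting each batch into a mutable Queue on the driver. A minimal sketch computing the average over the last 10 seconds, recomputed every batch:

// (value, count) pairs reduced over a sliding window, then averaged.
val avg = pass
  .map(v => (v, 1L))
  .window(Seconds(10), Seconds(1))
  .reduce { case ((s1, c1), (s2, c2)) => (s1 + s2, c1 + c2) }
  .map { case (sum, count) => sum / count }

avg.print()

This keeps the state inside Spark rather than on the driver, so it also works when the stream is processed across multiple executors.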

Best way to convert online csv to dataframe scala

I am trying to figure out the most efficient way to load this online CSV file into a DataFrame in Scala.
To save you a download, the CSV file used in the code looks like this:
"Symbol","Name","LastSale","MarketCap","ADR
TSO","IPOyear","Sector","Industry","Summary Quote"
"DDD","3D Systems Corporation","18.09","2058834640.41","n/a","n/a","Technology","Computer Software: Prepackaged Software","http://www.nasdaq.com/symbol/ddd"
"MMM","3M Company","211.68","126423673447.68","n/a","n/a","Health Care","Medical/Dental Instruments","http://www.nasdaq.com/symbol/mmm"
....
From my research, I start by downloading the CSV and placing it into a ListBuffer (you can't do this with a plain List because it's immutable):
import scala.collection.mutable.ListBuffer

val sc = new SparkContext(conf)

var stockInfoNYSE_ListBuffer = new ListBuffer[java.lang.String]()

import scala.io.Source

val bufferedSource =
  Source.fromURL("http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NYSE&render=download")

for (line <- bufferedSource.getLines) {
  val cols = line.split(",").map(_.trim)
  stockInfoNYSE_ListBuffer += s"${cols(0)},${cols(1)},${cols(2)},${cols(3)},${cols(4)},${cols(5)},${cols(6)},${cols(7)},${cols(8)}"
}
bufferedSource.close

val stockInfoNYSE_List = stockInfoNYSE_ListBuffer.toList
So we have a list. You can basically get each value like this:
// SYMBOL : stockInfoNYSE_List(1).split(",")(0)
// COMPANY NAME : stockInfoNYSE_List(1).split(",")(1)
// IPOYear : stockInfoNYSE_List(1).split(",")(5)
// Sector : stockInfoNYSE_List(1).split(",")(6)
// Industry : stockInfoNYSE_List(1).split(",")(7)
Here is where I get stuck: how do I get this into a DataFrame? Below are the wrong approaches I have taken. (I didn't put all the values in just yet; this was a simple test.)
case class StockMap(Symbol: String, Name: String)
val caseClassDS = Seq(StockMap(stockInfoNYSE_List(1).split(",")(0),
StockMap(stockInfoNYSE_List(1).split(",")(1))).toDS()
caseClassDS.show()
The problem with the approach above: I can only figure out how to add one sequence (row) by hard-coding it. I want every row in the list.
My second failed attempt:
val sqlContext= new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val test = stockInfoNYSE_List.toDF
This will just give you the array, and I want to divide up the values.
Array(["Symbol","Name","LastSale","MarketCap","ADR TSO","IPOyear","Sector","Industry","Summary Quote"], ["DDD","3D Systems Corporation","18.09","2058834640.41","n/a","n/a","Technology","Computer Software: Prepackaged Software","http://www.nasdaq.com/symbol/ddd"], ["MMM","3M Company","211.68","126423673447.68","n/a","n/a","Health Care","Medical/Dental Instruments","http://www.nasdaq.com/symbol/mmm"],.......
case class TestClass(Symbol: String, Name: String, LastSale: String, MarketCap: String, ADR_TSO: String, IPOyear: String, Sector: String, Industry: String, Summary_Quote: String)
defined class TestClass

var stockDF = stockInfoNYSE_ListBuffer.drop(1)

val demoDS = stockDF.map(line => {
  val fields = line.replace("\"", "").split(",")
  TestClass(fields(0), fields(1), fields(2), fields(3), fields(4), fields(5), fields(6), fields(7), fields(8))
})
scala> demoDS.toDS.show
+------+--------------------+--------+---------------+-------------+-------+-----------------+--------------------+--------------------+
|Symbol| Name|LastSale| MarketCap| ADR_TSO|IPOyear| Sector| Industry| Summary_Quote|
+------+--------------------+--------+---------------+-------------+-------+-----------------+--------------------+--------------------+
| DDD|3D Systems Corpor...| 18.09| 2058834640.41| n/a| n/a| Technology|Computer Software...|http://www.nasdaq...|
| MMM| 3M Company| 211.68|126423673447.68| n/a| n/a| Health Care|Medical/Dental In...|http://www.nasdaq...|
In case anyone is trying to get this example working, here is the code using the above solution:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

import scala.collection.mutable.ListBuffer
import sqlContext.implicits._

var stockInfoNYSE_ListBuffer = new ListBuffer[java.lang.String]()

import scala.io.Source

val bufferedSource =
  Source.fromURL("http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NYSE&render=download")

for (line <- bufferedSource.getLines) {
  val cols = line.split(",").map(_.trim)
  stockInfoNYSE_ListBuffer += s"${cols(0)},${cols(1)},${cols(2)},${cols(3)},${cols(4)},${cols(5)},${cols(6)},${cols(7)},${cols(8)}"
}
bufferedSource.close

case class TestClass(Symbol: String, Name: String, LastSale: String, MarketCap: String, ADR_TSO: String, IPOyear: String, Sector: String, Industry: String, Summary_Quote: String)

var stockDF = stockInfoNYSE_ListBuffer.drop(1)

val demoDS = stockDF.map(line => {
  val fields = line.replace("\"", "").split(",")
  TestClass(fields(0), fields(1), fields(2), fields(3), fields(4), fields(5), fields(6), fields(7), fields(8))
})

demoDS.toDF().show
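For what it's worth, a hedged alternative sketch (assuming Spark 2.x, an existing spark session, and a writable local path such as /tmp/nyse.csv, none of which come from the thread): save the download to a file and let Spark's built-in CSV reader handle the header and the quoted fields, since a bare split(",") would mis-handle any quoted field that itself contains a comma.

import java.nio.file.{Files, Paths}
import scala.io.Source

// Download once on the driver; spark.read cannot fetch HTTP URLs directly.
val csvText = Source.fromURL(
  "http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NYSE&render=download").mkString
Files.write(Paths.get("/tmp/nyse.csv"), csvText.getBytes("UTF-8"))

// Built-in CSV source: the header row becomes the column names,
// and quoted fields are parsed correctly.
val stocksDF = spark.read
  .option("header", "true")
  .csv("/tmp/nyse.csv")

stocksDF.show(5)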

Tokenization by Stanford parser is slow?

Question summary: tokenization by the Stanford parser is slow on my local machine, but unreasonably much faster on Spark. Why?
I'm using the Stanford CoreNLP tool to tokenize sentences.
My script in Scala is like this:
import java.util.Properties
import scala.collection.JavaConversions._
import scala.collection.immutable.ListMap
import scala.io.Source
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation
import edu.stanford.nlp.ling.CoreLabel
import edu.stanford.nlp.pipeline.Annotation
import edu.stanford.nlp.pipeline.StanfordCoreNLP
val properties = new Properties()
val coreNLP = new StanfordCoreNLP(properties)
def tokenize(s: String) = {
  properties.setProperty("annotators", "tokenize")
  val annotation = new Annotation(s)
  coreNLP.annotate(annotation)
  annotation.get(classOf[TokensAnnotation]).map(_.value.toString)
}
tokenize("Here is my sentence.")
One call of the tokenize function takes roughly (at least) 0.1 sec.
This is very slow because I have 3 million sentences.
(3M * 0.1 sec = 300K sec ≈ 83 hours)
As an alternative approach, I have applied the tokenizer on Spark
(with four worker machines).
import java.util.List
import java.util.Properties
import scala.collection.JavaConversions._
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation
import edu.stanford.nlp.ling.CoreLabel
import edu.stanford.nlp.pipeline.Annotation
import edu.stanford.nlp.pipeline.StanfordCoreNLP
val file = sc.textFile("hdfs:///myfiles")
def tokenize(s: String) = {
  val properties = new Properties()
  properties.setProperty("annotators", "tokenize")
  val coreNLP = new StanfordCoreNLP(properties)
  val annotation = new Annotation(s)
  coreNLP.annotate(annotation)
  annotation.get(classOf[TokensAnnotation]).map(_.toString)
}

def normalizeToken(t: String) = {
  val ts = t.toLowerCase
  val num = "[0-9]+[,0-9]*".r
  ts match {
    case num() => "NUMBER"
    case _ => ts
  }
}
val tokens = file.map(tokenize(_))
val tokenList = tokens.flatMap(_.map(normalizeToken))
val wordCount = tokenList.map((_,1)).reduceByKey(_ + _).sortBy(_._2, false)
wordCount.saveAsTextFile("wordcount")
This script finishes tokenization and word counting of 3 million sentences in just 5 minutes!
And the results seem reasonable.
Why is this so fast? Or rather, why is the first Scala script so slow?
The problem with your first approach is that you set the annotators property after you initialize the StanfordCoreNLP object. Therefore CoreNLP is initialized with the default list of annotators, which includes the part-of-speech tagger and the parser; these are orders of magnitude slower than the tokenizer.
To fix this, simply move the line
properties.setProperty("annotators", "tokenize")
before the line
val coreNLP = new StanfordCoreNLP(properties)
This should even be slightly faster than your second approach, as you don't have to reinitialize CoreNLP for each sentence.
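For reference, a sketch of the first script with exactly that fix applied (the property is set before the pipeline is constructed, so only the tokenizer is loaded):

import java.util.Properties
import scala.collection.JavaConversions._
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation
import edu.stanford.nlp.pipeline.{Annotation, StanfordCoreNLP}

// Configure the annotator list *before* building the pipeline.
val properties = new Properties()
properties.setProperty("annotators", "tokenize")
val coreNLP = new StanfordCoreNLP(properties)

def tokenize(s: String) = {
  val annotation = new Annotation(s)
  coreNLP.annotate(annotation)
  annotation.get(classOf[TokensAnnotation]).map(_.value.toString)
}

tokenize("Here is my sentence.")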

Scala Futures not executing when sending to Kinesis (Amazon AWS)

I am attempting to asynchronously write messages to Amazon Kinesis using Scala Futures so I can load test an application.
This code works, and I can see data moving down my pipeline as well as the output printing to the console.
import com.amazonaws.services.kinesis.AmazonKinesisClient
import java.nio.CharBuffer
import java.nio.charset.Charset
import java.text.SimpleDateFormat
import java.util.{Date, TimeZone}

object KinesisDummyDataProducer extends App {
  val kinesis = new AmazonKinesisClient(PipelineConfig.awsCredentials)
  println("Connected")

  lazy val encoder = Charset.forName("UTF-8").newEncoder()
  lazy val tz = TimeZone.getTimeZone("UTC")
  lazy val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'Z")
  df.setTimeZone(tz)

  (1 to args(0).toInt).map(int => send(int)).map(msg => println(msg))

  private def send(int: Int) = {
    val msg = "{\"event_name\":\"test\",\"timestamp\":\"%s\",\"int\":%s}".format(df.format(new Date()), int.toString)
    val bytes = encoder.encode(CharBuffer.wrap(msg))
    encoder.flush(bytes)
    kinesis.putRecord("PrimaryEventStream", bytes, "123")
    msg
  }
}
This code works with Scala Futures.
import scala.concurrent.future
import scala.concurrent.ExecutionContext.Implicits.global
def doIt(x: Int) = {Thread.sleep(1000); x + 1}
(1 to 10).map(x => future{doIt(x)}).map(y => y.onSuccess({case x => println(x)}))
You'll note that the syntax is nearly identical for mapping over the sequences. However, the following does not work (i.e., it neither prints to the console nor sends data down my pipeline).
import com.amazonaws.services.kinesis.AmazonKinesisClient
import java.nio.CharBuffer
import java.nio.charset.Charset
import java.text.SimpleDateFormat
import java.util.{Date, TimeZone}
import scala.concurrent.future
import scala.concurrent.ExecutionContext.Implicits.global

object KinesisDummyDataProducer extends App {
  val kinesis = new AmazonKinesisClient(PipelineConfig.awsCredentials)
  println("Connected")

  lazy val encoder = Charset.forName("UTF-8").newEncoder()
  lazy val tz = TimeZone.getTimeZone("UTC")
  lazy val df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'Z")
  df.setTimeZone(tz)

  (1 to args(0).toInt).map(int => future { send(int) }).map(f => f.onSuccess({ case msg => println(msg) }))

  private def send(int: Int) = {
    val msg = "{\"event_name\":\"test\",\"timestamp\":\"%s\",\"int\":%s}".format(df.format(new Date()), int.toString)
    val bytes = encoder.encode(CharBuffer.wrap(msg))
    encoder.flush(bytes)
    kinesis.putRecord("PrimaryEventStream", bytes, "123")
    msg
  }
}
Some more notes about this project. I am using Maven to do the build (from the command line), and running all of the above code (also from the command line) works just dandy.
My question is: why, when using the same syntax, does my send function appear not to execute?
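No answer is included in the thread. One hedged guess (an assumption, not confirmed here): the global ExecutionContext runs futures on daemon threads, so once the App body finishes, the JVM can exit before any of the futures or their onSuccess callbacks have run. A minimal sketch, reusing send and args from the object above, that keeps the main thread alive until every send has completed:

import scala.concurrent.{Await, Future}
import scala.concurrent.duration._
import scala.concurrent.ExecutionContext.Implicits.global

// Start all the sends asynchronously...
val sends: Seq[Future[String]] =
  (1 to args(0).toInt).map(int => Future { send(int) })

// ...print each message as its future completes...
sends.foreach(_.foreach(println))

// ...and block so the JVM doesn't exit before the futures finish.
Await.result(Future.sequence(sends), 10.minutes)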