Why does starting one Kafka Streams instance block the other from getting started? - scala

I am working with the new kafka-scala-streams API recently open-sourced by Lightbend, and I am trying to run two streams. What happens is that the two of them don't run simultaneously, and I am not getting the desired output.
package in.internity
import java.util.Properties
import java.util.concurrent.TimeUnit
import com.lightbend.kafka.scala.streams.{KStreamS, StreamsBuilderS}
import org.apache.kafka.common.serialization.Serdes
import org.apache.kafka.streams.kstream.Produced
import org.apache.kafka.streams.{StreamsConfig, _}
import org.json4s.DefaultFormats
import org.json4s.native.JsonMethods.parse
import org.json4s.native.Serialization.write
import scala.util.Try
/**
* @author Shivansh <shiv4nsh@gmail.com>
* @since 8/1/18
*/
object Boot extends App {
implicit val formats: DefaultFormats.type = DefaultFormats
val config: Properties = {
val p = new Properties()
p.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-application")
p.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")
p.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass)
p.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass)
p
}
val streams1 = wordSplit("lines", "wordCount")
val streams2 = readAndWriteJson("person", "personName")
private def wordSplit(intopic: String, outTopic: String) = {
val builder = new StreamsBuilderS()
val produced = Produced.`with`(Serdes.String(), Serdes.String())
val textLines: KStreamS[String, String] = builder.stream(intopic)
val data: KStreamS[String, String] = textLines.flatMapValues(value => value.toLowerCase.split("\\W+").toIterable)
data.to(outTopic, produced)
val streams: KafkaStreams = new KafkaStreams(builder.build(), config)
streams
}
private def readAndWriteJson(intopic: String, outTopic: String) = {
val builder = new StreamsBuilderS()
val produced = Produced.`with`(Serdes.String(), Serdes.String())
val textLines: KStreamS[String, String] = builder.stream(intopic)
val data: KStreamS[String, String] = textLines.mapValues(value => {
val person = Try(parse(value).extract[Person]).toOption
println("1::", person)
val personNameAndEmail = person.map(a => PersonNameAndEmail(a.name, a.email))
println("2::", personNameAndEmail)
write(personNameAndEmail)
})
data.to(outTopic, produced)
val streams: KafkaStreams = new KafkaStreams(builder.build(), config)
streams
}
streams1.start()
streams2.start()
Runtime.getRuntime.addShutdownHook(new Thread(() => {
streams2.close(10, TimeUnit.SECONDS)
streams1.close(10, TimeUnit.SECONDS)
}))
}
case class Person(name: String, age: Int, email: String)
case class PersonNameAndEmail(name: String, email: String)
When I run this and produce messages on the topic person, they do not get consumed.
But when I change the ordering of their start, i.e.
streams2.start()
streams1.start()
it works fine. So why does starting one stream block the other? Can't we run multiple streams at the same time?

Got it working. It seems I was building a separate topology and KafkaStreams instance inside each method (silly of me :P); building both flows on a single StreamsBuilderS and starting one KafkaStreams instance fixed it.
Working code:
package in.internity
import java.util.Properties
import java.util.concurrent.TimeUnit
import com.lightbend.kafka.scala.streams.{KStreamS, StreamsBuilderS}
import org.apache.kafka.common.serialization.Serdes
import org.apache.kafka.streams.kstream.Produced
import org.apache.kafka.streams.{StreamsConfig, _}
import org.json4s.DefaultFormats
import org.json4s.native.JsonMethods.parse
import org.json4s.native.Serialization.write
import scala.util.Try
/**
* @author Shivansh <shiv4nsh@gmail.com>
* @since 8/1/18
*/
object Boot extends App {
implicit val formats: DefaultFormats.type = DefaultFormats
val config: Properties = {
val p = new Properties()
p.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-application")
p.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")
p.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass)
p.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass)
p
}
val builder = new StreamsBuilderS()
private def wordSplit(intopic: String, outTopic: String) = {
val produced = Produced.`with`(Serdes.String(), Serdes.String())
val textLines: KStreamS[String, String] = builder.stream(intopic)
val data: KStreamS[String, String] = textLines.flatMapValues(value => value.toLowerCase.split("\\W+").toIterable)
data.to(outTopic, produced)
}
private def readAndWriteJson(intopic: String, outTopic: String) = {
val produced = Produced.`with`(Serdes.String(), Serdes.String())
val textLines: KStreamS[String, String] = builder.stream(intopic)
val data: KStreamS[String, String] = textLines.mapValues(value => {
val person = Try(parse(value).extract[Person]).toOption
println("1::", person)
val personNameAndEmail = person.map(a => PersonNameAndEmail(a.name, a.email))
println("2::", personNameAndEmail)
write(personNameAndEmail)
})
data.to(outTopic, produced)
}
wordSplit("lines", "wordCount")
readAndWriteJson("person", "personName")
val streams: KafkaStreams = new KafkaStreams(builder.build(), config)
streams.start()
Runtime.getRuntime.addShutdownHook(new Thread(() => {
streams.close(10, TimeUnit.SECONDS)
}))
}
case class Person(name: String, age: Int, email: String)
case class PersonNameAndEmail(name: String, email: String)
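
As an aside, if you really do want two separate KafkaStreams instances running in the same JVM, one way to keep them from interfering with each other is to give each topology its own application.id, so each instance gets its own consumer group and state directory. A minimal sketch (not the original code; wordSplitTopology and readAndWriteJsonTopology are hypothetical variants of the methods above that only return builder.build() instead of creating a KafkaStreams):

// Sketch: one Properties per topology, differing only in application.id.
def configFor(appId: String): Properties = {
  val p = new Properties()
  p.put(StreamsConfig.APPLICATION_ID_CONFIG, appId)
  p.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")
  p.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass)
  p.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass)
  p
}

// Each instance now has its own consumer group and state directory.
val streams1 = new KafkaStreams(wordSplitTopology("lines", "wordCount"), configFor("wordcount-application"))
val streams2 = new KafkaStreams(readAndWriteJsonTopology("person", "personName"), configFor("person-application"))
streams1.start()
streams2.start()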

Related

Run Object notebook in Databricks

I am trying to execute this code on Databricks in Scala. Everything is in an object; then I have a case class, a def main and other def functions.
I tried working with "package cells", but I got: Warning: classes defined within packages cannot be redefined without a cluster restart. Compilation successful.
Removing the object didn't work either.
package x.y.z
import java.text.SimpleDateFormat
import java.util.Date
import java.io.File
import java.io.PrintWriter
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
object Meter {
val dateFormat = new SimpleDateFormat("yyyyMMdd")
case class Forc (cust: String, Num: String, date: String, results: Double)
def main(args: Array[String]): Unit = {
val inputFile = "sv" //
val outputFile = "ssv" //
val fileSystem = getFileSystem(inputFile)
val inputData = readLines(fileSystem, inputFile, skipHeader = true).toSeq
val filtinp = inputData.filter(x => x.nonEmpty)
.map(x => Results(x(6), x(5), x(0), x(8).toDouble))
def getTimestamp(date: String): Long = dateFormat.parse(date).getTime
def getDate(timeStampInMills: Long): String = {
val time = new Date(timeStampInMills)
dateFormat.format(time)
}
def getFileSystem(path: String): FileSystem = {
val hconf = new Configuration()
new Path(path).getFileSystem(hconf)
}
override def next(): String = {
val result = line
line = inputData.readLine()
if (line == null) {
inputData.close()
}
result
}
}
}

Scala, Sangria and Scalatra

We have a Scala application using Scalatra (http://scalatra.org/) as our web framework. I'm wondering if there are any good (or just any) resources out there on how to implement a GraphQL endpoint using Sangria (http://sangria-graphql.org/) and Scalatra?
I'm new to Scala and would appreciate any help to get started on this.
There aren't any that I know of, but since Scalatra uses json4s you would use sangria's json4s marshaller.
Otherwise, if it helps to make sangria clearer, here's a Scala worksheet with a very simplistic example based on Play + sangria; in this case you would just need to swap the JSON library.
The DB is mocked (perhaps you use Slick?) and so is the HTTP server, but it's a simple case of swapping in the function definitions. A rough sketch of wiring this into a Scalatra servlet follows after the worksheet.
import sangria.ast.Document
import sangria.execution.{ErrorWithResolver, Executor, QueryAnalysisError}
import sangria.macros.derive.{ObjectTypeDescription, ObjectTypeName, deriveObjectType}
import sangria.parser.{QueryParser, SyntaxError}
import sangria.renderer.SchemaRenderer
import sangria.schema.{Argument, Field, IntType, ListType, ObjectType, OptionInputType, Schema, fields}
import scala.concurrent.Await
import scala.concurrent.duration._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future
import scala.util.{Failure, Success}
// replace with another json lib
// eg https://github.com/sangria-graphql/sangria-json4s-jackson
import play.api.libs.json._
import sangria.marshalling.playJson._
case class User(name: String, age: Int, phone: Option[String])
class FakeDb {
class UsersTable {
def getUsers(limit: Int): List[User] = {
// this would come from the db
List(
User("john smith", 23, None),
User("Anne Schwazenbach", 45, Some("2134556"))
)
}
}
val usersRepo = new UsersTable
}
object MySchema {
val limitArg: Argument[Int] = Argument("first", OptionInputType(IntType),
description = s"Returns the first n elements from the list.",
defaultValue = 10)
implicit val UsersType: ObjectType[FakeDb, User] = {
deriveObjectType[FakeDb, User](
ObjectTypeName("Users"),
ObjectTypeDescription("Users in the system")
)
}
private val Query: ObjectType[FakeDb, Unit] = ObjectType[FakeDb, Unit](
"Query", fields[FakeDb, Unit](
Field("users", ListType(UsersType),
arguments = limitArg :: Nil,
resolve = c => c.ctx.usersRepo.getUsers(c.arg(limitArg))
)
))
val theSchema: Schema[FakeDb, Unit] = Schema(Query)
}
object HttpServer {
def get(): String = {
// Http GET
SchemaRenderer.renderSchema(MySchema.theSchema)
}
def post(query: String): Future[JsValue] = {
// Http POST
val variables = None
val operation = None
QueryParser.parse(query) match {
case Success(q) => executeQuery(q, variables, operation)
case Failure(error: SyntaxError) => Future.successful(Json.obj("error" -> error.getMessage))
case Failure(error: Throwable) => Future.successful(Json.obj("error" -> error.getMessage))
}
}
private def executeQuery(queryAst: Document, vars: Option[JsValue], operation: Option[String]): Future[JsValue] = {
val schema: Schema[FakeDb, Unit] = MySchema.theSchema
Executor.execute[FakeDb, Unit, JsValue](schema, queryAst, new FakeDb,
operationName = operation,
variables=vars.getOrElse(Json.obj()))
.map((d: JsValue) => d)
.recover {
case error: QueryAnalysisError ⇒ Json.obj("error" -> error.getMessage)
case error: ErrorWithResolver ⇒ Json.obj("error" -> error.getMessage)
}
}
}
HttpServer.get()
val myquery = """
{
users {
name
}
}
"""
val res: JsValue = Await.result(HttpServer.post(myquery), 10.seconds)
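
To tie this back to Scalatra specifically, the same executeQuery plumbing can sit behind a plain ScalatraServlet route. A rough, untested sketch; GraphQLExecutor.run is a hypothetical stand-in for the executeQuery logic above, returning the JSON response as a String:

import org.scalatra.ScalatraServlet
import scala.concurrent.Await
import scala.concurrent.duration._

// Hypothetical servlet exposing a GraphQL endpoint over Scalatra.
class GraphQLServlet extends ScalatraServlet {
  post("/graphql") {
    contentType = "application/json"
    // In a real endpoint, parse the JSON body and pull out "query",
    // "variables" and "operationName"; here the raw body is treated as the query.
    val query = request.body
    // Blocking keeps the sketch short; Scalatra's FutureSupport would let you
    // complete the response asynchronously instead.
    Await.result(GraphQLExecutor.run(query), 10.seconds)
  }
}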

Where to define the implicit val formats in a microservice with Scala, akka-http, Json4s?

I am building a microservice using Scala, akka-http and json4s, and I am using case classes for my business bean classes. My case classes contain Scala Enumerations (I have researched Scala enums and am aware of the limitations, but they perfectly suit my current use cases).
With this background, when I tried to create a service, I could not work out where to define the
implicit val formats = DefaultFormats + new EnumNameSerializer(_ProfessionClaType) + new EnumNameSerializer(_LinkRelType)
Following is my rough scala class structure:
import akka.actor.ActorSystem
import akka.event.{Logging, LoggingAdapter}
import akka.http.scaladsl.Http
import akka.http.scaladsl.marshalling.ToResponseMarshallable
import akka.http.scaladsl.model.StatusCodes._
import akka.http.scaladsl.server.Directives
import akka.stream.ActorMaterializer
import com.typesafe.config.{Config, ConfigFactory}
import de.heikoseeberger.akkahttpjson4s.Json4sSupport
import gremlin.scala.ScalaVertex
import org.apache.tinkerpop.gremlin.structure.util.detached.DetachedVertex
import org.json4s.ext.EnumNameSerializer
import org.json4s.{DefaultFormats, jackson}
import scala.concurrent.{ExecutionContext, ExecutionContextExecutor, Future}
trait Service {
implicit val system: ActorSystem
implicit def executor: ExecutionContextExecutor
implicit val materializer: ActorMaterializer
// ?? implicit val formats = DefaultFormats + new EnumNameSerializer(_ProfessionClaType) + new EnumNameSerializer(_LinkRelType)
lazy val client = TitanConnection.cluster.connect();
def config: Config
val logger: LoggingAdapter
def addPerson(p: Person): Future[Either[String, Person]] = {
try {
//Code to add person to database
val resultPerson = Person(propertyMap)
Future.successful(Right(resultPerson))
} catch {
case e :Exception => e.printStackTrace
Future.failed(new Exception("Person can't be created"))
}
}
def fetchPerson(pId: String): Future[Either[Error, Person]] = {
try {
//Code to fetch person object from database
result = results.one() //fetches the first record from results, if it exists
//Following is code for validation of the data
if(result.isNull)
Future.successful(Left(new Error("FAILED","","",s"""There is no person with requested personID:$pId""")))
else {
//Code to retrieve person and return the same as object
Future.successful(Right(resultPerson))
}
} catch {
case e :Exception => e.printStackTrace
Future.successful(Left(new Error("FATAL","","",s"""There is some exception while retrieving person with requested personID:$pId""")))
}
}
/** This is the list of operations possible on the person business entity.
*
* #param ec
* #return
*/
def routes(implicit ec: ExecutionContext) = {
import Directives._
import Json4sSupport._
implicit val serialization = jackson.Serialization // or native.Serialization
implicit val system = ActorSystem()
implicit val materializer = ActorMaterializer()
implicit val formats = DefaultFormats + new EnumNameSerializer(_ProfessionClaType) + new EnumNameSerializer(_LinkRelType)
logRequestResult("PersonMicroservice") {
pathPrefix("person") {
(get & path(Segment)) { personId =>
implicit val formats = DefaultFormats + new EnumNameSerializer(_ProfessionClaType) + new EnumNameSerializer(_LinkRelType)
complete {
fetchPerson(personId).map[ToResponseMarshallable] {
case Right(personFormat) => personFormat
case Left(errorMessage) => BadRequest -> errorMessage
}
}
}~
post { entity(as[Person]) { entity =>
implicit val formats = DefaultFormats + new EnumNameSerializer(_ProfessionClaType) + new EnumNameSerializer(_LinkRelType)
complete {
addPerson(entity).map[ToResponseMarshallable] {
case Right(personFormat) => personFormat
case Left(error) => BadRequest -> error
}
}
}
}
}
}
}
}
/** Microservice for "Person" business entity. This microservice shall handle the basic CRUD related operations
* of Person business entity.
*/
object PersonMicroService extends App with Service {
override implicit val system = ActorSystem()
override implicit val executor = system.dispatcher
override implicit val materializer = ActorMaterializer()
override val config = ConfigFactory.load()
override val logger = Logging(system, getClass)
Http().bindAndHandle(routes , config.getString("http.interface"), config.getInt("http.port"))
}
I also have a ScalaTest spec for unit testing the service, and there I am also forced to define the formats in each test case. Not sure if I am doing this right; hence seeking expert advice.
Following is my test spec:
package in.niftyride.unit
import akka.event.{Logging, LoggingAdapter}
import akka.http.scaladsl.server.Directives
import de.heikoseeberger.akkahttpjson4s.Json4sSupport
import org.json4s.{jackson, DefaultFormats}
import org.json4s.ext.EnumNameSerializer
import akka.http.scaladsl.model.ContentTypes._
import akka.http.scaladsl.model.StatusCodes._
import Json4sSupport._
class PersonEndpointSpec extends UnitServiceSpec{
override def testConfigSource = "akka.loglevel = WARNING"
override def config = testConfig
override lazy val client = TestDatabaseProvider.cluster.connect;
val logger: LoggingAdapter = Logging(system, this.getClass)
System.setProperty(DatabaseUtils.SERVER_HASH_TEXT, DatabaseUtils.RANDOM_HASH)
"Service" should "respond to single id query" in {
implicit val serialization = jackson.Serialization // or native.Serialization
implicit val formats = DefaultFormats + new EnumNameSerializer(_ProfessionClaType) + new EnumNameSerializer(_LinkRelType)
Get(s"/person/${PersonTestData.personId1}") ~> routes ~> check {
status shouldBe OK
contentType shouldBe `application/json`
responseAs[Person] shouldBe PersonTestData.minData1
}
}
it should "be possible to create a person with valid data through POST" in {
implicit val serialization = jackson.Serialization // or native.Serialization
implicit val formats = DefaultFormats + new EnumNameSerializer(_ProfessionClaType) + new EnumNameSerializer(_LinkRelType)
Post(s"/person", PersonTestData.minDataEmptyPersonId1) ~> routes ~> check {
status shouldBe OK
contentType shouldBe `application/json`
responseAs[Person] shouldBe PersonTestData.minData1
}
}
}
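
One common way to avoid repeating that formats line is to define it once in a small trait and mix it in wherever json4s serialization happens, including the test spec. A rough sketch, assuming the enum types _ProfessionClaType and _LinkRelType are in scope (JsonFormats is a hypothetical name, not part of the original code):

import org.json4s.{jackson, DefaultFormats, Formats, Serialization}
import org.json4s.ext.EnumNameSerializer

// Hypothetical trait collecting the json4s configuration in one place.
trait JsonFormats {
  implicit val serialization: Serialization = jackson.Serialization
  implicit val formats: Formats =
    DefaultFormats +
      new EnumNameSerializer(_ProfessionClaType) +
      new EnumNameSerializer(_LinkRelType)
}

// The service trait and the test spec then just mix it in:
// trait Service extends JsonFormats { ... }
// class PersonEndpointSpec extends UnitServiceSpec with JsonFormats { ... }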

Scala: case class runtime error

This demo ran OK. But when I moved it into a function of another class (my former project) and called the function, it failed.
object DFMain {
case class Person(name: String, age: Double, t:String)
def main (args: Array[String]): Unit = {
val sc = new SparkContext("local", "Scala Word Count")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val bsonRDD = sc.parallelize(("foo",1,"female")::
("bar",2,"male")::
("baz",-1,"female")::Nil)
.map(tuple=>{
var bson = new BasicBSONObject()
bson.put("name","bfoo")
bson.put("value",0.1)
bson.put("t","female")
(null,bson)
})
val tDf = bsonRDD.map(_._2)
.map(f=>Person(f.get("name").toString,
f.get("value").toString.toDouble,
f.get("t").toString)).toDF()
tDf.limit(1).show()
}
}
'MySQLDao.insertIntoMySQL()' compile error
object MySQLDao {
private val sc= new SparkContext("local", "Scala Word Count")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
case class Person(name: String, age: Double, t:String)
def insertIntoMySQL(): Unit ={
val bsonRDD = sc.parallelize(("foo",1,"female")::
("bar",2,"male")::
("baz",-1,"female")::Nil)
.map(tuple=>{
val bson = new BasicBSONObject()
bson.put("name","bfoo")
bson.put("value",0.1)
bson.put("t","female")
(null,bson)
})
val tDf = bsonRDD.map(_._2).map( f=> Person(f.get("name").toString,
f.get("value").toString.toDouble,
f.get("t").toString)).toDF()
tDf.limit(1).show()
}
}
Well, when I call 'MySQLDao.insertIntoMySQL()' I get the error:
value typedProductIterator is not a member of object scala.runtime.ScalaRunTime
case class Person(name: String, age: Double, t:String)
I suppose the case class isn't visible to the closure inside the map function. Move it to the package level:
case class Person(name: String, age: Double, t:String)
object MySQLDao {
...
}

How to run IntelliJ IDEA (Spark and Scala) code in Apache Spark in terminal mode

I have written my code in IntelliJ IDEA (Scala and Spark) and I want to run this code on Linux using the terminal. How can I do this? I can't access graphical mode on this Linux server.
For example, this is code similar to my code:
package LDAv1
import java.io._
import org.apache.commons.math3.special._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd._
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.immutable._
import scala.collection.mutable._
object SparkLDA {
implicit def arrayToVector(s: Array[Int]) = new Vector(s)
implicit def vectorToArray(s: Vector) = s.data
def main(args: Array[String]){
var numTopics:Int=3
var inPath:String="data/MR.dat"
var outPath:String="data"
var master:String="local[*]"
var iter:Int=100
var mem="4g"
var debug=false
lda(inPath,outPath,master,numTopics,(50/numTopics),0.1,iter,debug,mem);
}
def lda(pathToFileIn:String,pathToFileOut:String,URL:String,numTopics:Int,alpha:Double,beta:Double,numIter:Int,deBug:Boolean,mem:String){
val (conf,sc)=initializeSpark(URL,deBug,mem)
var(documents,dictionary,topicCount)=importText(pathToFileIn,numTopics,sc)
val ll:MutableList[Double]= MutableList[Double]()
for(i<-0 to numIter){
var (doc,dict,tC)=step(sc,documents,numTopics,dictionary,topicCount,alpha,beta)
documents=doc
dictionary=dict
topicCount=tC
if(deBug)ll+=logLikelihood(dictionary,topicCount,alpha,beta)
System.gc()
}
saveAll(documents,ll,sc,dictionary,topicCount,pathToFileOut,deBug)
}
def initializeSpark(URL:String,debug:Boolean,mem:String)={
if(!debug)Logger.getLogger("org").setLevel(Level.WARN)
val conf = new SparkConf()
.setAppName("Spark LDA")
.setMaster(URL)
.set("spark.executor.memory", "4g")
val sc = new SparkContext(conf)
(conf,sc)
}
def importText(pathToFileIn:String,numTopics:Int,sc:SparkContext)={
val stopWords =sc.broadcast(List[String]("a","able","about","above","according","accordingly","across","actually","after"));
val textFile=sc.textFile(pathToFileIn,4)
val documents=textFile.map(line=>{
val topicDistrib=new Array[Int](numTopics)
val lineCleaned=line.replaceAll("[^A-Za-z ]","").toLowerCase()
(lineCleaned.split(" ").map(word=>{
var topic:Int=0
var wrd:String=""
if(word.length()>1&&(!stopWords.value.contains(word))){
topic =Integer.parseInt(Math.round(Math.random()*(numTopics-1)).toString)
topicDistrib.increment(topic)
wrd=word
}
(wrd,topic)
})
,topicDistrib)
})
val(dictionary,topicCount)=updateVariables(documents,numTopics)
(documents,dictionary,topicCount)
}
def updateVariables(documents:RDD[(Array[(String, Int)], Array[Int])],numTopics:Int)={
val dictionary=documents.flatMap(line=>line._1).map(tuple=>{
var value:Array[Int]=new Array[Int](numTopics)
if(!tuple._1.equals("")){
value(tuple._2)+=1
}
(tuple._1,value)
}).reduceByKey((a:Array[Int],b)=>{
for(i<-0 to a.length-1){
a(i)+=b(i)
}
(a)
}).collect().toMap
println(dictionary.take(2))
val topicCount:Array[Int]=new Array[Int](numTopics)
dictionary.foreach(t=>topicCount.add(t._2))
(dictionary,topicCount)
}
def step(sc:SparkContext,documents:RDD[(Array[(String, Int)], Array[Int])],numTopics:Int,dict:scala.collection.immutable.Map[String, Array[Int]],tC: Array[Int],alpha:Double,beta:Double)={
val dictionary=sc.broadcast(dict)
val topicCount=sc.broadcast(tC)
val v=dict.size
val doc=documents.map(tuple=>{
val topicDistrib=tuple._2
val line=tuple._1
val lineupDated=line.map(t=>{
val word=t._1
var top=t._2
if(!t._1.equals("")){
topicDistrib.decrement(top)
top=gibbsSampling(topicDistrib,dictionary.value(word),topicCount.value,alpha,beta,v)
topicDistrib.increment(top)
}
(word,top)
})
(lineupDated,topicDistrib)
})
val(dicti,topC)=updateVariables(doc,numTopics)
(doc:RDD[(Array[(String, Int)], Array[Int])],dicti,topC)
}
def saveAll(documents: RDD[(Array[(String, Int)], Array[Int])],LogLikelihood:MutableList[Double],sc: SparkContext, dictionary: scala.collection.immutable.Map[String, Array[Int]], topicCount: Array[Int],path: String,deBug:Boolean){
removeAll(path)
saveDocuments(documents,path)
saveDictionary(sc,dictionary,path)
saveTopicCount(sc,topicCount,path)
if(deBug)saveLogLikelihood (sc,LogLikelihood, path)
}
def saveDocuments (documents: RDD[(Array[(String, Int)], Array[Int])], path: String) {
removeAll(path+"/documentsTopics")
documents.map {
case (topicAssign, topicDist) =>
var topicDistNorm:Array[Double] = topicDist.normalize()
val probabilities = topicDistNorm.toList.mkString(", ")
(probabilities)
}.saveAsTextFile(path+"/documentsTopics")
}
def saveDictionary(sc: SparkContext, dictionary: scala.collection.immutable.Map[String, Array[Int]], path: String) {
removeAll(path+"/wordsTopics")
val dictionaryArray = dictionary.toArray
val temp = sc.parallelize(dictionaryArray).map {
case (word, topics) =>
var topicsNorm:Array[Double] = topics.normalize()
val topArray = topicsNorm.toList.mkString(", ")
val wordCount = topics.sumAll()
val temp2 = List(word, wordCount, topArray).mkString("\t")
(temp2)
}
temp.saveAsTextFile(path+"/wordsTopics")
}
def saveTopicCount (sc: SparkContext, topicCount: Array[Int], path: String) {
removeAll(path+"/topicCount")
val temp = sc.parallelize(topicCount).map {
case (count) =>
(count)
}
temp.saveAsTextFile(path+"/topicCount")
}
def saveLogLikelihood (sc: SparkContext,LogLikelihood:MutableList[Double], path: String) {
removeAll(path+"/logLikelihood")
val temp = sc.parallelize(LogLikelihood).map {
case (count) =>
(count)
}
temp.saveAsTextFile(path+"/logLikelihood")
}
def gibbsSampling(docTopicDistrib:Array[Int],wordTopicDistrib:Array[Int],topicCount:Array[Int],alpha:Double,beta:Double,v:Int):Int={
val numTopic=docTopicDistrib.length
var ro:Array[Double]=new Array[Double](numTopic)
ro(0)=(docTopicDistrib(0)+alpha)*(wordTopicDistrib(0)+beta)/(topicCount(0)+v*beta)
for(i<-1 to numTopic-1){
ro(i)=ro(i-1)+(docTopicDistrib(i)+alpha)*(wordTopicDistrib(i)+beta)/(topicCount(i)+v*beta)
}
var x=Math.random()*ro(numTopic-1)
var i:Int=0
while(x>ro(i)&&i<numTopic-1)i+=1
return i
}
def logLikelihood(dictionary: scala.collection.immutable.Map[String, Array[Int]],topicCount:Array[Int],alpha:Double,beta:Double):Double={
val V:Int=dictionary.size
val numTopics:Int=topicCount.length-1
var logLikelihood:Double=numTopics*(Gamma.logGamma(V*beta)-V*Gamma.logGamma(beta))
for (i<-0 to numTopics){
var sum:Double=0
dictionary.foreach{t=> sum+=Gamma.logGamma(t._2(i)+beta)
}
logLikelihood+=sum-Gamma.logGamma(topicCount(i)+V*beta)
}
(logLikelihood)
}
def removeAll(pathDir: String) = {
def delete(file: File): Array[(String, Boolean)] = {
Option(file.listFiles).map(_.flatMap(f => delete(f))).getOrElse(Array()) :+ (file.getPath -> file.delete)
}
}
}
And it has one Scala class:
package LDAv1
class Vector(val vect:Array[Int]) {
var data:Array[Int]=vect;
def this(size:Int){
this(new Array[Int](size));
}
def increment(index:Int){
data(index)+=1;
}
def decrement(index:Int){
data(index)-=1;
}
def printIt(){
print("[")
for(i<-0 to data.length-1)print(data(i)+",");
print("]\n")
}
def forEach(callback:(Int) => Unit)={
for(i<-0 to data.length-1)callback(data(i));
}
def add(a:Array[Int]){
for(i<-0 to data.length-1)data(i)+=a(i);
}
def sumAll():Int={
var sum:Int=0;
for(i<-0 to data.length-1)sum+=data(i);
(sum)
}
def normalize():Array[Double]={
var temp:Array[Double] = new Array[Double](data.length);
var sum:Double=0;
for(i<-0 to data.length-1)sum+=data(i);
if (sum>0) {
for(i<-0 to data.length-1) {
temp(i) = data(i).toDouble/sum
};
}
(temp)
}
}
You have to create a fat JAR with all dependencies included; then you can submit your application with Spark's built-in spark-submit script (see the sketch below).
https://spark.apache.org/docs/latest/submitting-applications.html
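
In practice that means adding an assembly plugin (for example sbt-assembly) to the build, building the fat JAR, and pointing spark-submit at the JAR and the main class. A rough sketch, assuming an sbt build for the SparkLDA object above; versions, paths and the JAR name are illustrative:

// build.sbt (illustrative versions; spark-core is "provided" because the
// cluster supplies it at runtime)
name := "SparkLDA"

scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.3.0" % "provided",
  "org.apache.commons" % "commons-math3" % "3.6.1"
)

// project/plugins.sbt
// addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")

// Then, from a terminal on the Linux server (no GUI needed):
//   sbt assembly
//   spark-submit --class LDAv1.SparkLDA --master "local[*]" \
//     target/scala-2.11/SparkLDA-assembly-0.1.jar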