Run an object in a Databricks notebook - Scala

I am trying to execute this code on Databricks in Scala. Everything is in an object; inside it I have a case class, a def main, and other def functions.
I tried working with "package cells", but I get: Warning: classes defined within packages cannot be redefined without a cluster restart.
Compilation successful.
Removing the object didn't work either.
package x.y.z

import java.text.SimpleDateFormat
import java.util.Date
import java.io.{BufferedReader, InputStreamReader}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

object Meter {

  val dateFormat = new SimpleDateFormat("yyyyMMdd")

  case class Forc(cust: String, num: String, date: String, results: Double)

  def main(args: Array[String]): Unit = {
    val inputFile = "sv" //
    val outputFile = "ssv" //
    val fileSystem = getFileSystem(inputFile)
    val inputData = readLines(fileSystem, inputFile, skipHeader = true).toSeq
    val filtinp = inputData.filter(_.nonEmpty)
      .map(_.split(",")) // assuming a comma-separated input file
      .map(x => Forc(x(6), x(5), x(0), x(8).toDouble))
  }

  def getTimestamp(date: String): Long = dateFormat.parse(date).getTime

  def getDate(timeStampInMills: Long): String = {
    val time = new Date(timeStampInMills)
    dateFormat.format(time)
  }

  def getFileSystem(path: String): FileSystem = {
    val hconf = new Configuration()
    new Path(path).getFileSystem(hconf)
  }

  // Lazily reads the file line by line; next() closes the reader once the last line is consumed.
  def readLines(fileSystem: FileSystem, path: String, skipHeader: Boolean): Iterator[String] = {
    val reader = new BufferedReader(new InputStreamReader(fileSystem.open(new Path(path))))
    if (skipHeader) reader.readLine()
    new Iterator[String] {
      private var line: String = reader.readLine()
      override def hasNext: Boolean = line != null
      override def next(): String = {
        val result = line
        line = reader.readLine()
        if (line == null) {
          reader.close()
        }
        result
      }
    }
  }
}
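For what it's worth, the warning itself points at the limitation: anything declared in a package cell cannot be redefined without restarting the cluster. If the goal is just to iterate on the code interactively, the usual workaround is to drop the package x.y.z line and define the object in an ordinary cell, then invoke it from another cell. A minimal sketch, assuming the body is moved in unchanged; the Meter.main call below is only an illustration:

// Cell 1: no package declaration, so the object can be redefined without a cluster restart
object Meter {
  def main(args: Array[String]): Unit = {
    println("notebook logic goes here") // placeholder body for this sketch
  }
}

// Cell 2: invoke it
Meter.main(Array.empty[String])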

Related

Extracting GraphStageLogic into a custom stateful implementation and then passing it as a parameter to GraphStage gives an exception

Below is a simplified code snippet, where a GraphStageLogic implementation is passed to GraphStage as a constructor argument:
package akka.shapes.examples.notworking

import akka.actor.ActorSystem
import akka.stream._
import akka.stream.scaladsl.{GraphDSL, RunnableGraph, Sink, Source}
import akka.stream.stage.{GraphStage, GraphStageLogic, InHandler}

// This is the base graph stage, where GraphStageLogic and SinkShape are passed as constructor parameters
class BaseGraphStage[T](val shape: SinkShape[T], graphStageLogic: GraphStageLogic) extends GraphStage[SinkShape[T]] {
  override def createLogic(inheritedAttributes: Attributes): GraphStageLogic = graphStageLogic
}

// This is a sample stateful extension of GraphStageLogic that accepts the first ten elements only
class CountLogic(sinkShape: SinkShape[Int], maxValue: Int) extends GraphStageLogic(sinkShape) {
  var counter: Long = 0

  override def preStart(): Unit = {
    pull(sinkShape.in)
  }

  setHandler(sinkShape.in, new InHandler {
    override def onPush(): Unit = {
      val e = grab(sinkShape.in)
      println("conditional sink : " + e)
      counter = counter + 1
      counter == maxValue match {
        case true  => completeStage()
        case false => pull(sinkShape.in)
      }
    }
  })
}

object SampleSinkNotWorking {
  def main(args: Array[String]): Unit = {
    implicit val actorSystem = ActorSystem("NotWroking")
    implicit val actorMaterializer = ActorMaterializer()

    val inlet = Inlet[Int](name = "sampleInlet")
    val sinkShape = SinkShape(inlet)
    val countGraphStateLogic = new CountLogic(sinkShape, 10)
    val sinkGraphStage = new BaseGraphStage[Int](sinkShape, countGraphStateLogic)
    val sink = Sink.fromGraph(sinkGraphStage)

    val graph = GraphDSL.create() { implicit builder =>
      import GraphDSL.Implicits._
      Source(1 to 100) ~> sink
      ClosedShape
    }

    val runnableGraph = RunnableGraph.fromGraph(graph)
    runnableGraph.run()
  }
}
Running the above code gives an ArrayIndexOutOfBoundsException:
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: -1
    at akka.stream.stage.GraphStageLogic.setHandler(GraphStage.scala:439)
    at akka.shapes.examples.notworking.CountLogic.<init>(SampleSinkNotWorking.scala:24)
    at akka.shapes.examples.notworking.SampleSinkNotWorking$.main(SampleSinkNotWorking.scala:46)
    at akka.shapes.examples.notworking.SampleSinkNotWorking.main(SampleSinkNotWorking.scala)
I tried debugging, and it looks like the Inlet id is -1 and it's not getting reset.
But why isn't it getting reset when the GraphStageLogic is passed as a constructor argument to the GraphStage?
I refactored your code a bit and the problem is gone; take a look:
// In addition to the imports above, this needs: import akka.stream.stage.StageLogging
class BaseGraphStage(maxValue: Int) extends GraphStage[SinkShape[Int]] {
  val inlet = Inlet[Int](name = "sampleInlet")

  override def createLogic(inheritedAttributes: Attributes): GraphStageLogic =
    new GraphStageLogic(shape) with StageLogging {
      var counter: Int = 0

      setHandler(inlet, new InHandler {
        override def onPush(): Unit = {
          val e = grab(inlet)
          log.info(s"$e is consumed")
          counter += 1
          if (counter == maxValue) {
            completeStage()
          } else {
            pull(inlet)
          }
        }
      })

      override def preStart(): Unit =
        pull(inlet)

      override def postStop(): Unit =
        counter = 0
    }

  override def shape: SinkShape[Int] = SinkShape(inlet)
}

object SampleSinkNotWorking {
  def main(args: Array[String]): Unit = {
    implicit val actorSystem = ActorSystem("NotWorking")
    implicit val actorMaterializer = ActorMaterializer()

    val sink = Sink.fromGraph(new BaseGraphStage(10))
    Source(1 to 100).runWith(sink)
  }
}
I can't answer your last question fully, but I think the whole trick is to create the inlets in the context of the graph stage rather than outside of it, and to use the preStart and postStop hooks. Hope that helps.

Scala: expecting Try[Stream[(String)]] but getting Try[String]

I am trying to read a text file in order to compare 2 files. I have written the code to read the first file, and I expect the readFileStream function to give me a collection of Strings, but I am getting only a single String.
Could you see where I have gone wrong?
import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.net.URI

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}

import scala.util.{Failure, Success, Try}

object TestCompareHDFSFiles {
  def main(args: Array[String]): Unit = {
    val hdfs = FileSystem.get(new Configuration())
    val path1 = new Path(args(0))
    val path2 = new Path(args(1))
    readHDFSFile(hdfs, path1, path2)
  }

  // Accept a parameter which implements a close method
  def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
    try {
      f(resource)
    } finally {
      resource.close()
    }

  def readHDFSFile(hdfs: FileSystem, path1: Path, path2: Path): Option[Stream[(String, String)]] = {
    Try(using(new BufferedReader(new InputStreamReader(hdfs.open(path1))))(readFileStream))
  } match {
    case Success(result) => {
    }
    case Failure(ex) => {
      println(s"Could not read file $path1, detail ${ex.getClass.getName}:${ex.getMessage}")
      None
    }
  }

  def readFileStream(br: BufferedReader) = {
    for {
      line <- Try(br.readLine())
      if (line != null)
    } yield line
  }
}
I am stuck here. Any help, please.
Thanks.
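For what it's worth, the for comprehension in readFileStream iterates over a single Try(br.readLine()), so it can only ever yield one line. A hedged sketch of one way to return a whole collection of lines instead (switching the result to a strict List is my choice here, not something from the question, so that the lines are fully read before using closes the reader):

def readFileStream(br: BufferedReader): Try[List[String]] =
  Try {
    Iterator.continually(br.readLine()) // keep calling readLine()
      .takeWhile(_ != null)             // stop at end of file
      .toList                           // materialize before `using` closes the reader
  }

Note that readHDFSFile above also wraps the using call in its own Try, so the result of that expression would then be a nested Try; flattening it or dropping the outer Try is left to taste.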

How can I get the line number, function name, and package name in Scala?

I tried to Google it, and the search only gives me Java examples...
Thread.currentThread().getStackTrace()(2).getLineNumber
but in a worksheet it always returned 5, and in a working Scala / Play Framework 2.5 project it always returned 35.
I tried playing with the level (the index into the stack trace), but it did not work.
What I came up with, for example: create services/Log.scala
package services

object Log {
  def info(text: String) = {
    val systemStr = makeSystemStr
    Console.out.println(Console.BLUE + "[INFO] " + systemStr + Console.RESET + text)
  }

  def makeSystemStr() = {
    val fileName = Thread.currentThread().getStackTrace()(3).getFileName
    val lineNumber = Thread.currentThread().getStackTrace()(3).getLineNumber
    val cnArr = Thread.currentThread().getStackTrace()(3).getClassName.split("[$]")
    val pkgName = cnArr(0).split("[.]")(0)
    val name = cnArr(0).split("[.]")(1)
    val defName = cnArr(3)
    s"$pkgName : $fileName : $name : $defName : line $lineNumber - "
  }
}
Then in the controller:
package controllers

import play.api.mvc._
import services.Log

class SomeCtrl extends Controller {
  def Index = Action { request =>
    Log.info("some text")
    Ok("ok")
  }
}
Use a macro; you can easily get the line number, file name, package, class, and function name:
import scala.language.experimental.macros
import scala.reflect.macros.blackbox

object SrcFile {
  def currentLine: Int = macro SrcFileImpl.currentLine
  def currentFileName: String = macro SrcFileImpl.currentFileName
  def currentPackage: String = macro SrcFileImpl.currentPackage
  def currentClassName: String = macro SrcFileImpl.currentClassName
  def currentFuncName: String = macro SrcFileImpl.currentFuncName
}

class SrcFileImpl(val c: blackbox.Context) {
  import c.universe._

  def getPackage(symbol: Symbol): String =
    if (symbol.isPackage) symbol.fullName else getPackage(symbol.owner)

  def getClass(symbol: Symbol): String =
    if (symbol.isClass) symbol.name.toTypeName.toString else getClass(symbol.owner)

  def currentPackage: c.Expr[String] = c.Expr(q"${getPackage(c.internal.enclosingOwner)}")
  def currentFileName: c.Expr[String] = c.Expr(q"${c.enclosingPosition.source.file.name}")
  def currentLine: c.Expr[Int] = c.Expr(q"${c.enclosingPosition.line}")
  def currentClassName: c.Expr[String] = c.Expr(q"${getClass(c.internal.enclosingOwner)}")
  def currentFuncName: c.Expr[String] = c.Expr(q"${c.internal.enclosingOwner.name.toTermName.toString}")
}
// test
package so

object SrcFileTest extends App {
  def f() = {
    println(SrcFile.currentFileName)
    println(SrcFile.currentLine)
    println(SrcFile.currentPackage)
    println(SrcFile.currentClassName)
    println(SrcFile.currentFuncName)
  }
  f()
}
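As a side note, a similar result can be had without writing macros by pulling in Li Haoyi's sourcecode library; the dependency coordinates and the exact implicit types below are assumptions to verify against the library's documentation, not something from the answer above:

// build.sbt (coordinates assumed): libraryDependencies += "com.lihaoyi" %% "sourcecode" % "<version>"
package services

object Log2 {
  // Each implicit parameter is filled in at the call site by sourcecode's macros.
  def info(text: String)(implicit
      file: sourcecode.FileName,
      line: sourcecode.Line,
      pkg: sourcecode.Pkg,
      enclosing: sourcecode.Enclosing): Unit =
    println(s"[INFO] ${pkg.value} : ${file.value} : ${enclosing.value} : line ${line.value} - $text")
}

Calling Log2.info("some text") from a controller then reports the caller's file name, line, package, and enclosing method, much like the macro bundle above.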

Convert Any type in Scala to Array[Byte] and back

I have a variable value declared as Any in my program.
I want to convert this value to Array[Byte].
How can I serialize it to Array[Byte] and back? I found examples related to other types such as Double or Int, but not to Any.
This should do what you need. It's pretty similar to how one would do it in Java.
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

object Serialization extends App {
  def serialise(value: Any): Array[Byte] = {
    val stream: ByteArrayOutputStream = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(stream)
    oos.writeObject(value)
    oos.close()
    stream.toByteArray
  }

  def deserialise(bytes: Array[Byte]): Any = {
    val ois = new ObjectInputStream(new ByteArrayInputStream(bytes))
    val value = ois.readObject
    ois.close()
    value
  }

  println(deserialise(serialise("My Test")))
  println(deserialise(serialise(List(1))))
  println(deserialise(serialise(Map(1 -> 2))))
  println(deserialise(serialise(1)))
}
Alternatively, with SerializationUtils from Apache Commons Lang (note asInstanceOf, not isInstanceOf, which would serialize a Boolean instead of the value):

import org.apache.commons.lang3.SerializationUtils

def anyTypeToByteArray(value: Any): Array[Byte] = {
  val valueConverted: Array[Byte] = SerializationUtils.serialize(value.asInstanceOf[Serializable])
  valueConverted
}

def byteArrayToAny(value: Array[Byte]): Any = {
  val valueConverted: Any = SerializationUtils.deserialize(value)
  valueConverted
}
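A small usage sketch for either variant (the User case class is just an illustrative assumption): whatever is stored in the Any must actually implement java.io.Serializable, and the caller gets an Any back, so a cast is needed on the way out.

case class User(name: String, age: Int) // case classes extend Serializable by default

val bytes: Array[Byte] = Serialization.serialise(User("Ann", 30))
val user: User = Serialization.deserialise(bytes).asInstanceOf[User]
println(user) // User(Ann,30)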

How to run IntelliJ IDEA (Spark and Scala) code on Apache Spark in terminal mode

I have written my code in IntelliJ IDEA (Scala and Spark) and I want to run this code on Linux using the terminal. How can I do this? I can't access graphical mode on this Linux server.
For example, this is code similar to my code:
package LDAv1

import java.io._

import org.apache.commons.math3.special._
import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd._
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.immutable._
import scala.collection.mutable._

object SparkLDA {

  implicit def arrayToVector(s: Array[Int]) = new Vector(s)
  implicit def vectorToArray(s: Vector) = s.data

  def main(args: Array[String]) {
    var numTopics: Int = 3
    var inPath: String = "data/MR.dat"
    var outPath: String = "data"
    var master: String = "local[*]"
    var iter: Int = 100
    var mem = "4g"
    var debug = false
    lda(inPath, outPath, master, numTopics, (50 / numTopics), 0.1, iter, debug, mem);
  }

  def lda(pathToFileIn: String, pathToFileOut: String, URL: String, numTopics: Int, alpha: Double, beta: Double, numIter: Int, deBug: Boolean, mem: String) {
    val (conf, sc) = initializeSpark(URL, deBug, mem)
    var (documents, dictionary, topicCount) = importText(pathToFileIn, numTopics, sc)
    val ll: MutableList[Double] = MutableList[Double]()
    for (i <- 0 to numIter) {
      var (doc, dict, tC) = step(sc, documents, numTopics, dictionary, topicCount, alpha, beta)
      documents = doc
      dictionary = dict
      topicCount = tC
      if (deBug) ll += logLikelihood(dictionary, topicCount, alpha, beta)
      System.gc()
    }
    saveAll(documents, ll, sc, dictionary, topicCount, pathToFileOut, deBug)
  }

  def initializeSpark(URL: String, debug: Boolean, mem: String) = {
    if (!debug) Logger.getLogger("org").setLevel(Level.WARN)
    val conf = new SparkConf()
      .setAppName("Spark LDA")
      .setMaster(URL)
      .set("spark.executor.memory", "4g")
    val sc = new SparkContext(conf)
    (conf, sc)
  }

  def importText(pathToFileIn: String, numTopics: Int, sc: SparkContext) = {
    val stopWords = sc.broadcast(List[String]("a", "able", "about", "above", "according", "accordingly", "across", "actually", "after"));
    val textFile = sc.textFile(pathToFileIn, 4)
    val documents = textFile.map(line => {
      val topicDistrib = new Array[Int](numTopics)
      val lineCleaned = line.replaceAll("[^A-Za-z ]", "").toLowerCase()
      (lineCleaned.split(" ").map(word => {
        var topic: Int = 0
        var wrd: String = ""
        if (word.length() > 1 && (!stopWords.value.contains(word))) {
          topic = Integer.parseInt(Math.round(Math.random() * (numTopics - 1)).toString)
          topicDistrib.increment(topic)
          wrd = word
        }
        (wrd, topic)
      }), topicDistrib)
    })
    val (dictionary, topicCount) = updateVariables(documents, numTopics)
    (documents, dictionary, topicCount)
  }

  def updateVariables(documents: RDD[(Array[(String, Int)], Array[Int])], numTopics: Int) = {
    val dictionary = documents.flatMap(line => line._1).map(tuple => {
      var value: Array[Int] = new Array[Int](numTopics)
      if (!tuple._1.equals("")) {
        value(tuple._2) += 1
      }
      (tuple._1, value)
    }).reduceByKey((a: Array[Int], b) => {
      for (i <- 0 to a.length - 1) {
        a(i) += b(i)
      }
      (a)
    }).collect().toMap
    println(dictionary.take(2))
    val topicCount: Array[Int] = new Array[Int](numTopics)
    dictionary.foreach(t => topicCount.add(t._2))
    (dictionary, topicCount)
  }

  def step(sc: SparkContext, documents: RDD[(Array[(String, Int)], Array[Int])], numTopics: Int, dict: scala.collection.immutable.Map[String, Array[Int]], tC: Array[Int], alpha: Double, beta: Double) = {
    val dictionary = sc.broadcast(dict)
    val topicCount = sc.broadcast(tC)
    val v = dict.size
    val doc = documents.map(tuple => {
      val topicDistrib = tuple._2
      val line = tuple._1
      val lineupDated = line.map(t => {
        val word = t._1
        var top = t._2
        if (!t._1.equals("")) {
          topicDistrib.decrement(top)
          top = gibbsSampling(topicDistrib, dictionary.value(word), topicCount.value, alpha, beta, v)
          topicDistrib.increment(top)
        }
        (word, top)
      })
      (lineupDated, topicDistrib)
    })
    val (dicti, topC) = updateVariables(doc, numTopics)
    (doc: RDD[(Array[(String, Int)], Array[Int])], dicti, topC)
  }

  def saveAll(documents: RDD[(Array[(String, Int)], Array[Int])], LogLikelihood: MutableList[Double], sc: SparkContext, dictionary: scala.collection.immutable.Map[String, Array[Int]], topicCount: Array[Int], path: String, deBug: Boolean) {
    removeAll(path)
    saveDocuments(documents, path)
    saveDictionary(sc, dictionary, path)
    saveTopicCount(sc, topicCount, path)
    if (deBug) saveLogLikelihood(sc, LogLikelihood, path)
  }

  def saveDocuments(documents: RDD[(Array[(String, Int)], Array[Int])], path: String) {
    removeAll(path + "/documentsTopics")
    documents.map {
      case (topicAssign, topicDist) =>
        var topicDistNorm: Array[Double] = topicDist.normalize()
        val probabilities = topicDistNorm.toList.mkString(", ")
        (probabilities)
    }.saveAsTextFile(path + "/documentsTopics")
  }

  def saveDictionary(sc: SparkContext, dictionary: scala.collection.immutable.Map[String, Array[Int]], path: String) {
    removeAll(path + "/wordsTopics")
    val dictionaryArray = dictionary.toArray
    val temp = sc.parallelize(dictionaryArray).map {
      case (word, topics) =>
        var topicsNorm: Array[Double] = topics.normalize()
        val topArray = topicsNorm.toList.mkString(", ")
        val wordCount = topics.sumAll()
        val temp2 = List(word, wordCount, topArray).mkString("\t")
        (temp2)
    }
    temp.saveAsTextFile(path + "/wordsTopics")
  }

  def saveTopicCount(sc: SparkContext, topicCount: Array[Int], path: String) {
    removeAll(path + "/topicCount")
    val temp = sc.parallelize(topicCount).map {
      case (count) =>
        (count)
    }
    temp.saveAsTextFile(path + "/topicCount")
  }

  def saveLogLikelihood(sc: SparkContext, LogLikelihood: MutableList[Double], path: String) {
    removeAll(path + "/logLikelihood")
    val temp = sc.parallelize(LogLikelihood).map {
      case (count) =>
        (count)
    }
    temp.saveAsTextFile(path + "/logLikelihood")
  }

  def gibbsSampling(docTopicDistrib: Array[Int], wordTopicDistrib: Array[Int], topicCount: Array[Int], alpha: Double, beta: Double, v: Int): Int = {
    val numTopic = docTopicDistrib.length
    var ro: Array[Double] = new Array[Double](numTopic)
    ro(0) = (docTopicDistrib(0) + alpha) * (wordTopicDistrib(0) + beta) / (topicCount(0) + v * beta)
    for (i <- 1 to numTopic - 1) {
      ro(i) = ro(i - 1) + (docTopicDistrib(i) + alpha) * (wordTopicDistrib(i) + beta) / (topicCount(i) + v * beta)
    }
    var x = Math.random() * ro(numTopic - 1)
    var i: Int = 0
    while (x > ro(i) && i < numTopic - 1) i += 1
    return i
  }

  def logLikelihood(dictionary: scala.collection.immutable.Map[String, Array[Int]], topicCount: Array[Int], alpha: Double, beta: Double): Double = {
    val V: Int = dictionary.size
    val numTopics: Int = topicCount.length - 1
    var logLikelihood: Double = numTopics * (Gamma.logGamma(V * beta) - V * Gamma.logGamma(beta))
    for (i <- 0 to numTopics) {
      var sum: Double = 0
      dictionary.foreach { t =>
        sum += Gamma.logGamma(t._2(i) + beta)
      }
      logLikelihood += sum - Gamma.logGamma(topicCount(i) + V * beta)
    }
    (logLikelihood)
  }

  def removeAll(pathDir: String) = {
    def delete(file: File): Array[(String, Boolean)] = {
      Option(file.listFiles).map(_.flatMap(f => delete(f))).getOrElse(Array()) :+ (file.getPath -> file.delete)
    }
  }
}
And it has one Scala class:
package LDAv1

class Vector(val vect: Array[Int]) {

  var data: Array[Int] = vect;

  def this(size: Int) {
    this(new Array[Int](size));
  }

  def increment(index: Int) {
    data(index) += 1;
  }

  def decrement(index: Int) {
    data(index) -= 1;
  }

  def printIt() {
    print("[")
    for (i <- 0 to data.length - 1) print(data(i) + ",");
    print("]\n")
  }

  def forEach(callback: (Int) => Unit) = {
    for (i <- 0 to data.length - 1) callback(data(i));
  }

  def add(a: Array[Int]) {
    for (i <- 0 to data.length - 1) data(i) += a(i);
  }

  def sumAll(): Int = {
    var sum: Int = 0;
    for (i <- 0 to data.length - 1) sum += data(i);
    (sum)
  }

  def normalize(): Array[Double] = {
    var temp: Array[Double] = new Array[Double](data.length);
    var sum: Double = 0;
    for (i <- 0 to data.length - 1) sum += data(i);
    if (sum > 0) {
      for (i <- 0 to data.length - 1) {
        temp(i) = data(i).toDouble / sum
      };
    }
    (temp)
  }
}
You have to create a fat jar with all dependencies included; then you can submit your application using Spark's built-in tool, spark-submit.
https://spark.apache.org/docs/latest/submitting-applications.html
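A minimal sketch of how that typically looks with sbt and the sbt-assembly plugin; the plugin version, Spark/Scala versions, and jar name below are assumptions to adapt to your own build, not something from the answer above:

// project/plugins.sbt
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")

// build.sbt: mark Spark as "provided" so it is not bundled into the fat jar
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.4.8" % "provided"

# build the fat jar, copy it to the Linux server, then submit it from the terminal:
sbt assembly
spark-submit \
  --class LDAv1.SparkLDA \
  --master local[*] \
  target/scala-2.12/myapp-assembly-0.1.jar

Marking Spark as "provided" keeps the jar small and avoids version clashes, because spark-submit already puts the cluster's own Spark classes on the classpath at runtime.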