I am trying to join two dataset. Below are two dataset .
1/2/2009 6:17,iphone,800,Mastercard,carolina
1/2/2009 4:53,cloth,200,Visa,Betina
1/2/2009 13:08,cloth,100,Mastercard,Federica e Andrea
1/3/2009 14:44,blender,160,Visa,Gouya
1/4/2009 12:56,samsung,3600,Visa,Gerd W
1/4/2009 13:19,htc,1200,Visa,LAURENCE
1/4/2009 20:11,iphone,999,Mastercard,Fleur
1/2/2009 20:09,tmobile,81,Mastercard,adam
1/4/2009 13:17,iphone,400,Cash,Renee Elisabeth
similarly other dataset is :
Mastercard,MS
Visa,VS
I want to join two data set and get output like below:
(htc,VS)
(iphone,MS)
(iphone,NULL)
Below is what My approach :
def mapCard(cardname:String):String={
if(cardname.isEmpty()){
return "NONE"
}
else
return cardname
}
def main(args: Array[String]): Unit = {
val source = scala.io.Source.fromFile("bc.txt")
val keymap = scala.collection.mutable.Map[String, String]()
for (line <- source.getLines) {
val Array(country, capital) = line.split(",").map { _.trim() }
keymap += country -> capital
}
println(keymap)
val conf = new SparkConf().setMaster("local[2]").setAppName("AAA")
val sparkcontext = new SparkContext(conf)
val countriesCache = sparkcontext.broadcast(keymap)
val file = sparkcontext.textFile("salesdata.csv")
val a = file.map { line => line.split(",") }
.map { line => {
var columns = line(3)
if(countriesCache.value.contains(columns) )
{
columns.map { x => ( line(1),countriesCache.value(columns) ) }
}
else
columns.map { x => (line(1),"NULL") }
}
}
a.foreach(x=> println(x.mkString(",")))
}}
This doesnot give me my output.Please suggest me the issue here. Instead it gives like below .
htc,VS),(htc,VS),(htc,VS),(htc,VS)
(iphone,MS),(iphone,MS),(iphone,MS),(iphone,MS),(iphone,MS),(iphone,MS),(iphone,MS),(iphone,MS),(iphone,MS),(iphone,MS)
(cloth,VS),(cloth,VS),(cloth,VS),(cloth,VS)
I think to problem is that you iterate over the characters of your sing in these lines:
columns.map { x => ( line(1),countriesCache.value(columns) ) }
and
columns.map { x => (line(1),"NULL") }
just use
( line(1),countriesCache.value(columns) )
and
(line(1),"NULL")
So I've tried seemingly countless things to get this to work. When I call queueWrite, the println statements give me this:
{ "uuid" : "49f2-0b64-4bf3-49f2a35b-bbe8-4954f742d88b" }
and this:
{ "uuid" : "49f2-0b64-4bf3-49f2a35b-bbe8-4954f742d88b", "name" : "personName", "key" : "3E6A" }
Which (I'm pretty sure) is just fine. However, after it prints, I get this:
java.lang.IllegalArgumentException: Invalid BSON field name uuid
Afaik, the field name uuid is fine, the only things about an improper name I could really find is to just make sure there are no '.' symbols in it (which there aren't)
def queueWrite(collection: String, filter: Map[String, () => String], data: Map[String, () => String]) {
val col = collections.get(collection).get
val filterDoc = new BsonDocument
filter.foreach(f => { filterDoc append (f._1, new BsonString(f._2.apply)) })
val filterBson = Document(filterDoc)
println("filter: \n" + filterBson.toJson)
val dataDoc = new BsonDocument
data.foreach(f => { dataDoc append (f._1, new BsonString(f._2.apply)) })
val dataBson = Document(dataDoc)
println("data: \n" + dataBson.toJson)
val options = new FindOneAndUpdateOptions
options.returnDocument(ReturnDocument.AFTER)
options.upsert(true)
val observer = new Observer[Document] {
override def onSubscribe(s: Subscription) = s.request(1)
override def onNext(doc: Document) = println(doc.toJson)
override def onError(e: Throwable) = e.printStackTrace
override def onComplete = println("onComplete")
}
val observable: Observable[Document] = col.findOneAndUpdate(filterBson, dataBson, options)
observable.subscribe(observer)
}
Any ideas / suggestions are greatly appreciated as always :)
My stream works for smaller file of 1000 lines but stops when I test it on a large file ~12MB and ~250,000 lines? I tried applying backpressure with a buffer and throttling it and still same thing...
Here is my data streamer:
class UserDataStreaming(usersFile: File) {
implicit val system = ActorSystemContainer.getInstance().getSystem
implicit val materializer = ActorSystemContainer.getInstance().getMaterializer
def startStreaming() = {
val graph = RunnableGraph.fromGraph(GraphDSL.create() {
implicit builder =>
val usersSource = builder.add(Source.fromIterator(() => usersDataLines)).out
val stringToUserFlowShape: FlowShape[String, User] = builder.add(csvToUser)
val averageAgeFlowShape: FlowShape[User, (String, Int, Int)] = builder.add(averageUserAgeFlow)
val averageAgeSink = builder.add(Sink.foreach(averageUserAgeSink)).in
usersSource ~> stringToUserFlowShape ~> averageAgeFlowShape ~> averageAgeSink
ClosedShape
})
graph.run()
}
val usersDataLines = scala.io.Source.fromFile(usersFile, "ISO-8859-1").getLines().drop(1)
val csvToUser = Flow[String].map(_.split(";").map(_.trim)).map(csvLinesArrayToUser)
def csvLinesArrayToUser(line: Array[String]) = User(line(0), line(1), line(2))
def averageUserAgeSink[usersSource](source: usersSource) {
source match {
case (age: String, count: Int, totalAge: Int) => println(s"age = $age; Average reader age is: ${Try(totalAge/count).getOrElse(0)} count = $count and total age = $totalAge")
case bad => println(s"Bad case: $bad")
}
}
def averageUserAgeFlow = Flow[User].fold(("", 0, 0)) {
(nums: (String, Int, Int), user: User) =>
var counter: Option[Int] = None
var totalAge: Option[Int] = None
val ageInt = Try(user.age.substring(1, user.age.length-1).toInt)
if (ageInt.isSuccess) {
counter = Some(nums._2 + 1)
totalAge = Some(nums._3 + ageInt.get)
}
else {
counter = Some(nums._2 + 0)
totalAge = Some(nums._3 + 0)
}
//println(counter.get)
(user.age, counter.get, totalAge.get)
}
}
Here is my Main:
object Main {
def main(args: Array[String]): Unit = {
implicit val system = ActorSystemContainer.getInstance().getSystem
implicit val materializer = ActorSystemContainer.getInstance().getMaterializer
val usersFile = new File("data/BX-Users.csv")
println(usersFile.length())
val userDataStreamer = new UserDataStreaming(usersFile)
userDataStreamer.startStreaming()
}
It´s possible that there may be any error related to one row of your csv file. In that case, the stream materializes and stops. Try to define your flows like that:
FlowFlowShape[String, User].map {
case (user) => try {
csvToUser(user)
}
}.withAttributes(ActorAttributes.supervisionStrategy {
case ex: Throwable =>
log.error("Error parsing row event: {}", ex)
Supervision.Resume
}
In this case the possible exception is captured and the stream ignores the error and continues.
If you use Supervision.Stop, the stream stops.
I want to apply a list of regex to a string. My current approach is not very functional
My current code:
val stopWords = List[String](
"the",
"restaurant",
"bar",
"[^a-zA-Z -]"
)
def CanonicalName(name: String): String = {
var nameM = name
for (reg <- stopWords) {
nameM = nameM.replaceAll(reg, "")
}
nameM = nameM.replaceAll(" +", " ").trim
return nameM
}
I think this does what you're looking for.
def CanonicalName(name: String): String = {
val stopWords = List("the", "restaurant", "bar", "[^a-zA-Z -]")
stopWords.foldLeft(name)(_.replaceAll(_, "")).replaceAll(" +"," ").trim
}
'replaceAll' has the possiblity to replace part of a word, for example: "the thermal & barbecue restaurant" is replaced to "rmal becue". If what you want is "thermal barbecue", you may split the name first and then apply your stopwords rules word by word:
def isStopWord(word: String): Boolean = stopWords.exists(word.matches)
def CanonicalName(name: String): String =
name.replaceAll(" +", " ").trim.split(" ").flatMap(n => if (isStopWord(n)) List() else List(n)).mkString(" ")
Here is my controller
class Proguard extends Controller {
val proguardFolder = "/public/proguards/"
val proguardFolderFix = "/public/proguards"
val proguardSuffix = "proguard-"
val proguardExtension = ".pro"
val title = "# Created by https://www.proguard.io/api/%s\n\n%s"
def proguard(libraryName: String) = Action {
val libraries = libraryName.split(',')
val availableLibs = listInDir(proguardFolderFix)
val result = availableLibs.filter(libraries.contains).map(readFile).mkString
Ok(title.format(libraryName, result))
}
def list() = Action {
Ok(Json.toJson(listInDir(proguardFolder)))
}
private def listInDir(filePath: String): List[String] = {
getListOfFiles(Play.getFile(filePath)).map(_.getName.replace(proguardExtension, "").replace(proguardSuffix, ""))
}
def getListOfFiles(dir: File): List[File] = {
dir.listFiles.toList
}
def readFile(string: String): String = {
val source = scala.io.Source.fromFile(Play.getFile(s"$proguardFolder$proguardSuffix$string$proguardExtension"))
val lines = try source.mkString finally source.close()
lines
}
}
It worked totally okay in debug mode, but in production at Heroku dir.listFiles. is giving me NPE
I've tried different ways, but looks like only solution is move my files to s3 or database.