Performance using OrientDB graph API vs SQL query - Scala

I'm currently using this API for the standard operations (read, remove, find, save):
http://orientdb.com/docs/last/Graph-Database-Tinkerpop.html
I noticed that the performance of this approach for removals is really bad:
object Odb {
  val factory = new OrientGraphFactory("remote:localhost:2424/recommendation-system", "root", "root").setupPool(1, 10)
  def clearDb = {
    val graph = factory.getNoTx
    val vertices = graph.getVertices().asScala.map(v => v.remove())
  }
}
object TagsOdb extends TagsDao {
  override def count: Future[Long] = Future {
    val graph = Odb.factory.getNoTx
    val count = graph.countVertices("Tags")
    count
  }
  override def update(newTag: Tag, oldTag: Tag): Future[Boolean] = Future { synchronized {
    val graph = Odb.factory.getTx
    val tagVertices = graph.getVertices("Tags.tag", oldTag.flatten).asScala
    if(tagVertices.isEmpty) throw new Exception("Tag not found: " + oldTag.id)
    tagVertices.head.setProperty("tag", newTag.flatten)
    graph.commit()
    true
  }}
  override def all: Future[List[Tag]] = Future {
    val graph = Odb.factory.getNoTx
    val tagVertices = graph.getVerticesOfClass("Tags").asScala
    val tagList = tagVertices.map(v => Tag(v.getProperty("tag"), None)).toList
    tagList
  }
  override def remove(e: Tag): Future[Boolean] = Future { synchronized {
    val graph = Odb.factory.getTx
    val tagVertices = graph.getVertices("Tags.tag", e.flatten).asScala
    if(tagVertices.isEmpty) throw new Exception("Tag not found: " + e.flatten)
    tagVertices.head.remove()
    graph.commit()
    true
  }}
  override def save(e: Tag, upsert: Boolean = false): Future[Boolean] = Future { synchronized {
    val graph = Odb.factory.getTx
    val v = graph.getVertices("Tags.tag", e.flatten).asScala
    if(v.nonEmpty) {
      if (upsert)
        v.head.setProperty("tag", e.flatten)
      else
        throw new Exception("Element already in database")
    } else {
      val tagVertex = graph.addVertex("Tags", null)
      tagVertex.setProperty("tag", e.flatten)
    }
    graph.commit()
    true
  }}
  override def find(query: String): Future[List[Tag]] = Future {
    val graph = Odb.factory.getNoTx
    val res: OrientDynaElementIterable = graph.command(new OCommandSQL(query)).execute()
    val ridTags: Iterable[Vertex] = res.asScala.asInstanceOf[Iterable[Vertex]]
    def getTag(rid: AnyRef): Tag = {
      val tagVertex = graph.getVertex(rid)
      Tag(tagVertex.getProperty("tag"), None)
    }
    ridTags.map(r => getTag(r)).toList
  }
}
Is there any way to get better performance? Should I use SQL queries instead?

Since you're connected via the remote interface, each remove() is a separate RPC. Try executing a DELETE VERTEX command on the server instead.
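For example, a DELETE VERTEX issued through the same command API already used in the find method above would clear the Tags class in a single round trip. A minimal sketch (the helper name is illustrative, not from the question):

  import com.orientechnologies.orient.core.sql.OCommandSQL

  // Sketch: delete all Tags vertices server-side instead of one remove() RPC per vertex.
  def clearTagsServerSide(): Unit = {
    val graph = Odb.factory.getNoTx
    val deleted: AnyRef = graph.command(new OCommandSQL("DELETE VERTEX Tags")).execute()
    println(s"deleted: $deleted")
  }

  // For a single tag, a parameterized form could replace the per-vertex lookup and remove:
  // graph.command(new OCommandSQL("DELETE VERTEX Tags WHERE tag = ?")).execute(e.flatten)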

Related

Download an archive file on the fly using Play Framework 2.3

I'm trying to create and download an archive file without holding it all in memory (to avoid out-of-memory exceptions for large files). I'm using Play Framework 2.3 with Scala. After some research I found an example, https://gist.github.com/kirked/412b5156f94419e71ce4a84ec1d54761, and made some modifications to it. My problem is that when I download the file and try to open it, I get this error: "An error occurred while loading the archive."
Here is all the code:
def zip() = Action { implicit request: Request[AnyContent] =>
  val buffer = new ZipBuffer(10000)
  val writeCentralDirectory = Enumerator.generateM(Future {
    if (buffer.isClosed) {
      None
    } else {
      buffer.flush
      buffer.close
      Some(buffer.bytes)
    }
  })
  val test = Enumerator.apply(ResolvedSource2("test.txt", "helllo"))
  Ok.chunked(test &> zipeach2(buffer) andThen writeCentralDirectory >>> Enumerator.eof) as withCharset("application/zip") withHeaders (
    CONTENT_DISPOSITION -> s"attachment; filename=aa.zip")
}

case class ResolvedSource2(filepath: String, stream: String)

def zipeach2(buffer: ZipBuffer)(implicit ec: ExecutionContext): Enumeratee[ResolvedSource2, Array[Byte]] = {
  Enumeratee.mapConcat[ResolvedSource2] { source =>
    buffer.zipStream.putNextEntry(new ZipEntry(source.filepath))
    var done = false
    def entryDone: Unit = {
      done = true
      buffer.zipStream.closeEntry
    }
    def restOfStream: Stream[Array[Byte]] = {
      if (done) Stream.empty
      else {
        while (!done && !buffer.full) {
          try {
            val byte = source.stream
            buffer.zipStream.write(byte.getBytes)
            entryDone
          } catch {
            case e: IOException =>
              println(s"reading/zipping stream [${source.filepath}]", e)
          }
        }
        buffer.bytes #:: restOfStream
      }
    }
    restOfStream
  }
}

class ZipBuffer(capacity: Int) {
  private val buf = new ByteArrayOutputStream(capacity)
  private var closed = false
  val zipStream = new ZipOutputStream(buf)

  def close(): Unit = {
    if (!closed) {
      closed = true
      reset
      zipStream.finish()
      zipStream.close // writes central directory
    }
  }
  def flush() = {
    zipStream.flush()
  }
  def isClosed = closed
  def reset: Unit = buf.reset
  def full: Boolean = buf.size >= capacity
  def bytes: Array[Byte] = {
    val result = buf.toByteArray
    reset
    result
  }
}
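For reference, and separate from the question's code: the underlying technique is writing entries to a ZipOutputStream over an in-memory buffer, where the output only becomes a valid archive once finish() has written the central directory. A minimal standalone sketch (names are illustrative):

  import java.io.ByteArrayOutputStream
  import java.util.zip.{ZipEntry, ZipOutputStream}

  // Standalone sketch: zip a single in-memory entry and return the finished archive bytes.
  def zipSingleEntry(name: String, content: String): Array[Byte] = {
    val buf = new ByteArrayOutputStream()
    val zip = new ZipOutputStream(buf)
    zip.putNextEntry(new ZipEntry(name))
    zip.write(content.getBytes("UTF-8"))
    zip.closeEntry()
    zip.finish() // writes the central directory; without it the archive won't open
    zip.close()
    buf.toByteArray
  }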

Why does my Akka data stream stop processing a huge file (~250,000 lines of strings) but work for a small file?

My stream works for a smaller file of 1,000 lines but stops when I test it on a large file (~12 MB, ~250,000 lines). I tried applying backpressure with a buffer and throttling it, and it's still the same thing...
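(For context, the buffer/throttle attempt mentioned above would look roughly like the following sketch; the operators are standard Akka Streams ones, but the numbers and strategy are illustrative, not the project's actual settings.)

  import scala.concurrent.duration._
  import akka.stream.{OverflowStrategy, ThrottleMode}
  import akka.stream.scaladsl.Source

  // Sketch: buffer up to 1000 lines and cap throughput at 1000 lines per second.
  val throttledLines = Source
    .fromIterator(() => scala.io.Source.fromFile("data/BX-Users.csv", "ISO-8859-1").getLines().drop(1))
    .buffer(1000, OverflowStrategy.backpressure)
    .throttle(1000, 1.second, 1000, ThrottleMode.shaping)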
Here is my data streamer:
class UserDataStreaming(usersFile: File) {
  implicit val system = ActorSystemContainer.getInstance().getSystem
  implicit val materializer = ActorSystemContainer.getInstance().getMaterializer

  def startStreaming() = {
    val graph = RunnableGraph.fromGraph(GraphDSL.create() { implicit builder =>
      val usersSource = builder.add(Source.fromIterator(() => usersDataLines)).out
      val stringToUserFlowShape: FlowShape[String, User] = builder.add(csvToUser)
      val averageAgeFlowShape: FlowShape[User, (String, Int, Int)] = builder.add(averageUserAgeFlow)
      val averageAgeSink = builder.add(Sink.foreach(averageUserAgeSink)).in
      usersSource ~> stringToUserFlowShape ~> averageAgeFlowShape ~> averageAgeSink
      ClosedShape
    })
    graph.run()
  }

  val usersDataLines = scala.io.Source.fromFile(usersFile, "ISO-8859-1").getLines().drop(1)

  val csvToUser = Flow[String].map(_.split(";").map(_.trim)).map(csvLinesArrayToUser)

  def csvLinesArrayToUser(line: Array[String]) = User(line(0), line(1), line(2))

  def averageUserAgeSink[usersSource](source: usersSource) {
    source match {
      case (age: String, count: Int, totalAge: Int) => println(s"age = $age; Average reader age is: ${Try(totalAge/count).getOrElse(0)} count = $count and total age = $totalAge")
      case bad => println(s"Bad case: $bad")
    }
  }

  def averageUserAgeFlow = Flow[User].fold(("", 0, 0)) { (nums: (String, Int, Int), user: User) =>
    var counter: Option[Int] = None
    var totalAge: Option[Int] = None
    val ageInt = Try(user.age.substring(1, user.age.length-1).toInt)
    if (ageInt.isSuccess) {
      counter = Some(nums._2 + 1)
      totalAge = Some(nums._3 + ageInt.get)
    } else {
      counter = Some(nums._2 + 0)
      totalAge = Some(nums._3 + 0)
    }
    //println(counter.get)
    (user.age, counter.get, totalAge.get)
  }
}
Here is my Main:
object Main {
  def main(args: Array[String]): Unit = {
    implicit val system = ActorSystemContainer.getInstance().getSystem
    implicit val materializer = ActorSystemContainer.getInstance().getMaterializer
    val usersFile = new File("data/BX-Users.csv")
    println(usersFile.length())
    val userDataStreamer = new UserDataStreaming(usersFile)
    userDataStreamer.startStreaming()
  }
}
It's possible that there is an error related to one row of your CSV file. In that case, the stream fails with an error and stops. Try defining your flow like this:
Flow[String].map { user =>
  csvToUser(user)
}.withAttributes(ActorAttributes.supervisionStrategy {
  case ex: Throwable =>
    log.error("Error parsing row event: {}", ex)
    Supervision.Resume
})
In this case the exception is caught, and the stream ignores the error and continues.
If you use Supervision.Stop instead, the stream stops.
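The same resuming behaviour can also be installed once at the materializer level rather than per flow; a rough sketch (the decider and exception type are illustrative, chosen because the CSV parsing indexes into the split array):

  import akka.actor.ActorSystem
  import akka.stream.{ActorMaterializer, ActorMaterializerSettings, Supervision}

  implicit val system = ActorSystem("csv-import")

  // Resume on bad rows, stop on anything else.
  val decider: Supervision.Decider = {
    case _: ArrayIndexOutOfBoundsException => Supervision.Resume
    case _                                 => Supervision.Stop
  }

  implicit val materializer = ActorMaterializer(
    ActorMaterializerSettings(system).withSupervisionStrategy(decider))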

Akka-streams: how to get flow names in metrics reported by kamon-akka

I've been trying to set up some instrumentation for Akka Streams. I got it working, but even though I named all the Flows that are part of the streams, I still get names like this in the metrics: flow-0-0-unknown-operation
A simple example of what I'm trying to do:
val myflow = Flow[String].named("myflow").map(println)
Source.via(myflow).to(Sink.ignore).run()
I basically want to see the metrics for the Actor that gets created for "myflow", with a proper name.
Is this even possible? Am I missing something?
I had this challenge in my project and solved it using Kamon + Prometheus. However, I had to create a custom Akka Streams Flow stage whose name I can set via metricName and from which I export the metric values using val kamonThroughputGauge: Metric.Gauge.
class MonitorProcessingTimerFlow[T](interval: FiniteDuration)(metricName: String = "monitorFlow") extends GraphStage[FlowShape[T, T]] {
  val in = Inlet[T]("MonitorProcessingTimerFlow.in")
  val out = Outlet[T]("MonitorProcessingTimerFlow.out")

  Kamon.init()
  val kamonThroughputGauge: Metric.Gauge = Kamon.gauge("akka-stream-throughput-monitor")

  override def createLogic(inheritedAttributes: Attributes): GraphStageLogic = new TimerGraphStageLogic(shape) {
    // mutable state
    var open = false
    var count = 0
    var start = System.nanoTime

    setHandler(in, new InHandler {
      override def onPush(): Unit = {
        try {
          push(out, grab(in))
          count += 1
          if (!open) {
            open = true
            scheduleOnce(None, interval)
          }
        } catch {
          case e: Throwable => failStage(e)
        }
      }
    })

    setHandler(out, new OutHandler {
      override def onPull(): Unit = {
        pull(in)
      }
    })

    override protected def onTimer(timerKey: Any): Unit = {
      open = false
      val duration = (System.nanoTime - start) / 1e9d
      val throughput = count / duration
      kamonThroughputGauge.withTag("name", metricName).update(throughput)
      count = 0
      start = System.nanoTime
    }
  }

  override def shape: FlowShape[T, T] = FlowShape[T, T](in, out)
}
Then I created a simple stream that uses the MonitorProcessingTimerFlow to export the metrics:
implicit val system = ActorSystem("FirstStreamMonitoring")

val source = Source(Stream.from(1)).throttle(1, 1 second)

/** Simulating workload fluctuation: a Flow that expands each event into a random number of events */
val flow = Flow[Int].extrapolate { element =>
  Stream.continually(Random.nextInt(100)).take(Random.nextInt(100)).iterator
}

val monitorFlow = Flow.fromGraph(new MonitorProcessingTimerFlow[Int](5 seconds)("monitorFlow"))

val sink = Sink.foreach[Int](println)

val graph = source
  .via(flow)
  .via(monitorFlow)
  .to(sink)

graph.run()
With the proper configuration in application.conf:
kamon.instrumentation.akka.filters {
  actors.track {
    includes = [ "FirstStreamMonitoring/user/*" ]
  }
}
I can see the throughput metrics in the Prometheus console with the tag name="monitorFlow".

Handle Spark state isTimingOut

I'm using the new mapWithState function in Spark Streaming (1.6) with a timing-out state. I want to take the timing-out state and add it to another RDD in order to use it for calculations further down the road:
val aggedlogs = sc.emptyRDD[MyLog];

val mappingFunc = (key: String, newlog: Option[MyLog], state: State[MyLog]) => {
  val _newLog = newlog.getOrElse(null)
  if ((state.exists()) && (_newLog != null)) {
    val stateLog = state.get()
    val combinedLog = LogUtil.CombineLogs(_newLog, stateLog);
    state.update(combinedLog)
  }
  else if (_newLog != null) {
    state.update(_newLog);
  }
  if (state.isTimingOut()) {
    val stateLog = state.get();
    aggedlogs.union(sc.parallelize(List(stateLog), 1))
  }
  val stateLog = state.get();
  (key, stateLog);
}

val stateDstream = reducedlogs.mapWithState(StateSpec.function(mappingFunc).timeout(Seconds(10)))
But when I try to add it to an RDD inside the StateSpec function, I get an error that the function is not serializable. Any thoughts on how I can get past this?
EDIT:
After drilling deeper I found that my approach was wrong. Before trying this solution I tried to get the timing-out logs from stateSnapshots(), but they were not there anymore. Changing the mapping function to:
def mappingFunc(key: String, newlog: Option[MyLog], state: State[KomoonaLog]): Option[(String, MyLog)] = {
  val _newLog = newlog.getOrElse(null)
  if ((state.exists()) && (_newLog != null)) {
    val stateLog = state.get()
    val combinedLog = LogUtil.CombineLogs(_newLog, stateLog);
    state.update(combinedLog)
    Some(key, combinedLog);
  }
  else if (_newLog != null) {
    state.update(_newLog);
    Some(key, _newLog);
  }
  if (state.isTimingOut()) {
    val stateLog = state.get();
    stateLog.timinigOut = true;
    System.out.println("timinigOut : " + key);
    Some(key, stateLog);
  }
  val stateLog = state.get();
  Some(key, stateLog);
}
I managed to filter the mapWithState DStream for the logs that are timing out in each batch:
val stateDstream = reducedlogs.mapWithState(
  StateSpec.function(mappingFunc _).timeout(Seconds(60)))

val tiningoutlogs = stateDstream.filter(filtertimingout)
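filtertimingout itself isn't shown; a minimal sketch, assuming it just checks the timinigOut flag set inside the mapping function (the element type follows from the Option[(String, MyLog)] return type above):

  def filtertimingout(entry: Option[(String, MyLog)]): Boolean =
    entry.exists { case (_, log) => log.timinigOut }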

How to list out all the files in the public directory in a Play Framework 2.X Scala application?

Here is my controller:
class Proguard extends Controller {
  val proguardFolder = "/public/proguards/"
  val proguardFolderFix = "/public/proguards"
  val proguardSuffix = "proguard-"
  val proguardExtension = ".pro"
  val title = "# Created by https://www.proguard.io/api/%s\n\n%s"

  def proguard(libraryName: String) = Action {
    val libraries = libraryName.split(',')
    val availableLibs = listInDir(proguardFolderFix)
    val result = availableLibs.filter(libraries.contains).map(readFile).mkString
    Ok(title.format(libraryName, result))
  }

  def list() = Action {
    Ok(Json.toJson(listInDir(proguardFolder)))
  }

  private def listInDir(filePath: String): List[String] = {
    getListOfFiles(Play.getFile(filePath)).map(_.getName.replace(proguardExtension, "").replace(proguardSuffix, ""))
  }

  def getListOfFiles(dir: File): List[File] = {
    dir.listFiles.toList
  }

  def readFile(string: String): String = {
    val source = scala.io.Source.fromFile(Play.getFile(s"$proguardFolder$proguardSuffix$string$proguardExtension"))
    val lines = try source.mkString finally source.close()
    lines
  }
}
It worked totally fine in debug mode, but in production on Heroku, dir.listFiles is giving me an NPE.
I've tried different approaches, but it looks like the only solution is to move my files to S3 or a database.