Serialize Guava's MinMaxPriorityQueue - scala

After a few days of researching why my Flink application was not working properly, I've come to the conclusion that the problem lies in a MinMaxPriorityQueue I am using.
It seems that this structure is not serializable. I've tried several ways to serialize it:
env.getConfig.registerTypeWithKryoSerializer(classOf[MinMaxPriorityQueue[Double]], classOf[JavaSerializer])
env.getConfig.registerTypeWithKryoSerializer(classOf[MinMaxPriorityQueue[java.lang.Double]], classOf[ProtobufSerializer]);
env.getConfig().addDefaultKryoSerializer(MyCustomType.class, TBaseSerializer.class);
all of them without luck.
However I've found this: Serializing Guava's ImmutableTable
Is there an equivalent to MinMaxPriorityQueue, or a way to serialize it?
Update
I've translated Tomasz's answer into Scala:
class MinMaxPriorityQueueSerializer extends Serializer[MinMaxPriorityQueue[Object]] {
private[this] val log = LoggerFactory.getLogger(this.getClass)
setImmutable(false)
setAcceptsNull(false)
val OPTIMIZE_POSITIVE = true
override def read(kryo: Kryo, input: Input, aClass: Class[MinMaxPriorityQueue[Object]]): MinMaxPriorityQueue[Object] = {
log.error("Kryo READ")
val comparator: java.util.Comparator[Object] = kryo.readClassAndObject(input).asInstanceOf[java.util.Comparator[Object]] // read back as a plain Comparator, mirroring the Java version
val size = input.readInt(OPTIMIZE_POSITIVE)
val queue: MinMaxPriorityQueue[Object] = MinMaxPriorityQueue.orderedBy(comparator)
.expectedSize(size)
.create()
(0 until size).foreach(_ => queue.offer(kryo.readClassAndObject(input))) // read exactly `size` elements; an inclusive range would read one element too many
queue
}
override def write(kryo: Kryo, output: Output, queue: MinMaxPriorityQueue[Object]): Unit = {
log.error("Kryo WRITE")
kryo.writeClassAndObject(output, queue.comparator)
val declaredSize = queue.size
output.writeInt(declaredSize, OPTIMIZE_POSITIVE)
val actualSize = queue.toArray.foldLeft(0) {
case (z, q) =>
kryo.writeClassAndObject(output, q)
z + 1
}
Preconditions.checkState(
declaredSize == actualSize,
"Declared size (%s) different than actual size (%s)", declaredSize, actualSize)
}
}
And I set up Kryo in Flink to use that serializer:
env.getConfig.addDefaultKryoSerializer(classOf[MinMaxPriorityQueue[Double]], classOf[MinMaxPriorityQueueSerializer])
env.getConfig.registerTypeWithKryoSerializer(classOf[MinMaxPriorityQueue[Double]], classOf[MinMaxPriorityQueueSerializer])
However, it never seems to get called: I do not see the output of log.error("Kryo READ") or log.error("Kryo WRITE") anywhere in the logs.
And the transformation still returns an empty MinMaxPriorityQueue, even though I am updating it.
Update 2
I've implemented the SerializerTester, but I am getting a buffer underflow:
object Main {
def main(args: Array[String]) {
val tester = new MinMaxPriorityQueueSerializerTester()
val inQueue: MinMaxPriorityQueue[java.lang.Double] = MinMaxPriorityQueue.create()
inQueue.add(1.0)
val outputStream = new ByteArrayOutputStream()
tester.serialize(outputStream, inQueue)
val inputStream = new ByteArrayInputStream(outputStream.toByteArray())
val outQueue: MinMaxPriorityQueue[java.lang.Double] = tester.deserialize(inputStream);
System.out.println(inQueue);
System.out.println(outQueue);
}
class MinMaxPriorityQueueSerializerTester {
val kryo = new Kryo
kryo.setInstantiatorStrategy(new StdInstantiatorStrategy)
registerMinMaxSerializer();
// allowForClassesWithoutNoArgConstructor(); // needed to serialize Ordering
def registerMinMaxSerializer() {
kryo.addDefaultSerializer(classOf[MinMaxPriorityQueue[java.lang.Double]], new MinMaxPriorityQueueSerializer());
}
def serialize(out: OutputStream, queue: MinMaxPriorityQueue[java.lang.Double]) {
// try (Output output = new Output(out)) {
val output = new Output(out)
kryo.writeClassAndObject(output, queue)
// kryo.writeObject(output, queue)
//}
output.flush
}
def deserialize(in: InputStream): MinMaxPriorityQueue[java.lang.Double] = {
//try (Input input = new Input(in)) {
val input = new Input(in)
//kryo.readObject(input, classOf[MinMaxPriorityQueue[java.lang.Double]])
kryo.readClassAndObject(input).asInstanceOf[MinMaxPriorityQueue[java.lang.Double]]
//p}
}
}
}

You can use a custom Kryo Serializer.
Here is a sample one (in Java):
class MinMaxPriorityQueueSerializer extends Serializer<MinMaxPriorityQueue<Object>> {
private static final boolean OPTIMIZE_POSITIVE = true;
protected MinMaxPriorityQueueSerializer() {
setAcceptsNull(false);
setImmutable(false);
}
@Override
public void write(Kryo kryo, Output output, MinMaxPriorityQueue<Object> queue) {
kryo.writeClassAndObject(output, queue.comparator());
int declaredSize = queue.size();
output.writeInt(declaredSize, OPTIMIZE_POSITIVE);
int actualSize = 0;
for (Object element : queue) {
kryo.writeClassAndObject(output, element);
actualSize++;
}
Preconditions.checkState(
declaredSize == actualSize,
"Declared size (%s) different than actual size (%s)", declaredSize, actualSize
);
}
@Override
public MinMaxPriorityQueue<Object> read(Kryo kryo, Input input, Class<MinMaxPriorityQueue<Object>> type) {
@SuppressWarnings("unchecked")
Comparator<Object> comparator = (Comparator<Object>) kryo.readClassAndObject(input);
int size = input.readInt(OPTIMIZE_POSITIVE);
MinMaxPriorityQueue<Object> queue = MinMaxPriorityQueue.orderedBy(comparator)
.expectedSize(size)
.create();
for (int i = 0; i < size; ++i) {
queue.offer(kryo.readClassAndObject(input));
}
return queue;
}
}
Here is how you could use it:
class MinMaxPriorityQueueSerializerTester {
public static void main(String[] args) {
MinMaxPriorityQueueSerializerTester tester = new MinMaxPriorityQueueSerializerTester();
MinMaxPriorityQueue<Integer> inQueue = MinMaxPriorityQueue.<Integer>orderedBy(Comparator.reverseOrder())
.create(Arrays.asList(5, 2, 7, 2, 4));
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
tester.serialize(outputStream, inQueue);
ByteArrayInputStream inputStream = new ByteArrayInputStream(outputStream.toByteArray());
@SuppressWarnings("unchecked")
MinMaxPriorityQueue<Integer> outQueue = (MinMaxPriorityQueue<Integer>) tester.deserialize(inputStream);
System.out.println(inQueue);
System.out.println(outQueue);
}
private final Kryo kryo;
public MinMaxPriorityQueueSerializerTester() {
this.kryo = new Kryo();
registerMinMaxSerializer();
allowForClassesWithoutNoArgConstructor(); // needed to serialize Ordering
}
private void registerMinMaxSerializer() {
kryo.addDefaultSerializer(MinMaxPriorityQueue.class, new MinMaxPriorityQueueSerializer());
}
private void allowForClassesWithoutNoArgConstructor() {
((Kryo.DefaultInstantiatorStrategy) kryo.getInstantiatorStrategy())
.setFallbackInstantiatorStrategy(new StdInstantiatorStrategy());
}
public void serialize(OutputStream out, MinMaxPriorityQueue<?> queue) {
try (Output output = new Output(out)) {
kryo.writeObject(output, queue);
}
}
public MinMaxPriorityQueue<?> deserialize(InputStream in) {
try (Input input = new Input(in)) {
return kryo.readObject(input, MinMaxPriorityQueue.class);
}
}
}

I finally gave up and switched to a different data structure, making it serializable with java.io.Serializable.
This data structure is an IntervalHeap, implemented here; I just made it Serializable in my project.
Everything works correctly now.
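For reference, a minimal sketch of that last step, assuming the copied IntervalHeap class simply gains the java.io.Serializable marker; the helper below is just a sanity check that an instance survives a plain Java-serialization round trip:
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// Hypothetical: the copied class only needs the marker interface, e.g.
//   class IntervalHeap[A](...) extends java.io.Serializable { ... }

// Serialize and deserialize a value with plain Java serialization.
def roundTrip[T <: java.io.Serializable](value: T): T = {
  val bytes = new ByteArrayOutputStream()
  val out = new ObjectOutputStream(bytes)
  out.writeObject(value)
  out.close()
  val in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
  in.readObject().asInstanceOf[T]
}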

Related

How to create the "last" rule in Drools retracting the event if no other rules fire?

I have two rules: one generates the alert message and appends it to the event's list-of-strings attribute (there will be multiple such rules in the future), and the other retracts the event if no alerts were generated.
I use salience to set the rules' priority (the "last" rule should literally fire last). But the rule order is wrong for some reason, so the alerts list is still empty and all the events are retracted. Please look at my samples below.
MainHello.scala
import org.json4s._
import org.json4s.jackson.Serialization.write
import org.kie.api.KieServices
import org.kie.api.builder.Message
import org.kie.internal.io.ResourceFactory
object MainHello {
implicit val formats = DefaultFormats
val events = List(
new Event(123L, null, null),
new Event(456L, null, null),
new Event(789L, null, null),
new Event(688723L, null, null),
new Event(12L, null, null),
new Event(13L, null, null))
def main(args: Array[String]): Unit = {
val kieService = KieServices.get()
val repository = kieService.getRepository
val kfs = kieService.newKieFileSystem
kfs.write(ResourceFactory.newClassPathResource("hello.drl", this.getClass))
val kb = kieService.newKieBuilder(kfs)
kb.buildAll
if (kb.getResults.hasMessages(Message.Level.ERROR)) {
throw new RuntimeException("Build Errors:\n" + kb.getResults.toString)
}
val kContainer = kieService.newKieContainer(repository.getDefaultReleaseId)
val session = kContainer.newKieSession()
events.foreach { item =>
println("INPUT OBJECT: " + write(item))
val facthandle = session.insert(item)
session.fireAllRules()
println("result object: " + write(session.getObject(facthandle)))
}
}
}
Event.java
import java.util.ArrayList;
import java.util.List;
public class Event implements java.io.Serializable {
static final long serialVersionUID = 1L;
private java.lang.Long event_id;
private java.lang.String alert;
private java.util.List<java.lang.String> alerts;
public Event() {
}
public java.lang.Long getEvent_id() {
return this.event_id;
}
public void setEvent_id(java.lang.Long event_id) {
this.event_id = event_id;
}
public java.lang.String getAlert() {
return this.alert;
}
public void setAlert(java.lang.String alert) {
if (alerts == null) {
alerts = new java.util.ArrayList<java.lang.String>();
}
alerts.add(alert);
this.alert = alert;
}
public java.util.List<java.lang.String> getAlerts() {
return this.alerts;
}
public void setAlerts(java.util.List<java.lang.String> alerts) {
this.alerts = alerts;
}
public Event(java.lang.Long event_id, java.lang.String alert,
java.util.List<java.lang.String> alerts) {
this.event_id = event_id;
this.alert = alert;
this.alerts = alerts;
}
}
Hello.drl
rule "alert"
when
e: ru.test.Event(event_id == 123)
then
modify(e) {
setAlert("alert")
}
end
rule "Retract" salience -1
when
e: ru.test.Event (alerts == null || alerts.empty == true)
then
retract(e);
end
console output
INPUT OBJECT: {"event_id":12,"alert":null,"alerts":null}
result object: null
INPUT OBJECT: {"event_id":13,"alert":null,"alerts":null}
result object: null
INPUT OBJECT: {"event_id":123,"alert":null,"alerts":null} // This object should not be retracted because "123" event_id generates alert
result object: null
As you can see, all the events become null.
Drools version is 7.8.0.Final.
What am I doing wrong? Thanks!

Convert byte array to Mat in JavaCV

I have raw JPEG bytes and want to decode them into a Mat, like this:
import org.bytedeco.javacpp.opencv_core.Mat
import org.bytedeco.javacpp.{opencv_core, opencv_imgcodecs}
val jpegRawBytes: Array[Byte] = ??? // this is given, not a problem
val matRaw = new Mat(1, jpegRawBytes.length, opencv_core.CV_8UC1)
??? // mat.put(jpegRawBytes)
val matImg = opencv_imgcodecs.imdecode(matRaw, opencv_imgcodecs.IMREAD_COLOR)
How would I do the second ??? step, putting the Array[Byte] into the Mat?
Here is one way, but it looks very inefficient, having to call put for each individual byte:
val idxRaw = matRaw.createIndexer[UByteRawIndexer]()
var i = 0
while (i < jpegRawBytes.length) {
idxRaw.put(0, i, jpegRawBytes(i))
i += 1
}
public static Mat toMat(BufferedImage bi) {
OpenCVFrameConverter.ToIplImage cv = new OpenCVFrameConverter.ToIplImage();
Java2DFrameConverter jcv = new Java2DFrameConverter();
return cv.convertToMat(jcv.convert(bi));
}
public static Mat toMat(byte[] bs) {
return toMat(toBufferedImage(bs));
}
public static BufferedImage toBufferedImage(byte[] bs) {
try {
return ImageIO.read(new ByteArrayInputStream(bs));
} catch (IOException ex) {
Logger.getLogger(CEIUtils.class.getName()).log(Level.SEVERE, null, ex);
}
return null;
}
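The same approach can be written in Scala. This is only a sketch that mirrors the Java code above; note that ImageIO.read already decodes the JPEG, so the imdecode step is no longer needed:
import java.io.ByteArrayInputStream
import javax.imageio.ImageIO
import org.bytedeco.javacpp.opencv_core.Mat
import org.bytedeco.javacv.{Java2DFrameConverter, OpenCVFrameConverter}

def toMat(bytes: Array[Byte]): Mat = {
  // decode the JPEG bytes into a BufferedImage, then convert via a Frame to a Mat
  val bi = ImageIO.read(new ByteArrayInputStream(bytes))
  val cv = new OpenCVFrameConverter.ToIplImage()
  val jcv = new Java2DFrameConverter()
  cv.convertToMat(jcv.convert(bi))
}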

Akka-Stream implementation slower than single threaded implementation

UPDATE FROM 2015-10-30
Based on Roland Kuhn's answer:
Akka Streams is using asynchronous message passing between Actors to
implement stream processing stages. Passing data across an
asynchronous boundary has an overhead that you are seeing here: your
computation seems to take only about 160ns (derived from the
single-threaded measurement) while the streaming solution takes
roughly 1µs per element, which is dominated by the message passing.
Another misconception is that saying “stream” implies parallelism: in
your code all computation runs sequentially in a single Actor (the map
stage), so no benefit can be expected over the primitive
single-threaded solution.
In order to benefit from the parallelism afforded by Akka Streams you
need to have multiple processing stages that each perform tasks of
>1µs per element, see also the docs.
I made some changes. My code now looks like this:
object MultiThread {
implicit val actorSystem = ActorSystem("Sys")
implicit val materializer = ActorMaterializer()
var counter = 0
var oldProgess = 0
//RunnableFlow: in -> flow -> sink
val in = Source(() => Iterator.continually((1254785478l, "name", 48, 23.09f)))
val flow = Flow[(Long, String, Int, Float)].map(p => SharedFunctions.transform2(SharedFunctions.transform(p)))
val tupleToEvent = Flow[(Long, String, Int, Float)].map(SharedFunctions.transform)
val eventToFactorial = Flow[Event].map(SharedFunctions.transform2)
val eventChef: Flow[(Long, String, Int, Float), Int, Unit] = Flow() { implicit builder =>
import FlowGraph.Implicits._
val dispatchTuple = builder.add(Balance[(Long, String, Int, Float)](4))
val mergeEvents = builder.add(Merge[Int](4))
dispatchTuple.out(0) ~> tupleToEvent ~> eventToFactorial ~> mergeEvents.in(0)
dispatchTuple.out(1) ~> tupleToEvent ~> eventToFactorial ~> mergeEvents.in(1)
dispatchTuple.out(2) ~> tupleToEvent ~> eventToFactorial ~> mergeEvents.in(2)
dispatchTuple.out(3) ~> tupleToEvent ~> eventToFactorial ~> mergeEvents.in(3)
(dispatchTuple.in, mergeEvents.out)
}
val sink = Sink.foreach[Int]{
v => counter += 1
oldProgess = SharedFunctions.printProgress(oldProgess, SharedFunctions.maxEventCount, counter,
DateTime.now.getMillis - SharedFunctions.startTime.getMillis)
if(counter == SharedFunctions.maxEventCount) endAkka()
}
def endAkka() = {
val duration = new Duration(SharedFunctions.startTime, DateTime.now)
println("Time: " + duration.getMillis + " || Data: " + counter)
actorSystem.shutdown
actorSystem.awaitTermination
System.exit(-1)
}
def main(args: Array[String]) {
println("MultiThread started: " + SharedFunctions.startTime)
in.via(flow).runWith(sink)
// in.via(eventChef).runWith(sink)
}
}
I'm not sure if I'm getting something totally wrong, but my implementation with akka-streams is still much slower (now even slower than before). What I did find out is: if I increase the work, for example by doing some division, the akka-streams implementation gets faster. So if I understand correctly (correct me otherwise), there seems to be too much overhead in my example. Do you only get a benefit from akka-streams if the code has to do heavy work?
I'm relatively new to both Scala and akka-stream. I wrote a little test project which creates events until a counter has reached a specific number. For each event, the factorial of one field of the event is computed. I implemented this twice: once with akka-stream and once without (single-threaded), and compared the runtimes.
I didn't expect this: when I create a single event, the runtimes of both programs are nearly the same. But if I create 70,000,000 events, the implementation without akka-streams is much faster. Here are my results (the following data is based on 24 measurements):
Single event without akka-streams: 403 (+- 2)ms
Single event with akka-streams: 444 (+-13)ms
70Mio events without akka-streams: 11778 (+-70)ms
70Mio events with akka-streams: 75424 (+-2959)ms
So my question is: what is going on? Why is my implementation with akka-stream slower?
Here is my code:
Implementation with Akka
object MultiThread {
implicit val actorSystem = ActorSystem("Sys")
implicit val materializer = ActorMaterializer()
var counter = 0
var oldProgess = 0
//RunnableFlow: in -> flow -> sink
val in = Source(() => Iterator.continually((1254785478l, "name", 48, 23.09f)))
val flow = Flow[(Long, String, Int, Float)].map(p => SharedFunctions.transform2(SharedFunctions.transform(p)))
val sink = Sink.foreach[Int]{
v => counter += 1
oldProgess = SharedFunctions.printProgress(oldProgess, SharedFunctions.maxEventCount, counter,
DateTime.now.getMillis - SharedFunctions.startTime.getMillis)
if(counter == SharedFunctions.maxEventCount) endAkka()
}
def endAkka() = {
val duration = new Duration(SharedFunctions.startTime, DateTime.now)
println("Time: " + duration.getMillis + " || Data: " + counter)
actorSystem.shutdown
actorSystem.awaitTermination
System.exit(-1)
}
def main(args: Array[String]) {
import scala.concurrent.ExecutionContext.Implicits.global
println("MultiThread started: " + SharedFunctions.startTime)
in.via(flow).runWith(sink).onComplete(_ => endAkka())
}
}
Implementation without Akka
object SingleThread {
def main(args: Array[String]) {
println("SingleThread started at: " + SharedFunctions.startTime)
println("0%")
val i = createEvent(0)
val duration = new Duration(SharedFunctions.startTime, DateTime.now());
println("Time: " + duration.getMillis + " || Data: " + i)
}
def createEventWorker(oldProgress: Int, count: Int, randDate: Long, name: String, age: Int, myFloat: Float): Int = {
if (count == SharedFunctions.maxEventCount) count
else {
val e = SharedFunctions.transform((randDate, name, age, myFloat))
SharedFunctions.transform2(e)
val p = SharedFunctions.printProgress(oldProgress, SharedFunctions.maxEventCount, count,
DateTime.now.getMillis - SharedFunctions.startTime.getMillis)
createEventWorker(p, count + 1, 1254785478l, "name", 48, 23.09f)
}
}
def createEvent(count: Int): Int = {
createEventWorker(0, count, 1254785478l, "name", 48, 23.09f)
}
}
SharedFunctions
object SharedFunctions {
val maxEventCount = 70000000
val startTime = DateTime.now
def transform(t : (Long, String, Int, Float)) : Event = new Event(t._1 ,t._2,t._3,t._4)
def transform2(e : Event) : Int = factorial(e.getAgeYrs)
def calculatePercentage(totalValue: Long, currentValue: Long) = Math.round((currentValue * 100) / totalValue)
def printProgress(oldProgress : Int, fileSize: Long, currentSize: Int, t: Long) = {
val cProgress = calculatePercentage(fileSize, currentSize)
if (oldProgress != cProgress) println(s"$oldProgress% | $t ms")
cProgress
}
private def factorialWorker(n1: Int, n2: Int): Int = {
if (n1 == 0) n2
else factorialWorker(n1 -1, n2*n1)
}
def factorial (n : Int): Int = {
factorialWorker(n, 1)
}
}
Implementation Event
/**
* Autogenerated by Avro
*
* DO NOT EDIT DIRECTLY
*/
#SuppressWarnings("all")
#org.apache.avro.specific.AvroGenerated
public class Event extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Event\",\"namespace\":\"week2P2\",\"fields\":[{\"name\":\"timestampMS\",\"type\":\"long\"},{\"name\":\"name\",\"type\":\"string\"},{\"name\":\"ageYrs\",\"type\":\"int\"},{\"name\":\"sizeCm\",\"type\":\"float\"}]}");
public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; }
@Deprecated public long timestampMS;
@Deprecated public CharSequence name;
@Deprecated public int ageYrs;
@Deprecated public float sizeCm;
/**
* Default constructor. Note that this does not initialize fields
* to their default values from the schema. If that is desired then
* one should use <code>newBuilder()</code>.
*/
public Event() {}
/**
* All-args constructor.
*/
public Event(Long timestampMS, CharSequence name, Integer ageYrs, Float sizeCm) {
this.timestampMS = timestampMS;
this.name = name;
this.ageYrs = ageYrs;
this.sizeCm = sizeCm;
}
public org.apache.avro.Schema getSchema() { return SCHEMA$; }
// Used by DatumWriter. Applications should not call.
public Object get(int field$) {
switch (field$) {
case 0: return timestampMS;
case 1: return name;
case 2: return ageYrs;
case 3: return sizeCm;
default: throw new org.apache.avro.AvroRuntimeException("Bad index");
}
}
// Used by DatumReader. Applications should not call.
@SuppressWarnings(value="unchecked")
public void put(int field$, Object value$) {
switch (field$) {
case 0: timestampMS = (Long)value$; break;
case 1: name = (CharSequence)value$; break;
case 2: ageYrs = (Integer)value$; break;
case 3: sizeCm = (Float)value$; break;
default: throw new org.apache.avro.AvroRuntimeException("Bad index");
}
}
/**
* Gets the value of the 'timestampMS' field.
*/
public Long getTimestampMS() {
return timestampMS;
}
/**
* Sets the value of the 'timestampMS' field.
* @param value the value to set.
*/
public void setTimestampMS(Long value) {
this.timestampMS = value;
}
/**
* Gets the value of the 'name' field.
*/
public CharSequence getName() {
return name;
}
/**
* Sets the value of the 'name' field.
* @param value the value to set.
*/
public void setName(CharSequence value) {
this.name = value;
}
/**
* Gets the value of the 'ageYrs' field.
*/
public Integer getAgeYrs() {
return ageYrs;
}
/**
* Sets the value of the 'ageYrs' field.
* @param value the value to set.
*/
public void setAgeYrs(Integer value) {
this.ageYrs = value;
}
/**
* Gets the value of the 'sizeCm' field.
*/
public Float getSizeCm() {
return sizeCm;
}
/**
* Sets the value of the 'sizeCm' field.
* @param value the value to set.
*/
public void setSizeCm(Float value) {
this.sizeCm = value;
}
/** Creates a new Event RecordBuilder */
public static Event.Builder newBuilder() {
return new Event.Builder();
}
/** Creates a new Event RecordBuilder by copying an existing Builder */
public static Event.Builder newBuilder(Event.Builder other) {
return new Event.Builder(other);
}
/** Creates a new Event RecordBuilder by copying an existing Event instance */
public static Event.Builder newBuilder(Event other) {
return new Event.Builder(other);
}
/**
* RecordBuilder for Event instances.
*/
public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase<Event>
implements org.apache.avro.data.RecordBuilder<Event> {
private long timestampMS;
private CharSequence name;
private int ageYrs;
private float sizeCm;
/** Creates a new Builder */
private Builder() {
super(Event.SCHEMA$);
}
/** Creates a Builder by copying an existing Builder */
private Builder(Event.Builder other) {
super(other);
if (isValidValue(fields()[0], other.timestampMS)) {
this.timestampMS = data().deepCopy(fields()[0].schema(), other.timestampMS);
fieldSetFlags()[0] = true;
}
if (isValidValue(fields()[1], other.name)) {
this.name = data().deepCopy(fields()[1].schema(), other.name);
fieldSetFlags()[1] = true;
}
if (isValidValue(fields()[2], other.ageYrs)) {
this.ageYrs = data().deepCopy(fields()[2].schema(), other.ageYrs);
fieldSetFlags()[2] = true;
}
if (isValidValue(fields()[3], other.sizeCm)) {
this.sizeCm = data().deepCopy(fields()[3].schema(), other.sizeCm);
fieldSetFlags()[3] = true;
}
}
/** Creates a Builder by copying an existing Event instance */
private Builder(Event other) {
super(Event.SCHEMA$);
if (isValidValue(fields()[0], other.timestampMS)) {
this.timestampMS = data().deepCopy(fields()[0].schema(), other.timestampMS);
fieldSetFlags()[0] = true;
}
if (isValidValue(fields()[1], other.name)) {
this.name = data().deepCopy(fields()[1].schema(), other.name);
fieldSetFlags()[1] = true;
}
if (isValidValue(fields()[2], other.ageYrs)) {
this.ageYrs = data().deepCopy(fields()[2].schema(), other.ageYrs);
fieldSetFlags()[2] = true;
}
if (isValidValue(fields()[3], other.sizeCm)) {
this.sizeCm = data().deepCopy(fields()[3].schema(), other.sizeCm);
fieldSetFlags()[3] = true;
}
}
/** Gets the value of the 'timestampMS' field */
public Long getTimestampMS() {
return timestampMS;
}
/** Sets the value of the 'timestampMS' field */
public Event.Builder setTimestampMS(long value) {
validate(fields()[0], value);
this.timestampMS = value;
fieldSetFlags()[0] = true;
return this;
}
/** Checks whether the 'timestampMS' field has been set */
public boolean hasTimestampMS() {
return fieldSetFlags()[0];
}
/** Clears the value of the 'timestampMS' field */
public Event.Builder clearTimestampMS() {
fieldSetFlags()[0] = false;
return this;
}
/** Gets the value of the 'name' field */
public CharSequence getName() {
return name;
}
/** Sets the value of the 'name' field */
public Event.Builder setName(CharSequence value) {
validate(fields()[1], value);
this.name = value;
fieldSetFlags()[1] = true;
return this;
}
/** Checks whether the 'name' field has been set */
public boolean hasName() {
return fieldSetFlags()[1];
}
/** Clears the value of the 'name' field */
public Event.Builder clearName() {
name = null;
fieldSetFlags()[1] = false;
return this;
}
/** Gets the value of the 'ageYrs' field */
public Integer getAgeYrs() {
return ageYrs;
}
/** Sets the value of the 'ageYrs' field */
public Event.Builder setAgeYrs(int value) {
validate(fields()[2], value);
this.ageYrs = value;
fieldSetFlags()[2] = true;
return this;
}
/** Checks whether the 'ageYrs' field has been set */
public boolean hasAgeYrs() {
return fieldSetFlags()[2];
}
/** Clears the value of the 'ageYrs' field */
public Event.Builder clearAgeYrs() {
fieldSetFlags()[2] = false;
return this;
}
/** Gets the value of the 'sizeCm' field */
public Float getSizeCm() {
return sizeCm;
}
/** Sets the value of the 'sizeCm' field */
public Event.Builder setSizeCm(float value) {
validate(fields()[3], value);
this.sizeCm = value;
fieldSetFlags()[3] = true;
return this;
}
/** Checks whether the 'sizeCm' field has been set */
public boolean hasSizeCm() {
return fieldSetFlags()[3];
}
/** Clears the value of the 'sizeCm' field */
public Event.Builder clearSizeCm() {
fieldSetFlags()[3] = false;
return this;
}
@Override
public Event build() {
try {
Event record = new Event();
record.timestampMS = fieldSetFlags()[0] ? this.timestampMS : (Long) defaultValue(fields()[0]);
record.name = fieldSetFlags()[1] ? this.name : (CharSequence) defaultValue(fields()[1]);
record.ageYrs = fieldSetFlags()[2] ? this.ageYrs : (Integer) defaultValue(fields()[2]);
record.sizeCm = fieldSetFlags()[3] ? this.sizeCm : (Float) defaultValue(fields()[3]);
return record;
} catch (Exception e) {
throw new org.apache.avro.AvroRuntimeException(e);
}
}
}
}
Akka Streams is using asynchronous message passing between Actors to implement stream processing stages. Passing data across an asynchronous boundary has an overhead that you are seeing here: your computation seems to take only about 160ns (derived from the single-threaded measurement) while the streaming solution takes roughly 1µs per element, which is dominated by the message passing.
Another misconception is that saying “stream” implies parallelism: in your code all computation runs sequentially in a single Actor (the map stage), so no benefit can be expected over the primitive single-threaded solution.
In order to benefit from the parallelism afforded by Akka Streams you need to have multiple processing stages that each perform tasks of >1µs per element, see also the docs.
In addition to Roland's explanation, which I fully agree with, it should be understood that Akka Streams is not just a concurrent programming framework. Streams also provide back pressure, which means events are only generated by the Source when there is demand to process them in the Sink.
Therefore your single-thread and multi-thread comparison is not "apples-to-apples".
If you want raw multi-threaded execution performance then Futures/Actors are a better way to go.
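For illustration, here is a rough sketch of the Futures route (not the author's code; it reuses the SharedFunctions object from above and simply splits the maxEventCount iterations across the available cores):
import scala.concurrent.{Await, Future}
import scala.concurrent.duration.Duration
import scala.concurrent.ExecutionContext.Implicits.global

object FuturesThread {
  def main(args: Array[String]): Unit = {
    val workers = Runtime.getRuntime.availableProcessors
    val perWorker = SharedFunctions.maxEventCount / workers
    val futures = (0 until workers).map { _ =>
      Future {
        var i = 0
        var acc = 0
        while (i < perWorker) {
          // same per-element work as the stream: tuple -> Event -> factorial
          acc += SharedFunctions.transform2(SharedFunctions.transform((1254785478L, "name", 48, 23.09f)))
          i += 1
        }
        acc
      }
    }
    val checksum = Await.result(Future.sequence(futures), Duration.Inf).sum
    println("Done, checksum: " + checksum)
  }
}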

zipWithIndex on Apache Flink

I'd like to assign each row of my input an id - which should be a number from 0 to N - 1, where N is the number of rows in the input.
Roughly, I'd like to be able to do something like the following:
val data = sc.textFile(textFilePath, numPartitions)
val rdd = data.map(line => process(line))
val rddMatrixLike = rdd.zipWithIndex.map { case (v, idx) => someStuffWithIndex(idx, v) }
But in Apache Flink. Is it possible?
This is now a part of the 0.10-SNAPSHOT release of Apache Flink. Examples for zipWithIndex(in) and zipWithUniqueId(in) are available in the official Flink documentation.
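With the Scala DataSet API the usage might look roughly like this (a sketch assuming Flink 0.10+, where the utils import adds zipWithIndex to DataSet; textFilePath and someStuffWithIndex are the placeholders from the question):
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.utils._

val env = ExecutionEnvironment.getExecutionEnvironment
val data: DataSet[String] = env.readTextFile(textFilePath)
// zipWithIndex yields (index, element) pairs with a consecutive 0..N-1 index
val withIndex: DataSet[(Long, String)] = data.zipWithIndex
val result = withIndex.map { case (idx, v) => someStuffWithIndex(idx, v) }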
Here is a simple implementation of the function:
public class ZipWithIndex {
public static void main(String[] args) throws Exception {
ExecutionEnvironment ee = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> in = ee.readTextFile("/home/robert/flink-workdir/debug/input");
// count elements in each partition
DataSet<Tuple2<Integer, Long>> counts = in.mapPartition(new RichMapPartitionFunction<String, Tuple2<Integer, Long>>() {
@Override
public void mapPartition(Iterable<String> values, Collector<Tuple2<Integer, Long>> out) throws Exception {
long cnt = 0;
for (String v : values) {
cnt++;
}
out.collect(new Tuple2<Integer, Long>(getRuntimeContext().getIndexOfThisSubtask(), cnt));
}
});
DataSet<Tuple2<Long, String>> result = in.mapPartition(new RichMapPartitionFunction<String, Tuple2<Long, String>>() {
long start = 0;
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
List<Tuple2<Integer, Long>> offsets = getRuntimeContext().getBroadcastVariable("counts");
Collections.sort(offsets, new Comparator<Tuple2<Integer, Long>>() {
@Override
public int compare(Tuple2<Integer, Long> o1, Tuple2<Integer, Long> o2) {
return ZipWithIndex.compare(o1.f0, o2.f0);
}
});
for(int i = 0; i < getRuntimeContext().getIndexOfThisSubtask(); i++) {
start += offsets.get(i).f1;
}
}
@Override
public void mapPartition(Iterable<String> values, Collector<Tuple2<Long, String>> out) throws Exception {
for(String v: values) {
out.collect(new Tuple2<Long, String>(start++, v));
}
}
}).withBroadcastSet(counts, "counts");
result.print();
}
public static int compare(int x, int y) {
return (x < y) ? -1 : ((x == y) ? 0 : 1);
}
}
This is how it works: I'm using the first mapPartition() operation to go over all elements of each partition and count how many elements it contains.
I need to know the number of elements in each partition to set the offsets properly when assigning the IDs to the elements.
The result of the first mapPartition is a DataSet of (subtask index, element count) pairs. I'm broadcasting this DataSet to all of the second mapPartition() operators, which assign the IDs to the elements of the input.
In the open() method of the second mapPartition() I'm computing the offset for each partition.
I'm probably going to contribute the code to Flink (after discussing it with the other committers).

How to process multi line input records in Spark

I have each record spread across multiple lines in the input file (a very large file).
Ex:
Id: 2
ASIN: 0738700123
title: Test tile for this product
group: Book
salesrank: 168501
similar: 5 0738700811 1567184912 1567182813 0738700514 0738700915
categories: 2
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Wicca[12484]
|Books[283155]|Subjects[1000]|Religion & Spirituality[22]|Earth-Based Religions[12472]|Witchcraft[12486]
reviews: total: 12 downloaded: 12 avg rating: 4.5
2001-12-16 cutomer: A11NCO6YTE4BTJ rating: 5 votes: 5 helpful: 4
2002-1-7 cutomer: A9CQ3PLRNIR83 rating: 4 votes: 5 helpful: 5
How can I identify and process each multi-line record in Spark?
If the multi-line data has a defined record separator, you could use the Hadoop support for custom record delimiters, providing the separator through a Hadoop Configuration object.
Something like this should do:
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
val conf = new Configuration
conf.set("textinputformat.record.delimiter", "id:")
val dataset = sc.newAPIHadoopFile("/path/to/data", classOf[TextInputFormat], classOf[LongWritable], classOf[Text], conf)
val data = dataset.map(x=>x._2.toString)
This will provide you with an RDD[String] where each element corresponds to a record. Afterwards you need to parse each record according to your application's requirements.
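For example, a rough parsing sketch for the sample layout above; the field names, and the handling of the Id line (which is consumed by the "id:" delimiter), are assumptions to adapt:
val parsed = data
  .filter(_.trim.nonEmpty)
  .map { record =>
    // split the record into lines and keep simple "key: value" pairs
    record.split("\n")
      .map(_.trim)
      .filter(_.contains(":"))
      .map { line =>
        val Array(key, value) = line.split(":", 2)
        key.trim -> value.trim
      }
      .toMap
  }
// e.g. pick out some fields of interest
val summaries = parsed.map(m => (m.get("ASIN"), m.get("title"), m.get("group")))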
I have done this by implementing a custom input format and record reader.
public class ParagraphInputFormat extends TextInputFormat {
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) {
return new ParagraphRecordReader();
}
}
public class ParagraphRecordReader extends RecordReader<LongWritable, Text> {
private long end;
private boolean stillInChunk = true;
private LongWritable key = new LongWritable();
private Text value = new Text();
private FSDataInputStream fsin;
private DataOutputBuffer buffer = new DataOutputBuffer();
private byte[] endTag = "\n\r\n".getBytes();
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
FileSplit split = (FileSplit) inputSplit;
Configuration conf = taskAttemptContext.getConfiguration();
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
fsin = fs.open(path);
long start = split.getStart();
end = split.getStart() + split.getLength();
fsin.seek(start);
if (start != 0) {
readUntilMatch(endTag, false);
}
}
public boolean nextKeyValue() throws IOException {
if (!stillInChunk) return false;
boolean status = readUntilMatch(endTag, true);
value = new Text();
value.set(buffer.getData(), 0, buffer.getLength());
key = new LongWritable(fsin.getPos());
buffer.reset();
if (!status) {
stillInChunk = false;
}
return true;
}
public LongWritable getCurrentKey() throws IOException, InterruptedException {
return key;
}
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
public float getProgress() throws IOException, InterruptedException {
return 0;
}
public void close() throws IOException {
fsin.close();
}
private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
int i = 0;
while (true) {
int b = fsin.read();
if (b == -1) return false;
if (withinBlock) buffer.write(b);
if (b == match[i]) {
i++;
if (i >= match.length) {
return fsin.getPos() < end;
}
} else i = 0;
}
}
}
endTag identifies the end of each record.
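If you go this route, the custom format can be plugged into Spark the same way as the TextInputFormat above (a sketch; package and class names are whatever you used for the classes):
import org.apache.hadoop.io.{LongWritable, Text}

val rdd = sc.newAPIHadoopFile(
  "/path/to/data",
  classOf[ParagraphInputFormat],
  classOf[LongWritable],
  classOf[Text],
  sc.hadoopConfiguration)
// keep only the record text; each element is one multi-line record
val records = rdd.map { case (_, text) => text.toString }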