With previous versions of Akka Streams, groupBy returned a Source of Sources that could be materialized into a Source[Seq[A]].
With Akka Streams 2.4 I see that groupBy returns a SubFlow, and it's not clear to me how to use this. The transformations I need to apply to the flow require the whole Seq to be available, so I can't just map over the SubFlow (I think).
I've written a class that extends GraphStage and does the aggregation via a mutable collection in the GraphStageLogic, but is there built-in functionality for this? Am I missing the point of SubFlow?
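For what it's worth, SubFlow does expose aggregations like fold directly; a hedged sketch (whether it fits my transformations is another matter, since fold only emits when a substream completes, which never happens on an infinite stream):

import akka.NotUsed
import akka.stream.scaladsl.Source

// Sketch: collect each substream into a Seq, then merge the substreams back.
// Assumes the number of distinct keys is bounded by maxSubstreams.
val grouped: Source[Seq[Int], NotUsed] =
  Source(1 to 10)
    .groupBy(maxSubstreams = 2, _ % 2)
    .fold(Seq.empty[Int])(_ :+ _)
    .mergeSubstreams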
I ended up writing a GraphStage:
import akka.stream.{Attributes, FlowShape, Inlet, Outlet}
import akka.stream.stage.{GraphStage, GraphStageLogic, InHandler, OutHandler}

class FlowAggregation[A, B](f: A => B) extends GraphStage[FlowShape[A, Seq[A]]] {
  val in: Inlet[A] = Inlet("in")
  val out: Outlet[Seq[A]] = Outlet("out")
  override val shape = FlowShape.of(in, out)

  override def createLogic(inheritedAttributes: Attributes): GraphStageLogic =
    new GraphStageLogic(shape) {
      // Key of the group currently being accumulated, if any.
      private var counter: Option[B] = None
      private var aggregate = scala.collection.mutable.ArrayBuffer.empty[A]

      setHandler(in, new InHandler {
        override def onPush(): Unit = {
          val element = grab(in)
          counter.fold {
            // First element: open the first group.
            counter = Some(f(element))
            aggregate += element
            pull(in)
          } { p =>
            if (f(element) == p) {
              // Same key: keep accumulating.
              aggregate += element
              pull(in)
            } else {
              // Key changed: emit the finished group and start a new one.
              push(out, aggregate)
              counter = Some(f(element))
              aggregate = scala.collection.mutable.ArrayBuffer(element)
            }
          }
        }

        override def onUpstreamFinish(): Unit = {
          // Flush the last, still-open group before completing.
          emit(out, aggregate)
          complete(out)
        }
      })

      setHandler(out, new OutHandler {
        override def onPull(): Unit = pull(in)
      })
    }
}
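Here is a sketch of how the stage can be wired into a stream; the key function and the sample input are made up purely for illustration:

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.{Sink, Source}

implicit val system = ActorSystem("example")
implicit val materializer = ActorMaterializer()

// Groups consecutive elements that share the same first letter.
Source(List("apple", "avocado", "banana", "blueberry", "cherry"))
  .via(new FlowAggregation[String, Char](_.head))
  .runWith(Sink.foreach(println))
// Expected output, one group per line:
// ArrayBuffer(apple, avocado)
// ArrayBuffer(banana, blueberry)
// ArrayBuffer(cherry)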
I would like to have a Source.queue (or something analogous to it, for pushing items into a materialized graph) that is instrumented to tell me the current saturation level of the queue.
I'd like to do that without (re-)implementing the functionality provided by the QueueSource graph stage.
One possible solution I came up with is the following:
import java.util.concurrent.atomic.AtomicLong

import scala.concurrent.{ExecutionContext, Future}

import akka.Done
import akka.stream.{Materializer, OverflowStrategy, QueueOfferResult}
import akka.stream.scaladsl.{Source, SourceQueueWithComplete}
import com.codahale.metrics.RatioGauge

object InstrumentedSource {

  final class InstrumentedSourceQueueWithComplete[T](
      delegate: SourceQueueWithComplete[T],
      bufferSize: Int,
  )(implicit executionContext: ExecutionContext)
      extends SourceQueueWithComplete[T] {

    override def complete(): Unit = delegate.complete()
    override def fail(ex: Throwable): Unit = delegate.fail(ex)
    override def watchCompletion(): Future[Done] = delegate.watchCompletion()

    // Approximate number of elements currently sitting in the queue.
    private val buffered = new AtomicLong(0)

    private[InstrumentedSource] def onDequeue(): Unit = {
      val _ = buffered.decrementAndGet()
    }

    object BufferSaturationRatioGauge extends RatioGauge {
      override def getRatio: RatioGauge.Ratio = RatioGauge.Ratio.of(buffered.get(), bufferSize)
    }

    lazy val bufferSaturationGauge: RatioGauge = BufferSaturationRatioGauge

    override def offer(elem: T): Future[QueueOfferResult] = {
      val result = delegate.offer(elem)
      // Only successfully enqueued elements count towards saturation.
      result.foreach {
        case QueueOfferResult.Enqueued =>
          val _ = buffered.incrementAndGet()
        case _ => // do nothing
      }
      result
    }
  }

  def queue[T](bufferSize: Int, overflowStrategy: OverflowStrategy)(
      implicit executionContext: ExecutionContext,
      materializer: Materializer,
  ): Source[T, InstrumentedSourceQueueWithComplete[T]] = {
    // Pre-materialize the underlying queue so it can be wrapped, and
    // decrement the counter as elements leave the source.
    val (queue, source) = Source.queue[T](bufferSize, overflowStrategy).preMaterialize()
    val instrumentedQueue = new InstrumentedSourceQueueWithComplete[T](queue, bufferSize)
    source.mapMaterializedValue(_ => instrumentedQueue).map { item =>
      instrumentedQueue.onDequeue()
      item
    }
  }
}
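For illustration, a hedged usage sketch; registering the gauge with a Dropwizard MetricRegistry named registry is an assumption, not part of the code above:

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.Sink

implicit val system = ActorSystem("queues")
implicit val ec: ExecutionContext = system.dispatcher
implicit val mat: Materializer = ActorMaterializer()

// Materializing keeps the left (queue) value, so `queue` is the
// instrumented wrapper defined above.
val queue =
  InstrumentedSource
    .queue[Int](bufferSize = 100, OverflowStrategy.backpressure)
    .to(Sink.foreach(println))
    .run()

queue.offer(42)
// registry.register("queue.saturation", queue.bufferSaturationGauge)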
This mostly appears to work from some manual testing (apart from the fact that buffered is at most eventually consistent with the actual number of items in the queue, which should be fine in my case), but I was wondering whether there are solutions that make better use of built-in functionality that I might have missed.
I am trying to implement a custom Transformer in Flink, following the indications in its documentation, but when I try to execute it, the fit operation never seems to be called. Here is what I've done so far:
class InfoGainTransformer extends Transformer[InfoGainTransformer] {

  import InfoGainTransformer._

  // Note: this must be `private`, not `private[this]`, so the FitOperation
  // instances in the companion object can write to it, and it holds the
  // DataSet produced by fit (see the corrected FitOperation below).
  private var counts: Option[DataSet[Map[Key, Double]]] = None

  // here setters for params, as Flink does
}
object InfoGainTransformer {

  // ====================================== Parameters =============================================
  // ...

  // ==================================== Factory methods ==========================================
  // ...

  // ========================================== Operations =========================================

  implicit def fitLabeledVectorInfoGain = new FitOperation[InfoGainTransformer, LabeledVector] {
    override def fit(instance: InfoGainTransformer, fitParameters: ParameterMap, input: DataSet[LabeledVector]): Unit = {
      val counts = collection.immutable.Vector[Map[Key, Double]]()
      input.map {
        v =>
          v.vector.map {
            case (i, value) =>
              println("INSIDE!!!")
              val key = Key(value, v.label)
              val cval = counts(i).getOrElse(key, .0)
              counts(i) + (key -> cval)
          }
      }
    }
  }

  implicit def fitVectorInfoGain[T <: Vector] = new FitOperation[InfoGainTransformer, T] {
    override def fit(instance: InfoGainTransformer, fitParameters: ParameterMap, input: DataSet[T]): Unit = {
      input
    }
  }

  implicit def transformLabeledVectorsInfoGain = {
    new TransformDataSetOperation[InfoGainTransformer, LabeledVector, LabeledVector] {
      override def transformDataSet(
          instance: InfoGainTransformer,
          transformParameters: ParameterMap,
          input: DataSet[LabeledVector]): DataSet[LabeledVector] = input
    }
  }

  implicit def transformVectorsInfoGain[T <: Vector : BreezeVectorConverter : TypeInformation : ClassTag] = {
    new TransformDataSetOperation[InfoGainTransformer, T, T] {
      override def transformDataSet(instance: InfoGainTransformer, transformParameters: ParameterMap, input: DataSet[T]): DataSet[T] = input
    }
  }
}
Then I tried to use it in two ways:
val scaler = StandardScaler()
val polyFeatures = PolynomialFeatures()
val mlr = MultipleLinearRegression()
val gain = InfoGainTransformer().setK(2)

// Construct the pipeline
val pipeline = scaler
  .chainTransformer(polyFeatures)
  .chainTransformer(gain)
  .chainPredictor(mlr)

val r = pipeline.predict(dataSet map (_.vector))
r.print()
And using only my transformer:
pipeline.fit(dataSet)
In both cases, when I set a breakpoint inside fitLabeledVectorInfoGain, for example on the line input.map, the debugger stops there, but if I also set a breakpoint inside the nested map, for example below println("INSIDE!!!"), it never stops there.
Does anyone know how I could debug this custom transformer?
It seems it's working now. I think what was happening was that I wasn't implementing the FitOperation correctly, because nothing was being saved in the instance state. This is the implementation now:
implicit def fitLabeledVectorInfoGain = new FitOperation[InfoGainTransformer, LabeledVector] {
  override def fit(instance: InfoGainTransformer, fitParameters: ParameterMap, input: DataSet[LabeledVector]): Unit = {
    // val counts = collection.immutable.Vector[Map[Key, Double]]()
    val r = input.map {
      v =>
        v.vector.foldLeft(Map.empty[Key, Double]) {
          case (m, (i, value)) =>
            println("INSIDE fit!!!")
            val key = Key(value, v.label)
            val cval = m.getOrElse(key, .0) + 1.0
            m + (key -> cval)
        }
    }
    instance.counts = Some(r)
  }
}
Now the debugger hits all the breakpoints correctly, and the TransformOperation is also being called.
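One thing worth noting (my reading, not something stated above): Flink's DataSet transformations are lazy, so the lambda passed to input.map only runs when the job is actually executed, not when the transformation graph is built, which would explain why the breakpoint inside the nested map never fired before anything forced execution. Assuming access to the resulting DataSet (called r above), one way to force evaluation while debugging is:

// Debugging aid only: collect() triggers execution of the dataflow,
// so the map lambda (and its println/breakpoints) actually runs.
// It pulls the whole DataSet onto the client, so keep inputs small.
val materializedCounts: Seq[Map[Key, Double]] = r.collect()
println(materializedCounts.take(5))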
I am trying to test my sliding window stage using the Akka Streams TestKit, and I see this exception:
Exception in thread "main" java.lang.AssertionError: assertion failed: expected OnNext(Stream(2, ?)), found OnError(java.lang.IllegalArgumentException: Cannot push port (Sliding.out(2043106095)) twice, or before it being pulled
Akka, Akka Streams, Akka Streams TestKit version: 2.5.9
Scala version: 2.12.4
import scala.collection.immutable
import scala.concurrent.duration.Duration

import akka.stream.{Attributes, FlowShape, Inlet, Outlet}
import akka.stream.stage.{GraphStage, GraphStageLogic, InHandler, OutHandler}

case class Sliding[T](duration: Duration, step: Duration, f: T => Long) extends GraphStage[FlowShape[T, immutable.Seq[T]]] {
  val in = Inlet[T]("Sliding.in")
  val out = Outlet[immutable.Seq[T]]("Sliding.out")
  override val shape: FlowShape[T, immutable.Seq[T]] = FlowShape(in, out)
  override protected val initialAttributes: Attributes = Attributes.name("sliding")

  override def createLogic(inheritedAttributes: Attributes): GraphStageLogic =
    new GraphStageLogic(shape) with InHandler with OutHandler {
      private var buf = Vector.empty[T]
      var watermark = 0L
      var dropUntilDuration = step.toMillis

      // A window is done when the current element's timestamp is at least
      // `duration` past the timestamp of the oldest buffered element.
      private def isWindowDone(current: T) = {
        if (buf.nonEmpty) {
          val hts = f(buf.head)
          val cts = f(current)
          cts >= hts + duration.toMillis
        } else false
      }

      override def onPush(): Unit = {
        val data = grab(in)
        val timeStamp = f(data)
        if (timeStamp > watermark) {
          watermark = timeStamp
          if (isWindowDone(data)) {
            push(out, buf)
            // Slide the window forward by dropping elements older than the step.
            buf = buf.dropWhile { x =>
              val ts = f(x)
              ts < dropUntilDuration
            }
            dropUntilDuration = dropUntilDuration + step.toMillis
          }
          buf :+= data
          pull(in)
        } else {
          // Late element: ignore it and keep pulling.
          pull(in)
        }
      }

      override def onPull(): Unit = pull(in)

      override def onUpstreamFinish(): Unit = {
        if (buf.nonEmpty) {
          push(out, buf)
        }
        completeStage()
      }

      this.setHandlers(in, out, this)
    }
}
Test code:
import scala.concurrent.duration._

import akka.actor.ActorSystem
import akka.stream.ActorMaterializer
import akka.stream.scaladsl.Source
import akka.stream.testkit.scaladsl.TestSink

object WindowTest extends App {
  implicit val as = ActorSystem("WindowTest")
  implicit val m = ActorMaterializer()

  val expectedResultIterator = Stream.from(1).map(_.toLong)
  val infinite = Iterator.from(1)

  Source
    .fromIterator(() => infinite)
    .map(_.toLong)
    .via(Sliding(10.millis, 2.millis, identity))
    .runWith(TestSink.probe[Seq[Long]])
    .request(1)
    .expectNext(expectedResultIterator.take(10).toSeq)
    .request(1)
    .expectNext(expectedResultIterator.take(11).drop(1).toSeq)
    .expectComplete()
}
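For context (my reading of the failure, not part of the original report): in onPush the stage calls push(out, buf) and then immediately pull(in), so the next onPush can attempt a second push before downstream has issued a new pull, which is exactly what the "Cannot push port twice, or before it being pulled" assertion guards against. A hedged sketch of a defensive variant is to use emit, which queues the element until demand arrives:

// Sketch (untested): inside onPush(), replacing the unconditional push.
// emit() buffers the element until downstream demand is available instead
// of failing when the port has not been pulled again yet.
if (isWindowDone(data)) {
  emit(out, buf)
  buf = buf.dropWhile(x => f(x) < dropUntilDuration)
  dropUntilDuration += step.toMillis
}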
I intend to fetch all records that match certain criteria from Mongo, using the Scala Mongo driver.
Using Observables, you can access the stream by creating a subscription:
val MaxBuffer: Long = 100
var docs: Queue[Document] = Queue.empty
var sub: Option[Subscription] = None
val q: Observable[Document]

def fetchMoreRecords: Unit = sub.get.request(MaxBuffer)

q.subscribe(new Observer[Document] {
  override def onSubscribe(subscription: Subscription): Unit = {
    sub = Some(subscription)
    fetchMoreRecords
  }

  // fail(out, ...) and complete(out) refer to an enclosing graph stage
  override def onError(e: Throwable): Unit = fail(out, e)

  override def onComplete(): Unit = {
    println("Stream is complete")
    complete(out)
  }

  override def onNext(result: Document): Unit = {
    if (docs.size == MaxBuffer) {
      fail(out, new RuntimeException("Buffer overflow"))
    } else {
      docs = docs :+ result
    }
  }
})
(this code is incomplete)
I would need a function like:
def isReady: Future[Boolean] = {}
which completes as soon as onNext has been called at least once.
The bad way to do this would be a busy wait:

def isReady: Future[Boolean] = Future {
  @annotation.tailrec
  def wait(): Boolean =
    if (docs.nonEmpty) true
    else wait()
  wait()
}
What would be the best way to achieve this?
You want to use Promise:
val promise = Promise[Boolean]()
...
override def onNext(result: Document) = {
  ...
  promise.tryComplete(Success(true))
}

override def onError(e: Throwable) =
  promise.tryComplete(Failure(e))

val future = promise.future
You should also do something to handle the case when there are no results at all; as it stands, the future will never be completed if the stream finishes without emitting anything.
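Putting that together with the Observer from the question, a self-contained sketch might look like the following; completing the future with false on an empty stream is an assumption, and ReadinessObserver is a made-up name:

import scala.collection.immutable.Queue
import scala.concurrent.{Future, Promise}
import scala.util.Failure

import org.mongodb.scala.{Document, Observable, Observer, Subscription}

class ReadinessObserver(q: Observable[Document], maxBuffer: Long) {

  private val promise = Promise[Boolean]()
  @volatile private var docs: Queue[Document] = Queue.empty

  // Completes with true on the first document, false on an empty stream,
  // and fails if the upstream errors out.
  def isReady: Future[Boolean] = promise.future

  q.subscribe(new Observer[Document] {
    override def onSubscribe(subscription: Subscription): Unit =
      subscription.request(maxBuffer)

    override def onNext(result: Document): Unit = {
      docs = docs :+ result
      promise.trySuccess(true)
    }

    override def onError(e: Throwable): Unit =
      promise.tryComplete(Failure(e))

    override def onComplete(): Unit =
      promise.trySuccess(false) // assumption: empty stream means "not ready"
  })
}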
I have Kryo-serialized binary data stored on S3 (thousands of serialized objects).
Alpakka allows reading the content as data: Source[ByteString, NotUsed]. But the Kryo format doesn't use delimiters, so I can't split each serialized object into a separate ByteString using data.via(Framing.delimiter(...)).
So Kryo actually needs to read the data to know where an object ends, which doesn't look streaming-friendly.
Is it possible to implement this case in a streaming fashion, so that I get a Source[MyObject, NotUsed] at the end of the day?
Here is a graph stage that does that. It handles the case when a serialized object spans two byte strings. It would need to be improved for the case when objects are large (not my use case) and can span more than two byte strings in the Source[ByteString, NotUsed].
import scala.collection.immutable
import scala.collection.mutable.ListBuffer

import akka.NotUsed
import akka.stream.{Attributes, FlowShape, Inlet, Outlet}
import akka.stream.scaladsl.Flow
import akka.stream.stage.{GraphStage, GraphStageLogic, InHandler, OutHandler}
import akka.util.ByteString
import com.esotericsoftware.kryo.{Kryo, KryoException, Serializer}
import com.esotericsoftware.kryo.io.Input

object KryoReadStage {
  def flow[T](kryoSupport: KryoSupport,
              `class`: Class[T],
              serializer: Serializer[_]): Flow[ByteString, immutable.Seq[T], NotUsed] =
    Flow.fromGraph(new KryoReadStage[T](kryoSupport, `class`, serializer))
}

final class KryoReadStage[T](kryoSupport: KryoSupport,
                             `class`: Class[T],
                             serializer: Serializer[_])
  extends GraphStage[FlowShape[ByteString, immutable.Seq[T]]] {

  override def shape: FlowShape[ByteString, immutable.Seq[T]] = FlowShape.of(in, out)

  override def createLogic(inheritedAttributes: Attributes): GraphStageLogic = {
    new GraphStageLogic(shape) {

      setHandler(in, new InHandler {
        override def onPush(): Unit = {
          // Prepend whatever was left over from the previous chunk.
          val bytes =
            if (previousBytes.length == 0) grab(in)
            else ByteString.fromArrayUnsafe(previousBytes) ++ grab(in)

          Managed(new Input(new ByteBufferBackedInputStream(bytes.asByteBuffer))) { input =>
            var position = 0
            val acc = ListBuffer[T]()
            kryoSupport.withKryo { kryo =>
              var last = false
              while (!last && !input.eof()) {
                tryRead(kryo, input) match {
                  case Some(t) =>
                    // Complete object: remember how far into the buffer we got.
                    acc += t
                    position = input.total().toInt
                    previousBytes = EmptyArray
                  case None =>
                    // Partial object: keep the unread tail for the next push.
                    val bytesLeft = new Array[Byte](bytes.length - position)
                    val bb = bytes.asByteBuffer
                    bb.position(position)
                    bb.get(bytesLeft)
                    last = true
                    previousBytes = bytesLeft
                }
              }
              push(out, acc.toList)
            }
          }
        }

        private def tryRead(kryo: Kryo, input: Input): Option[T] =
          try {
            Some(kryo.readObject(input, `class`, serializer))
          } catch {
            case _: KryoException => None
          }
      })

      setHandler(out, new OutHandler {
        override def onPull(): Unit = pull(in)
      })

      private val EmptyArray: Array[Byte] = Array.empty
      private var previousBytes: Array[Byte] = EmptyArray
    }
  }

  override def toString: String = "KryoReadStage"

  private lazy val in: Inlet[ByteString] = Inlet("KryoReadStage.in")
  private lazy val out: Outlet[immutable.Seq[T]] = Outlet("KryoReadStage.out")
}
Example usage:
client.download(BucketName, key)
  .via(KryoReadStage.flow(kryoSupport, `class`, serializer))
  .flatMapConcat(Source(_))
It uses some additional helpers below.
ByteBufferBackedInputStream:
import java.io.InputStream
import java.nio.ByteBuffer

class ByteBufferBackedInputStream(buf: ByteBuffer) extends InputStream {
  override def read: Int = {
    if (!buf.hasRemaining) -1
    else buf.get & 0xFF // single byte, as an unsigned Int
  }

  override def read(bytes: Array[Byte], off: Int, len: Int): Int = {
    if (!buf.hasRemaining) -1
    else {
      val read = Math.min(len, buf.remaining)
      buf.get(bytes, off, read)
      read
    }
  }
}
Managed:
object Managed {
  type AutoCloseableView[T] = T => AutoCloseable

  // Runs `op` on the resource and always closes it afterwards.
  def apply[T: AutoCloseableView, V](resource: T)(op: T => V): V =
    try {
      op(resource)
    } finally {
      resource.close()
    }
}
KryoSupport:
import com.esotericsoftware.kryo.{Kryo, Serializer}
import com.esotericsoftware.kryo.pool.{KryoCallback, KryoFactory, KryoPool}

trait KryoSupport {
  def withKryo[T](f: Kryo => T): T
}

class PooledKryoSupport(serializers: (Class[_], Serializer[_])*) extends KryoSupport {

  override def withKryo[T](f: Kryo => T): T = {
    pool.run(new KryoCallback[T] {
      override def execute(kryo: Kryo): T = f(kryo)
    })
  }

  private val pool = {
    val factory = new KryoFactory() {
      override def create(): Kryo = {
        val kryo = new Kryo
        // KryoSupport.ScalaSerializers is assumed to be defined elsewhere
        // in the original codebase.
        (KryoSupport.ScalaSerializers ++ serializers).foreach {
          case (clazz, serializer) =>
            kryo.register(clazz, serializer)
        }
        kryo
      }
    }
    new KryoPool.Builder(factory).softReferences().build()
  }
}