process function of DoFn not executing - apache-beam

I am trying to write a Beam transform such as the following:
util.py
class GroupIntoBatches(PTransform):
    def __init__(self, batch_size):
        self.batch_size = batch_size

    @staticmethod
    def of_size(batch_size):
        return GroupIntoBatches(batch_size)

    def expand(self, pcoll):
        input_coder = coders.registry.get_coder(pcoll)
        if not input_coder.is_kv_coder():
            raise ValueError(
                'coder specified in the input PCollection is not a KvCoder')
        key_coder = input_coder.key_coder()
        value_coder = input_coder.value_coder()
        return pcoll | ParDo(
            _GroupIntoBatchesDoFn(self.batch_size, key_coder, value_coder))

class _GroupIntoBatchesDoFn(DoFn):
    def __init__(self, batch_size, input_key_coder, input_value_coder):
        self.batch_size = batch_size
        self.batch_spec = BagStateSpec("GroupIntoBatches", input_value_coder)

    def process(self, element):
        raise Exception("Not getting to this point")  # This is not working
        print element
I am trying to execute this transform via this test case:
util_test.py
class GroupIntoBatchesTest(unittest.TestCase):
    NUM_ELEMENTS = 10
    BATCH_SIZE = 5

    @staticmethod
    def _create_test_data():
        scientists = [
            "Einstein",
            "Darwin",
            "Copernicus",
            "Pasteur",
            "Curie",
            "Faraday",
            "Newton",
            "Bohr",
            "Galilei",
            "Maxwell"
        ]
        data = []
        for i in range(GroupIntoBatchesTest.NUM_ELEMENTS):
            index = i % len(scientists)
            data.append(("key", scientists[index]))
        return data

    def test_in_global_window(self):
        pipeline = TestPipeline()
        collection = (
            pipeline
            | beam.Create(GroupIntoBatchesTest._create_test_data())
            | util.GroupIntoBatches.of_size(GroupIntoBatchesTest.BATCH_SIZE))
My question is: why is the process function not getting called on my _GroupIntoBatchesDoFn?
I get this result when running my test case:
test_in_global_window
(apache_beam.transforms.util_test.GroupIntoBatchesTest) ... ok

Your test is constructing the pipeline, but not actually executing it. You need to either write
pipeline = TestPipeline()
collection = pipeline | ...
pipeline.run()
or, alternatively
with TestPipeline() as pipeline:
    collection = pipeline | ...
    # run is implicitly called on exit of the with block
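Applied to the test above, a minimal sketch of the second form looks like this (the beam and util imports are assumed to be the same ones already used in the test module; nothing new is asserted here):
def test_in_global_window(self):
    # Exiting the with-block calls pipeline.run() and waits for completion,
    # which is when _GroupIntoBatchesDoFn.process is actually invoked.
    with TestPipeline() as pipeline:
        collection = (
            pipeline
            | beam.Create(GroupIntoBatchesTest._create_test_data())
            | util.GroupIntoBatches.of_size(GroupIntoBatchesTest.BATCH_SIZE))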
(You may also be interested in the BatchElements transform.)

Related

Use generator with ruamel.yaml

I would like to have a bunch of generators in my config dict. So I tried this:
@yaml.register_class
class UniformDistribution:
    yaml_tag = '!uniform'

    @classmethod
    def from_yaml(cls, a, node):
        for x in node.value:
            if x[0].value == 'min':
                min_ = float(x[1].value)
            if x[0].value == 'max':
                max_ = float(x[1].value)

        def f():
            while True:
                yield np.random.uniform(min_, max_)

        g = f()
        return g
However, the parser never returns, because generators are used internally by ruamel.yaml to resolve references like &A and *A. Returning (g,) instead is a fairly simple workaround, but I would prefer a solution where I don't need the additional and very confusing index-0 term in next(config['position_generator'][0]).
Any ideas?
This wrapper, adapted from a different question, did exactly what I was looking for:
from collections.abc import Generator

class GeneratorWrapper(Generator):
    def __init__(self, function, *args):
        self.function = function
        self.args = args

    def send(self, ignored_arg):
        return self.function(*self.args)

    def throw(self, typ=None, val=None, tb=None):
        raise StopIteration

@yaml.register_class
class UniformDistribution:
    yaml_tag = '!uniform'

    @classmethod
    def from_yaml(cls, constructor, node):
        for x in node.value:
            value = float(x[1].value)
            if x[0].value == 'min':
                min_ = value
            if x[0].value == 'max':
                max_ = value
        return GeneratorWrapper(np.random.uniform, min_, max_)
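A quick sketch of the wrapper in use; the position_generator key and the loading boilerplate here are illustrative, assuming the classes above were registered on this same yaml instance:
import numpy as np
import ruamel.yaml

yaml = ruamel.yaml.YAML()
# ... the GeneratorWrapper and @yaml.register_class definitions from above go here ...

config = yaml.load("""
position_generator: !uniform
    min: 0.0
    max: 1.0
""")

# The wrapper behaves like a generator, so no extra [0] index is needed.
print(next(config['position_generator']))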

How to test a function that depends on another one in Python

I need to test this type of code below:
list = [1, 2, 3, 4]

def getData(list):
    return list[0] + list[1]

def processData():
    data = getData(list)
    multiply = data * data
    return multiply

def test_functions():
    assert getData([0, 1]) == 1
    assert processData() == 1
How do I tell the test that I want data = getData([0, 1]), i.e. basically replace data with my test values?
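One common way to do this is to patch getData for the duration of the test, so that processData receives your test value. A sketch, assuming the functions live in a module named mymodule (the module name is hypothetical):
from unittest import mock

import mymodule  # hypothetical module containing getData and processData

def test_process_data():
    # processData looks up getData in its own module, so patching it there
    # makes data == 1 inside processData for the duration of this test.
    with mock.patch.object(mymodule, 'getData', return_value=1):
        assert mymodule.processData() == 1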

py4j.Py4JException: Method __getstate__([]) does not exist when passing a class method to map in pyspark

I have an RDD in a class, and I defined the map function as a class method. When I pass the map function to PySpark's map, it raises an error: py4j.Py4JException: Method __getstate__([]) does not exist. My code:
class A(object):
    def __init__(self):
        conf = SparkConf().setMaster("local[*]").setAppName("A")
        self.spark = SparkSession.builder.config(conf=conf).getOrCreate()

    def f(self):
        mapper = self.mapper
        rdds = self.spark.sparkContext.parallelize([1, 2, 3])
        print(rdds.map(mapper).collect())

    # @staticmethod
    def mapper(self, row):
        s = []
        for i in range(5):
            if row == 1:
                if len(s) >= 2:
                    break
            if row == 2:
                if len(s) >= 3:
                    break
            s.append(row)
        return s
Someone said that self cannot be passed to the workers, so I used mapper = self.mapper, but it still does not work. How can I deal with this, other than adding the staticmethod decorator to mapper?
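The usual workaround is to make sure the function shipped to the workers does not reference self at all, for example by moving the logic to a module-level function. The following is only a sketch of that idea (the name _mapper is illustrative, not from the original code); mapper = self.mapper is still a bound method, so pickling it drags self and its SparkSession along, which is what triggers the Py4J error:
def _mapper(row):
    # Module-level function: pickling it does not pull class A (and its
    # unpicklable SparkSession) over to the workers.
    s = []
    for i in range(5):
        if row == 1 and len(s) >= 2:
            break
        if row == 2 and len(s) >= 3:
            break
        s.append(row)
    return s

class A(object):
    def __init__(self):
        conf = SparkConf().setMaster("local[*]").setAppName("A")
        self.spark = SparkSession.builder.config(conf=conf).getOrCreate()

    def f(self):
        rdds = self.spark.sparkContext.parallelize([1, 2, 3])
        print(rdds.map(_mapper).collect())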

dynamically parse a string and return a function in Scala using reflection and interpreters

I am trying to dynamically interpret code given as a String.
Eg:
val myString = "def f(x:Int):Int=x+1".
I'm looking for a method that will return the real function out of it:
Eg:
val myIncrementFunction = myDarkMagicFunctionThatWillBuildMyFunction(myString)
println(myIncrementFunction(3))
will print 4
Use case: I want to use some simple functions from that interpreted code later in my own code. For example, someone can provide something like def fun(x: Int): Int = x + 1 as a string; I then use the interpreter to compile/execute that code, and I'd like to be able to use this fun(x) in a map, for example.
The problem is that the function's type is unknown to me, and this is the big obstacle, because I need to cast back from IMain.
I've read about reflection, the type system and such, and after some googling I reached this point. I also checked Twitter's util-eval, but I can't see much from the docs, and the examples in their tests are pretty much the same thing.
If I know the type I can do something like
val settings = new Settings
val imain = new IMain(settings)
val res = imain.interpret("def f(x:Int):Int=x+1; val ret=f _ ")
val myF = imain.valueOfTerm("ret").get.asInstanceOf[Function[Int,Int]]
println(myF(2))
which works correctly and prints 3, but I am blocked by the problem mentioned above: I don't know the type of the function. This example works only because I cast to the type I used when defining the string function while testing how IMain works.
Do you know of any way to achieve this functionality?
I'm a newbie, so please excuse any mistakes.
Thanks
OK, I managed to achieve the functionality I wanted. I am still looking to improve this code, but this snippet does what I want.
I used the Scala toolbox and quasiquotes:
import scala.reflect.runtime.universe.{Quasiquote, runtimeMirror}
import scala.tools.reflect.ToolBox

object App {
  def main(args: Array[String]): Unit = {
    val mirror = runtimeMirror(getClass.getClassLoader)
    val tb = ToolBox(mirror).mkToolBox()

    val data = Array(1, 2, 3)
    println("Data before function applied on it")
    println(data.mkString(","))

    println("Please enter the map function you want:")
    val function = scala.io.StdIn.readLine()

    val functionWrapper = "object FunctionWrapper { " + function + "}"
    val functionSymbol = tb.define(tb.parse(functionWrapper).asInstanceOf[tb.u.ImplDef])

    // Map each element using the user-specified function
    val dataAfterFunctionApplied = data.map(x => tb.eval(q"$functionSymbol.function($x)"))

    println("Data after function applied on it")
    println(dataAfterFunctionApplied.mkString(","))
  }
}
And here is the result in the terminal:
Data before function applied on it
1,2,3
Please enter the map function you want:
def function(x: Int): Int = x + 2
Data after function applied on it
3,4,5
Process finished with exit code 0
I wanted to elaborate on the previous answer and its comment, and benchmark the two solutions:
import scala.reflect.runtime.universe.{Quasiquote, runtimeMirror}
import scala.tools.reflect.ToolBox

object Runtime {

  def time[R](block: => R): R = {
    val t0 = System.nanoTime()
    val result = block // call-by-name
    val t1 = System.nanoTime()
    println("Elapsed time: " + (t1 - t0) + " ns")
    result
  }

  def main(args: Array[String]): Unit = {
    val mirror = runtimeMirror(getClass.getClassLoader)
    val tb = ToolBox(mirror).mkToolBox()

    val data = Array(1, 2, 3)
    println(s"Data before function applied on it: '${data.toList}")
    val function = "def apply(x: Int): Int = x + 2"
    println(s"Function: '$function'")
    println("#######################")

    // Function with tb.eval
    println(".... with tb.eval")
    val functionWrapper = "object FunctionWrapper { " + function + "}"
    // This takes around 1 sec!
    val functionSymbol = time { tb.define(tb.parse(functionWrapper).asInstanceOf[tb.u.ImplDef]) }
    // This takes around 0.5 sec!
    val result = time { data.map(x => tb.eval(q"$functionSymbol.apply($x)")) }
    println(s"Data after function applied on it: '${result.toList}'")

    println(".... without tb.eval")
    val func = time { tb.eval(q"$functionSymbol.apply _").asInstanceOf[Int => Int] }
    // This takes around 0.5 sec!
    val result2 = time { data.map(func) }
    println(s"Data after function applied on it: '${result2.toList}'")
  }
}
If we execute the code above we see the following output:
Data before function applied on it: 'List(1, 2, 3)
Function: 'def apply(x: Int): Int = x + 2'
#######################
.... with tb.eval
Elapsed time: 716542980 ns
Elapsed time: 661386581 ns
Data after function applied on it: 'List(3, 4, 5)'
.... without tb.eval
Elapsed time: 394119232 ns
Elapsed time: 85713 ns
Data after function applied on it: 'List(3, 4, 5)'
This is just to emphasize the importance of doing the evaluation once to extract a Function and then applying it to the data, rather than evaluating again for every element, as the comment on the answer indicates.
You can use the twitter-util library to do this; check the test file:
https://github.com/twitter/util/blob/b0696d0/util-eval/src/test/scala/com/twitter/util/EvalTest.scala
If you need to use IMain, maybe because you want to use the interpreter with your own custom settings, you can do something like this:
a. First create a class meant to hold your result:
class ResHolder(var value: Any)
b. Create a container object to hold the result and interpret the code into that object:
val settings = new Settings()
val writer = new java.io.StringWriter()
val interpreter = new IMain(settings, writer)
val code = "def f(x:Int):Int=x+1"

// Create a container object to hold the result and bind it in the interpreter
val holder = new ResHolder(null)
interpreter.bind("$result", holder.getClass.getName, holder) match {
  case Success =>
  case Error => throw new ScriptException("error in: binding '$result' value\n" + writer)
  case Incomplete => throw new ScriptException("incomplete in: binding '$result' value\n" + writer)
}

val ir = interpreter.interpret("$result.value = " + code)

// Return the cast value or throw an exception based on the result
ir match {
  case Success =>
    val any = holder.value
    any.asInstanceOf[(Int) => Int]
  case Error => throw new ScriptException("error in: '" + code + "'\n" + writer)
  case Incomplete => throw new ScriptException("incomplete in :'" + code + "'\n" + writer)
}

ipython parallel and own class

I'm working on a class that handles numeric operations performed on a data array.
Unfortunately, I cannot get it to work when applying a method of the class to the created direct view. I get the error:
, copy)
    163     assert len(bufs) >= 2, "not enough buffers!"
    164     pf = buffer_to_bytes_py2(bufs.pop(0))
--> 165     f = uncan(pickle.loads(pf), g)
    166     pinfo = buffer_to_bytes_py2(bufs.pop(0))
    167     info = pickle.loads(pinfo)

AttributeError: Can't get attribute 'calcParallel' on <IPython.core.interactiveshell.DummyMod object at 0x00000000047E4C50>
and the class:
import numpy as np
import ipyparallel as parallel

class calcParallel():
    def __init__(self):
        self.data = np.random.rand(10, 23)

    def calc(self, variables):
        # parallel view
        rc = parallel.Client()
        dview = rc.direct_view()
        dview.block = False

        # serial
        self.res_serial = [self.__multiply(var) for var in variables]

        # parallel
        imports = [
            'import numpy as np'
        ]
        # imports
        [dview.execute(imp) for imp in imports]
        # shared data
        dview['data'] = self.data
        # run calculation
        self.pr_list = [dview.apply_async(self.__multiply, var) for var in variables]
        dview.wait(self.pr_list)
        # process results
        self.res_parallel = []
        for r in self.pr_list:
            self.res_parallel.append(r.get())

    def __multiply(self, num):
        return data * num

t = calcParallel()
t.calc(np.random.rand(3))
Please help me with my problem, and sorry for the bad English.
Looking at f = uncan(pickle.loads(pf), g), it seems the function being shipped cannot be unpickled as an attribute of your interactively defined class; consider defining the function outside the class calcParallel.
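A minimal sketch of that suggestion: define the function at module level (the name multiply below is illustrative) and keep pushing data to the engines as before, so nothing that gets pickled references the interactively defined class. This assumes, as in the original code, that ipyparallel resolves the global data in the engine namespace where dview['data'] placed it:
import numpy as np
import ipyparallel as parallel

def multiply(num):
    # `data` is looked up in the engine namespace, where dview['data'] put it.
    return data * num

rc = parallel.Client()
dview = rc.direct_view()
dview['data'] = np.random.rand(10, 23)

# Ship the plain function instead of a bound method of calcParallel.
async_results = [dview.apply_async(multiply, var) for var in np.random.rand(3)]
results = [r.get() for r in async_results]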