process function of DoFn not executing - apache-beam

I am trying to write a Beam transform such as the following:
util.py
class GroupIntoBatches(PTransform):
    def __init__(self, batch_size):
        self.batch_size = batch_size

    @staticmethod
    def of_size(batch_size):
        return GroupIntoBatches(batch_size)

    def expand(self, pcoll):
        input_coder = coders.registry.get_coder(pcoll)
        if not input_coder.is_kv_coder():
            raise ValueError(
                'coder specified in the input PCollection is not a KvCoder')
        key_coder = input_coder.key_coder()
        value_coder = input_coder.value_coder()
        return pcoll | ParDo(
            _GroupIntoBatchesDoFn(self.batch_size, key_coder, value_coder))

class _GroupIntoBatchesDoFn(DoFn):
    def __init__(self, batch_size, input_key_coder, input_value_coder):
        self.batch_size = batch_size
        self.batch_spec = BagStateSpec("GroupIntoBatches", input_value_coder)

    def process(self, element):
        raise Exception("Not getting to this point")  # This is not working
        print element
I am trying to execute this transform via this test case:
util_test.py
class GroupIntoBatchesTest(unittest.TestCase):
    NUM_ELEMENTS = 10
    BATCH_SIZE = 5

    @staticmethod
    def _create_test_data():
        scientists = [
            "Einstein",
            "Darwin",
            "Copernicus",
            "Pasteur",
            "Curie",
            "Faraday",
            "Newton",
            "Bohr",
            "Galilei",
            "Maxwell"
        ]
        data = []
        for i in range(GroupIntoBatchesTest.NUM_ELEMENTS):
            index = i % len(scientists)
            data.append(("key", scientists[index]))
        return data

    def test_in_global_window(self):
        pipeline = TestPipeline()
        collection = (
            pipeline
            | beam.Create(GroupIntoBatchesTest._create_test_data())
            | util.GroupIntoBatches.of_size(GroupIntoBatchesTest.BATCH_SIZE))
My question is: why is the process function not getting called on my _GroupIntoBatchesDoFn?
I get this result when running my test case:
test_in_global_window
(apache_beam.transforms.util_test.GroupIntoBatchesTest) ... ok

Your test is constructing the pipeline, but not actually executing it. You need to either write
pipeline = TestPipeline()
collection = pipeline | ...
pipeline.run()
or, alternatively
with TestPipeline() as pipeline:
    collection = pipeline | ...
    # run is implicitly called on exit of the with block
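Applied to the test above, a minimal sketch of the second form looks like this (the beam and util imports are assumed to be the same ones already used in the test module; nothing new is asserted here):
def test_in_global_window(self):
    # Exiting the with-block calls pipeline.run() and waits for completion,
    # which is when _GroupIntoBatchesDoFn.process is actually invoked.
    with TestPipeline() as pipeline:
        collection = (
            pipeline
            | beam.Create(GroupIntoBatchesTest._create_test_data())
            | util.GroupIntoBatches.of_size(GroupIntoBatchesTest.BATCH_SIZE))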
(You may also be interested in the BatchElements transform.)

Related

Use generator with ruamel.yaml

I would like to have a bunch of generators in my config dict. So I tried this:
@yaml.register_class
class UniformDistribution:
    yaml_tag = '!uniform'

    @classmethod
    def from_yaml(cls, a, node):
        for x in node.value:
            if x[0].value == 'min':
                min_ = float(x[1].value)
            if x[0].value == 'max':
                max_ = float(x[1].value)

        def f():
            while True:
                yield np.random.uniform(min_, max_)

        g = f()
        return g
However, the parser never returns, because generators are used internally by ruamel.yaml to resolve references like &A and *A. Returning (g,) instead is a fairly simple workaround, but I would prefer a solution where I don't need the additional and very confusing index-0 term in next(config['position_generator'][0]).
Any ideas?
This wrapper, adapted from a different question, did exactly what I was looking for:
from collections.abc import Generator

class GeneratorWrapper(Generator):
    def __init__(self, function, *args):
        self.function = function
        self.args = args

    def send(self, ignored_arg):
        return self.function(*self.args)

    def throw(self, typ=None, val=None, tb=None):
        raise StopIteration

@yaml.register_class
class UniformDistribution:
    yaml_tag = '!uniform'

    @classmethod
    def from_yaml(cls, constructor, node):
        for x in node.value:
            value = float(x[1].value)
            if x[0].value == 'min':
                min_ = value
            if x[0].value == 'max':
                max_ = value
        return GeneratorWrapper(np.random.uniform, min_, max_)
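A quick sketch of the wrapper in use; the position_generator key and the loading boilerplate here are illustrative, assuming the classes above were registered on this same yaml instance:
import numpy as np
import ruamel.yaml

yaml = ruamel.yaml.YAML()
# ... the GeneratorWrapper and @yaml.register_class definitions from above go here ...

config = yaml.load("""
position_generator: !uniform
    min: 0.0
    max: 1.0
""")

# The wrapper behaves like a generator, so no extra [0] index is needed.
print(next(config['position_generator']))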

How to test a function that depends on another one in Python

I need to test this type of code below:
list = [1, 2, 3, 4]

def getData(list):
    return list[0] + list[1]

def processData():
    data = getData(list)
    multiply = data * data
    return multiply

def test_functions():
    assert getData([0, 1]) == 1
    assert processData() == 1
How do I tell the test that I want data = getData([0, 1]), i.e. basically replace data with my test values?
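One common way to do this is to patch getData for the duration of the test, so that processData receives your test value. A sketch, assuming the functions live in a module named mymodule (the module name is hypothetical):
from unittest import mock

import mymodule  # hypothetical module containing getData and processData

def test_process_data():
    # processData looks up getData in its own module, so patching it there
    # makes data == 1 inside processData for the duration of this test.
    with mock.patch.object(mymodule, 'getData', return_value=1):
        assert mymodule.processData() == 1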

py4j.Py4JException: Method __getstate__([]) does not exist when passing a class method to map in pyspark

I have an RDD in a class, and I defined the map function as a class method. When I pass the map function to PySpark's map, it raises an error: py4j.Py4JException: Method __getstate__([]) does not exist. My code:
class A(object):
    def __init__(self):
        conf = SparkConf().setMaster("local[*]").setAppName("A")
        self.spark = SparkSession.builder.config(conf=conf).getOrCreate()

    def f(self):
        mapper = self.mapper
        rdds = self.spark.sparkContext.parallelize([1, 2, 3])
        print(rdds.map(mapper).collect())

    # @staticmethod
    def mapper(self, row):
        s = []
        for i in range(5):
            if row == 1:
                if len(s) >= 2:
                    break
            if row == 2:
                if len(s) >= 3:
                    break
            s.append(row)
        return s
Someone said that self cannot be passed to the workers, so I used mapper = self.mapper, but it still does not work. How can I deal with this, other than adding the staticmethod decorator to mapper?
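The usual workaround is to make sure the function shipped to the workers does not reference self at all, for example by moving the logic to a module-level function. The following is only a sketch of that idea (the name _mapper is illustrative, not from the original code); mapper = self.mapper is still a bound method, so pickling it drags self and its SparkSession along, which is what triggers the Py4J error:
def _mapper(row):
    # Module-level function: pickling it does not pull class A (and its
    # unpicklable SparkSession) over to the workers.
    s = []
    for i in range(5):
        if row == 1 and len(s) >= 2:
            break
        if row == 2 and len(s) >= 3:
            break
        s.append(row)
    return s

class A(object):
    def __init__(self):
        conf = SparkConf().setMaster("local[*]").setAppName("A")
        self.spark = SparkSession.builder.config(conf=conf).getOrCreate()

    def f(self):
        rdds = self.spark.sparkContext.parallelize([1, 2, 3])
        print(rdds.map(_mapper).collect())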

dynamically parse a string and return a function in Scala using reflection and interpreters

I am trying to dynamically interpret code given as a String.
Eg:
val myString = "def f(x:Int):Int=x+1".
I'm looking for a method that will return the real function out of it:
Eg:
val myIncrementFunction = myDarkMagicFunctionThatWillBuildMyFunction(myString)
println(myIncrementFunction(3))
will print 4
Use case: I want to use some simple functions from that interpreted code later in my own code. For example, someone can provide something like def fun(x: Int): Int = x + 1 as a string; I then use the interpreter to compile/execute that code, and I'd like to be able to use this fun(x) in a map, for example.
The problem is that the function's type is unknown to me, and this is the big obstacle, because I need to cast back from IMain.
I've read about reflection, the type system and such, and after some googling I reached this point. I also checked Twitter's util-eval, but I can't see much from the docs, and the examples in their tests are pretty much the same thing.
If I know the type I can do something like
val settings = new Settings
val imain = new IMain(settings)
val res = imain.interpret("def f(x:Int):Int=x+1; val ret=f _ ")
val myF = imain.valueOfTerm("ret").get.asInstanceOf[Function[Int,Int]]
println(myF(2))
which works correctly and prints 3, but I am blocked by the problem mentioned above: I don't know the type of the function. This example works only because I cast to the type I used when defining the string function while testing how IMain works.
Do you know of any way to achieve this functionality?
I'm a newbie, so please excuse any mistakes.
Thanks
OK, I managed to achieve the functionality I wanted. I am still looking to improve this code, but this snippet does what I want.
I used the Scala toolbox and quasiquotes:
import scala.reflect.runtime.universe.{Quasiquote, runtimeMirror}
import scala.tools.reflect.ToolBox

object App {
  def main(args: Array[String]): Unit = {
    val mirror = runtimeMirror(getClass.getClassLoader)
    val tb = ToolBox(mirror).mkToolBox()

    val data = Array(1, 2, 3)
    println("Data before function applied on it")
    println(data.mkString(","))

    println("Please enter the map function you want:")
    val function = scala.io.StdIn.readLine()

    val functionWrapper = "object FunctionWrapper { " + function + "}"
    val functionSymbol = tb.define(tb.parse(functionWrapper).asInstanceOf[tb.u.ImplDef])

    // Map each element using the user-specified function
    val dataAfterFunctionApplied = data.map(x => tb.eval(q"$functionSymbol.function($x)"))

    println("Data after function applied on it")
    println(dataAfterFunctionApplied.mkString(","))
  }
}
And here is the result in the terminal:
Data before function applied on it
1,2,3
Please enter the map function you want:
def function(x: Int): Int = x + 2
Data after function applied on it
3,4,5
Process finished with exit code 0
I wanted to elaborate on the previous answer and its comment, and benchmark the two solutions:
import scala.reflect.runtime.universe.{Quasiquote, runtimeMirror}
import scala.tools.reflect.ToolBox

object Runtime {

  def time[R](block: => R): R = {
    val t0 = System.nanoTime()
    val result = block // call-by-name
    val t1 = System.nanoTime()
    println("Elapsed time: " + (t1 - t0) + " ns")
    result
  }

  def main(args: Array[String]): Unit = {
    val mirror = runtimeMirror(getClass.getClassLoader)
    val tb = ToolBox(mirror).mkToolBox()

    val data = Array(1, 2, 3)
    println(s"Data before function applied on it: '${data.toList}")
    val function = "def apply(x: Int): Int = x + 2"
    println(s"Function: '$function'")
    println("#######################")

    // Function with tb.eval
    println(".... with tb.eval")
    val functionWrapper = "object FunctionWrapper { " + function + "}"
    // This takes around 1 sec!
    val functionSymbol = time { tb.define(tb.parse(functionWrapper).asInstanceOf[tb.u.ImplDef]) }
    // This takes around 0.5 sec!
    val result = time { data.map(x => tb.eval(q"$functionSymbol.apply($x)")) }
    println(s"Data after function applied on it: '${result.toList}'")

    println(".... without tb.eval")
    val func = time { tb.eval(q"$functionSymbol.apply _").asInstanceOf[Int => Int] }
    // This takes around 0.5 sec!
    val result2 = time { data.map(func) }
    println(s"Data after function applied on it: '${result2.toList}'")
  }
}
If we execute the code above we see the following output:
Data before function applied on it: 'List(1, 2, 3)
Function: 'def apply(x: Int): Int = x + 2'
#######################
.... with tb.eval
Elapsed time: 716542980 ns
Elapsed time: 661386581 ns
Data after function applied on it: 'List(3, 4, 5)'
.... without tb.eval
Elapsed time: 394119232 ns
Elapsed time: 85713 ns
Data after function applied on it: 'List(3, 4, 5)'
This is just to emphasize the importance of doing the evaluation once to extract a Function and then applying it to the data, rather than evaluating again for every element, as the comment on the answer indicates.
You can use the twitter-util library to do this; check the test file:
https://github.com/twitter/util/blob/b0696d0/util-eval/src/test/scala/com/twitter/util/EvalTest.scala
If you need to use IMain, maybe because you want to use the interpreter with your own custom settings, you can do something like this:
a. First create a class meant to hold your result:
class ResHolder(var value: Any)
b. Create a container object to hold the result and interpret the code into that object:
val settings = new Settings()
val writer = new java.io.StringWriter()
val interpreter = new IMain(settings, writer)
val code = "def f(x:Int):Int=x+1"

// Create a container object to hold the result and bind it in the interpreter
val holder = new ResHolder(null)
interpreter.bind("$result", holder.getClass.getName, holder) match {
  case Success =>
  case Error => throw new ScriptException("error in: binding '$result' value\n" + writer)
  case Incomplete => throw new ScriptException("incomplete in: binding '$result' value\n" + writer)
}

val ir = interpreter.interpret("$result.value = " + code)

// Return the cast value or throw an exception based on the result
ir match {
  case Success =>
    val any = holder.value
    any.asInstanceOf[(Int) => Int]
  case Error => throw new ScriptException("error in: '" + code + "'\n" + writer)
  case Incomplete => throw new ScriptException("incomplete in :'" + code + "'\n" + writer)
}

ipython parallel and own class

I'm working on a class that handles numeric operations performed on a data array.
Unfortunately, I cannot get it to work when applying a method of the class to the created direct view. I get the error:
, copy)
    163     assert len(bufs) >= 2, "not enough buffers!"
    164     pf = buffer_to_bytes_py2(bufs.pop(0))
--> 165     f = uncan(pickle.loads(pf), g)
    166     pinfo = buffer_to_bytes_py2(bufs.pop(0))
    167     info = pickle.loads(pinfo)

AttributeError: Can't get attribute 'calcParallel' on <IPython.core.interactiveshell.DummyMod object at 0x00000000047E4C50>
and the class:
import numpy as np
import ipyparallel as parallel

class calcParallel():
    def __init__(self):
        self.data = np.random.rand(10, 23)

    def calc(self, variables):
        # parallel view
        rc = parallel.Client()
        dview = rc.direct_view()
        dview.block = False

        # serial
        self.res_serial = [self.__multiply(var) for var in variables]

        # parallel
        imports = [
            'import numpy as np'
        ]
        # imports
        [dview.execute(imp) for imp in imports]
        # shared data
        dview['data'] = self.data
        # run calculation
        self.pr_list = [dview.apply_async(self.__multiply, var) for var in variables]
        dview.wait(self.pr_list)
        # process results
        self.res_parallel = []
        for r in self.pr_list:
            self.res_parallel.append(r.get())

    def __multiply(self, num):
        return data * num

t = calcParallel()
t.calc(np.random.rand(3))
Please help me with my problem, and sorry for the bad English.
Looking at f = uncan(pickle.loads(pf), g), it seems the function being shipped cannot be unpickled as an attribute of your interactively defined class; consider defining the function outside the class calcParallel.
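A minimal sketch of that suggestion: define the function at module level (the name multiply below is illustrative) and keep pushing data to the engines as before, so nothing that gets pickled references the interactively defined class. This assumes, as in the original code, that ipyparallel resolves the global data in the engine namespace where dview['data'] placed it:
import numpy as np
import ipyparallel as parallel

def multiply(num):
    # `data` is looked up in the engine namespace, where dview['data'] put it.
    return data * num

rc = parallel.Client()
dview = rc.direct_view()
dview['data'] = np.random.rand(10, 23)

# Ship the plain function instead of a bound method of calcParallel.
async_results = [dview.apply_async(multiply, var) for var in np.random.rand(3)]
results = [r.get() for r in async_results]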