I have a use case wherein I'm writing a dataframe to DynamoDB.
The issue is that, although I'm able to run the code on the cluster, the function used inside mapPartitions() is hard to test.
Here is my code
import boto3
import json
def write_df(table_name, region):
    """Build a ``mapPartitions`` callback that writes rows to DynamoDB.

    Args:
        table_name: Name of the DynamoDB table to write to.
        region: AWS region name of the table, e.g. ``"eu-west-2"``.

    Returns:
        A function that takes an iterable of rows (each exposing a JSON
        string as ``row.col1``), puts every decoded row into the table and
        returns the list of ``col1`` strings it wrote.
    """

    def my_map_partition(partition):
        # boto3 names this keyword ``region_name``; ``region=`` raises a
        # TypeError for an unexpected keyword argument.
        table = boto3.resource("dynamodb", region_name=region).Table(table_name)
        # Fresh list per call: a list held in the enclosing scope would keep
        # accumulating when the same callback is invoked more than once
        # (e.g. when it is called directly from a unit test).
        written = []
        for row in partition:
            table.put_item(Item=json.loads(row.col1))
            written.append(row.col1)
        return written

    return my_map_partition
def load_to_db(df):
    """Write every row of ``df`` to the ``actual_table`` DynamoDB table.

    Args:
        df: Spark DataFrame whose rows are handed, partition by partition,
            to the callback built by ``write_df``.
    """
    # ``mapPartitions`` is a lazy transformation: with its result discarded
    # and no action called, Spark never runs the callback.
    # ``foreachPartition`` is an action, so the writes actually happen.
    df.rdd.foreachPartition(write_df("actual_table", "eu-west-2"))
#mock_dynamodb2
def init_dynamo():
    """Create the ``my_test_table`` DynamoDB table used by the tests.

    The table has a string hash key ``partitionKey`` and a string range key
    ``sortKey`` and is created in ``eu-west-2``.
    """
    client = boto3.client("dynamodb", "eu-west-2")
    client.create_table(
        TableName="my_test_table",
        KeySchema=[
            {"AttributeName": "partitionKey", "KeyType": "HASH"},
            {"AttributeName": "sortKey", "KeyType": "RANGE"},
        ],
        AttributeDefinitions=[
            {"AttributeName": "partitionKey", "AttributeType": "S"},
            {"AttributeName": "sortKey", "AttributeType": "S"},
        ],
        # The default billing mode is PROVISIONED, for which the DynamoDB API
        # requires explicit read/write capacity; minimal values suffice here.
        ProvisionedThroughput={"ReadCapacityUnits": 1, "WriteCapacityUnits": 1},
    )
def test_write_df():
    """Run ``write_df`` over a two-row DataFrame against ``my_test_table``."""
    # This is a plain function, not a method, so there is no ``self``.
    init_dynamo()
    columns = ["col1", "col2"]
    vals = [
        ('{"partitionKey":"pk1", "sortKey":"sk1", "random_value":"hello_world"}', "dummy"),
        ('{"partitionKey":"pk2", "sortKey":"sk2", "random_value":"hello_world"}', "dummy"),
    ]
    # The SparkSession method is ``createDataFrame`` (capital F).
    # NOTE(review): assumes ``spark`` is a SparkSession available in this
    # module's scope - confirm where it is created.
    data = spark.createDataFrame(vals, columns).select("col1")
    a = data.rdd.mapPartitions(write_df("my_test_table", "eu-west-2"))
    # ``count`` is the action that forces the lazy ``mapPartitions`` to run.
    a.count()
    # here is where I will assert values
Here is the error
objc[99130]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called.
objc[99130]: +[__NSCFConstantString initialize] may have been in progress in another thread when fork() was called. We cannot safely call it or ignore it in the fork() child process. Crashing instead. Set a breakpoint on objc_initializeAfterForkError to debug.
22/07/15 14:24:38 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 32)
org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:585)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:567)
.
.
ERROR TaskSetManager: Task 0 in stage 3.0 failed 1 times; aborting job
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/usr/lib/python3.9/site-packages/pyspark/sql/session.py:66: in toDF
return sparkSession.createDataFrame(self, schema, sampleRatio)
/usr/lib/python3.9/site-packages/pyspark/sql/session.py:675: in createDataFrame
return self._create_dataframe(data, schema, samplingRatio, verifySchema)
/usr/lib/python3.9/site-packages/pyspark/sql/session.py:698: in _create_dataframe
rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
/usr/lib/python3.9/site-packages/pyspark/sql/session.py:486: in _createFromRDD
struct = self._inferSchema(rdd, samplingRatio, names=schema)
/usr/lib/python3.9/site-packages/pyspark/sql/session.py:460: in _inferSchema
first = rdd.first()
/usr/lib/python3.9/site-packages/pyspark/rdd.py:1588: in first
rs = self.take(1)
/usr/lib/python3.9/site-packages/pyspark/rdd.py:1568: in take
res = self.context.runJob(self, takeUpToNumLeft, p)
/usr/lib/python3.9/site-packages/pyspark/context.py:1227: in runJob
sock_info = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
/usr/lib/python3.9/site-packages/py4j/java_gateway.py:1309: in __call__
return_value = get_return_value(
/usr/lib/python3.9/site-packages/pyspark/sql/utils.py:111: in deco
return f(*a, **kw)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
answer = 'xro77'
gateway_client = <py4j.clientserver.JavaClient object at 0x124798820>
target_id = 'z:org.apache.spark.api.python.PythonRDD', name = 'runJob'
def get_return_value(answer, gateway_client, target_id=None, name=None):
"""Converts an answer received from the Java gateway into a Python object.
For example, string representation of integers are converted to Python
integer, string representation of objects are converted to JavaObject
instances, etc.
:param answer: the string returned by the Java gateway
:param gateway_client: the gateway client used to communicate with the Java
Gateway. Only necessary if the answer is a reference (e.g., object,
list, map)
:param target_id: the name of the object from which the answer comes from
(e.g., *object1* in `object1.hello()`). Optional.
:param name: the name of the member from which the answer comes from
(e.g., *hello* in `object1.hello()`). Optional.
"""
if is_error(answer)[0]:
if len(answer) > 1:
type = answer[1]
value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
if answer[1] == REFERENCE_TYPE:
> raise Py4JJavaError(
"An error occurred while calling {0}{1}{2}.\n".
format(target_id, ".", name), value)
E py4j.protocol.Py4JJavaError: An error occurred while calling
I'm not sure what is wrong in the unit tests since this is a working code.
Can someone guide me in the right direction on how do I test a function in PySpark that is being used for mapPartitions() ?
Related
I need to test if a function send was called with specific key:
from unittest.mock import patch, MagicMock
def send(key=None):
    """Print the given key."""
    print('key is', key)


def do_smth():
    """Call ``send`` with the fixed key ``'a'``."""
    send(key='a')


# Patch ``send`` in the very module that defines ``do_smth``. A hard-coded
# dotted path can resolve to a second copy of the module when the test runner
# imports this file under a different name, leaving the real ``send`` in place.
@patch(f'{__name__}.send', return_value=MagicMock())
def test_do_smth(mo):
    """Check that ``do_smth`` calls ``send`` with ``key='a'``."""
    do_smth()
    # ``assert_called_with`` raises AssertionError itself on a mismatch and
    # returns None on success, so wrapping it in ``assert`` would always fail.
    mo.assert_called_with(key='a')
But I get error:
Launching pytest with arguments /Users/alber.aleksandrov/PycharmProjects/Playground/pitest/decorat.py --no-header --no-summary -q in /Users/alber.aleksandrov/PycharmProjects/Playground/pitest
============================= test session starts ==============================
collecting ... collected 1 item
decorat.py::test_do_smth FAILED [100%]key is a
decorat.py:11 (test_do_smth)
mo = <MagicMock name='send' id='4551497888'>
#patch('pitest.decorat.send', return_value=MagicMock())
def test_do_smth(mo):
do_smth()
> assert mo.assert_called_with(key='a')
decorat.py:15:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <MagicMock name='send' id='4551497888'>, args = (), kwargs = {'key': 'a'}
expected = "send(key='a')", actual = 'not called.'
error_message = "expected call not found.\nExpected: send(key='a')\nActual: not called."
def assert_called_with(self, /, *args, **kwargs):
"""assert that the last call was made with the specified arguments.
Raises an AssertionError if the args and keyword args passed in are
different to the last call to the mock."""
if self.call_args is None:
expected = self._format_mock_call_signature(args, kwargs)
actual = 'not called.'
error_message = ('expected call not found.\nExpected: %s\nActual: %s'
% (expected, actual))
> raise AssertionError(error_message)
E AssertionError: expected call not found.
E Expected: send(key='a')
E Actual: not called.
Files:
How to write test properly?
I'm using the following code, and I'm looking for some ideas to make some optimizations.
analyzePayload:
Input: payload which is JsObject and list of rules, each rule has several conditions.
Output: MyReport of all the rules which succeed, notApplicable or failed on this specific payload.
The size of the list can be pretty big, also each Rule has a big amount of conditions.
I am looking for some ideas on how to optimize that code, maybe with a lazy collection? view? stream? tailrec? and why - Thanks!
Also, note that I have an analyzeMode setting which can, for example, stop the evaluation as soon as one rule succeeds.
/**
 * Evaluates every rule against `payload` and groups the rules by outcome.
 *
 * Depending on `appConfig.analyzeMode`, rules are no longer evaluated once
 * the first succeeded / not-applicable / failed rule has been recorded.
 * The fold itself still visits every element of `rules`.
 *
 * @param payload the JSON object the rules are evaluated against
 * @param rules   the rules to evaluate, in order
 * @return a report with the succeeded, not-applicable and failed rules,
 *         each in the order in which the rules were given
 * @throws IllegalArgumentException if the configured mode is not recognised
 */
def analyzePayload(payload: JsObject, rules: List[Rule]): MyReport = {
  val analyzeMode = appConfig.analyzeMode
  val (succeed, notApplicable, failed) =
    rules.foldLeft((List[Rule](), List[Rule](), List[Rule]())) {
      // `seed @ (...)` binds the whole accumulator as well as its parts.
      case (seed @ (succeedRules, notApplicableRules, failedRules), currRule) =>
        // Evaluate a single rule and prepend it to the matching bucket.
        def step(): (List[Rule], List[Rule], List[Rule]) =
          evalService.eval(currRule, payload) match {
            // If the result is succeed
            case EvalResult(true, _, _) => (currRule :: succeedRules, notApplicableRules, failedRules)
            // If the result is notApplicable (non-empty second field)
            case EvalResult(_, _ :: _, _) => (succeedRules, currRule :: notApplicableRules, failedRules)
            // If the result is unmatched (non-empty third field)
            case EvalResult(_, _, _ :: _) => (succeedRules, notApplicableRules, currRule :: failedRules)
          }
        analyzeMode match {
          case UNTIL_FIRST_SUCCEED        => if (succeedRules.isEmpty) step() else seed
          case UNTIL_FIRST_NOT_APPLICABLE => if (notApplicableRules.isEmpty) step() else seed
          case UNTIL_FIRST_FAILED         => if (failedRules.isEmpty) step() else seed
          case DEFAULT                    => step()
          case _ => throw new IllegalArgumentException(s"Unknown mode = ${analyzeMode}")
        }
    }
  // Buckets were built by prepending, so reverse to restore rule order.
  MyReport(succeed.reverse, notApplicable.reverse, failed.reverse)
}
First Edit:
Changed the code to use tailrec, following @Tim's advice. Any other suggestions, or ideas for making the code a little prettier?
Also, i wanted to ask if there any difference to use view before the foldLeft on the previous implementation.
Also maybe use other collection such as ListBuffer or Vector
/**
 * Evaluates the rules against `payload` one by one, stopping early when the
 * configured `appConfig.analyzeMode` says the analysis is complete.
 *
 * @param payload     the JSON object the rules are evaluated against
 * @param actionRules the rules to evaluate, in order
 * @return a report with the succeeded, not-applicable and failed rules,
 *         each in the order in which the rules were given
 * @throws IllegalArgumentException if the configured mode is not recognised
 */
def analyzePayload(payload: JsObject, actionRules: List[ActionRule]): MyReport = {
  val analyzeMode = appConfig.analyzeMode

  // True once the stop condition of the configured mode has been reached.
  // Matching on the mode alone keeps "not completed yet" distinct from
  // "unknown mode"; the previous tuple match threw for any UNTIL_* mode
  // whose bucket was still empty.
  def isCompleted(succeed: List[Rule], notApplicable: List[Rule], failed: List[Rule]): Boolean =
    analyzeMode match {
      case UNTIL_FIRST_SUCCEED        => succeed.nonEmpty
      case UNTIL_FIRST_NOT_APPLICABLE => notApplicable.nonEmpty
      case UNTIL_FIRST_FAILED         => failed.nonEmpty
      case DEFAULT                    => false
      case _ => throw new IllegalArgumentException(s"Unknown mode on analyzePayload with mode = ${analyzeMode}")
    }

  @tailrec
  def _analyzePayload(actionRules: List[ActionRule])(succeed: List[Rule], notApplicable: List[Rule], failed: List[Rule]): (List[Rule], List[Rule], List[Rule]) =
    actionRules match {
      // No rules left: done. This must be its own case, because a guard on
      // `Nil | _` applies to both alternatives and let an exhausted list
      // fall through to `::`, raising a MatchError.
      case Nil => (succeed, notApplicable, failed)
      case _ if isCompleted(succeed, notApplicable, failed) => (succeed, notApplicable, failed)
      case actionRule :: tail =>
        actionRuleService.eval(actionRule, payload) match {
          // If the result is succeed
          case EvalResult(true, _, _) => _analyzePayload(tail)(actionRule :: succeed, notApplicable, failed)
          // If the result is notApplicable (non-empty second field)
          case EvalResult(_, _ :: _, _) => _analyzePayload(tail)(succeed, actionRule :: notApplicable, failed)
          // If the result is unmatched (non-empty third field)
          case EvalResult(_, _, _ :: _) => _analyzePayload(tail)(succeed, notApplicable, actionRule :: failed)
        }
    }

  val (succeed, notApplicable, failed) = _analyzePayload(actionRules)(Nil, Nil, Nil)
  // Buckets were built by prepending, so reverse to restore rule order.
  MyReport(succeed.reverse, notApplicable.reverse, failed.reverse)
}
Edit 2: (Questions)
If the result is going to be forwarded to the client, is there any point in using a view, since all the data will be evaluated anyway?
Maybe should I use ParSeq instead? or this will be just slower since the operation of the evalService.eval(...) is not a heavy operation?
Two obvious optimisations:
Use a tail-recursive function rather than foldLeft so that the compiler can generate an optimised loop and terminate as soon as the appropriate rule is found.
Since analyzeMode is constant, take the match outside the foldLeft. Either have separate code paths for each mode, or use analyzeMode to select a function that is used inside the loop to check for termination.
The code is rather fine, the main thing to revisit would be to make evalService.eval evaluate multiple rules in a single traversal of the json object, assuming the size of the json is not negligible
Over the years I have been testing ruamel.yaml with tox and pytest on a regular basis for multiple versions of Python. Soon after the first Python 3.7 beta came out I included that, and I upgraded to testing against the release version of 3.7 when that was released. I have, however, still been doing most of my day-to-day work with Python 3.6 (and 2.7 where necessary).
I was therefore quite surprised to get an issue logged on bitbucket, for a DeprecationWarning because ruamel.yaml was still importing things from collections the Python 2.X way (starting 3.8 these have to be imported from collections.abc, where they already live). I would have expected that my tox runs, which are a mandatory prerequisite in my toolchain for being able to push a new version to PyPI, to have caught on this a few months ago.
From the commandline you can see the warnings, e.g. when you do:
python3.7 -W always -c "import ruamel.yaml"
After some researching I added:
[pytest]
filterwarnings =
error::DeprecationWarning
error::PendingDeprecationWarning
to my tox.ini, which did not change test results for target py37 (321 passes/2 skipped/7 xfail).
Then I added:
setenv =
PYTHONWARNINGS=error
to the default ([testenv]) target. That gave some interesting changes in the result, as testing crashed because of Deprecation warnings in the tox/pytest/virtualenv toolchain itself.
I fixed those by hand (intending to automate that after a clean tox -r run), to see if going through with this would at least get an error on tox for ruamel.yaml itself, but it didn't. If you instead add:
setenv =
PYTHONWARNINGS=always::DeprecationWarning
to [testenv] you'll see that the toolchain has:
DeprecationWarning: 'U' mode is deprecated
DeprecationWarning: the imp module is deprecated in favour of importlib
DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
That last one is actually what I was looking for, but that error was caused by code in the tox dependency pyparsing...
Then I added a new file test_import.py with a single test:
def test_import():
    """Import ``ruamel.yaml`` from inside a test function."""
    # ``from ruamel.yaml`` without an ``import`` clause is a syntax error.
    import ruamel.yaml  # noqa: F401
and double checked that tox executes the test (322 tests passing), but no message or warning are displayed, not even when adding -ra to pytest.
I had been expecting tox to help me find deprecations early on, but in fact it seems impossible to get them to trigger at all. I can of course add the commandline shown above, as additional command in my tox.ini. But some deprecation might not be so easily triggered and I don't want to duplicate my test effort, just to catch potential deprecations.
How can I trigger the DeprecationWarning in my code using tox?
If you start with a minimal test_one.py
def test_one():
    """Import an ABC through the legacy ``collections`` path.

    The import is deliberately the deprecated form: on Python 3.7 it emits
    the DeprecationWarning this example is meant to trigger.
    """
    from collections import Hashable
a simple setup.py:
from setuptools import setup, find_packages
# Only run the packaging step when this file is executed as a script.
if __name__ == '__main__':
    setup(
        name="depwarntest",
        version="0.0.1",
        description="test to get DeprecationWarning in code on 3.7",
        long_description="more details soon",
        author_email="a.van.der.neut@ruamel.eu",
        author="Anthon van der Neut",
        license="MIT",
        url="",
        packages=find_packages(),
    )
And a basic tox.ini:
[tox]
envlist = py37,py36,py27
[testenv]
commands =
/bin/bash -c 'pytest test_*.py'
deps =
pytest
[pytest]
filterwarnings =
error::DeprecationWarning
error::PendingDeprecationWarning
and run tox, you'll get a nice clean exception because of your import:
==================================================================================== FAILURES =====================================================================================
____________________________________________________________________________________ test_one _____________________________________________________________________________________
def test_one():
> from collections import Hashable
test_one.py:6:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
<frozen importlib._bootstrap>:1032: in _handle_fromlist
???
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
name = 'Hashable'
def __getattr__(name):
# For backwards compatibility, continue to make the collections ABCs
# through Python 3.6 available through the collections module.
# Note, no new collections ABCs were added in Python 3.7
if name in _collections_abc.__all__:
obj = getattr(_collections_abc, name)
import warnings
warnings.warn("Using or importing the ABCs from 'collections' instead "
"of from 'collections.abc' is deprecated, "
"and in 3.8 it will stop working",
> DeprecationWarning, stacklevel=2)
E DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
.tox/py37/lib/python3.7/collections/__init__.py:52: DeprecationWarning
============================================================================ 1 failed in 0.31 seconds =============================================================================
on py37, whereas py36 and py27 run fine.
Interestingly enough if you change the test file to read
from collections import Hashable
def test_one():
    """Repeat the legacy ``collections`` ABC import inside the test body.

    The import is deliberately the deprecated form; it is kept unchanged
    because the deprecation behaviour is what this example demonstrates.
    """
    from collections import Hashable
running tox will run fine on py37 as well. And this even is the case if you move that module level import to another test_XYZ.py file.
For ruamel.yaml this means that all the module level imports of ruamel.yaml in the test files need to be moved to methods/functions; that any root level classes in the test that depend, e.g. on ruamel.yaml.YAML() need to use a generator; and that the module level yaml_object() needs to be handled in a special way as well.
An extra tox target helps to test gradual move by doing conformance testing:
# deprecation warning fail
[testenv:dwf]
basepython = python3.7
commands =
/bin/sed 's/collections.abc/collections/' -i .tox/dwf/lib/python3.7/site-packages/ruamel/yaml/comments.py
/bin/bash -c 'pytest --maxfail=2 _test/test_[a-cz]*.py'
Here the already corrected source comments.py is reverted, and only the test modules that have already been adapted are run. tox -e py37,dwf should pass on the first target (once more with 321 tests passing) and fail on the second.
Just playing with continuations. The goal is to create function which will receive another function as parameter, and execution amount - and return function which will apply parameter given amount times.
The implementation looks pretty obvious
/**
 * Returns a function that applies `func` `count` times to its argument.
 *
 * The result is built by wrapping the continuation in a new closure on each
 * step. Building it is tail-recursive, but the returned function is `count`
 * closures deep, so applying it for a very large `count` can exhaust the
 * stack.
 *
 * @param func  the function to repeat
 * @param count how many times to apply `func`; must be at least 1
 * @throws IllegalArgumentException if `count` is less than 1
 */
def n_times[T](func: T => T, count: Int): T => T = {
  @tailrec
  def n_times_cont(cnt: Int, continuation: T => T): T => T = cnt match {
    case _ if cnt < 1 => throw new IllegalArgumentException(s"count was wrong $count")
    case 1 => continuation
    // Each step adds one more layer of nesting around the continuation.
    case _ => n_times_cont(cnt - 1, i => continuation(func(i)))
  }
  n_times_cont(count, func)
}
def inc (x:Int) = x+1
val res1 = n_times(inc,1000)(1) // Works OK, returns 1001
val res = n_times(inc,10000000)(1) // FAILS
But there is a problem - this code fails with a StackOverflowError. Why is there no tail-call optimization here?
I'm running it in Eclipse using Scala plugin, and it returns
Exception in thread "main" java.lang.StackOverflowError
at scala.runtime.BoxesRunTime.boxToInteger(Unknown Source)
at Task_Mult$$anonfun$1.apply(Task_Mult.scala:25)
at Task_Mult$$anonfun$n_times_cont$1$1.apply(Task_Mult.scala:18)
p.s.
F# code, which is almost direct translation, is working without any issues
// Returns a function that applies `func` `count` times to its argument.
// Fails with "wrong count" when `count` is less than 1.
let n_times_cnt func count =
    let rec n_times_impl count' continuation =
        match count' with
        | _ when count' < 1 -> failwith "wrong count"
        | 1 -> continuation
        // Compose one more application of `func` in front of the continuation.
        | _ -> n_times_impl (count' - 1) (func >> continuation)
    n_times_impl count func
let inc x = x+1
let res = (n_times_cnt inc 10000000) 1
printfn "%o" res
The Scala standard library has an implementation of trampolines in scala.util.control.TailCalls. So revisiting your implementation... When you build up the nested calls with continuation(func(t)), those are tail calls, just not optimized by the compiler. So, let's build up a T => TailRec[T], where the stack frames will be replaced with objects in the heap. Then return a function that will take the argument and pass it to that trampolined function:
import util.control.TailCalls._
/**
 * Stack-safe variant: returns a function that applies `func` `count` times.
 *
 * The nested continuations are wrapped in `TailRec` values, so the chain of
 * calls is run by the trampoline (`.result`) as objects on the heap and not
 * as nested stack frames.
 *
 * @param func  the function to repeat
 * @param count how many times to apply `func`; must be at least 1
 * @throws IllegalArgumentException if `count` is less than 1
 */
def n_times_trampolined[T](func: T => T, count: Int): T => T = {
  @annotation.tailrec
  def n_times_cont(cnt: Int, continuation: T => TailRec[T]): T => TailRec[T] = cnt match {
    case _ if cnt < 1 => throw new IllegalArgumentException(s"count was wrong $count")
    case 1 => continuation
    // `tailcall` defers the nested call until the trampoline runs it.
    case _ => n_times_cont(cnt - 1, t => tailcall(continuation(func(t))))
  }
  // Innermost step: apply `func` once and finish.
  val lifted: T => TailRec[T] = t => done(func(t))
  t => n_times_cont(count, lifted)(t).result
}
I could be wrong here but I suspect that the n_times_cont inner function is properly converted to use tail recursion; the culprit's not there.
The stack is blown up by the collected continuation closures (i.e. the i=>continuation(func(i))) which make 10000000 nested calls to your inc method, once you apply the result of the main function.
in fact you can try
scala> val rs = n_times(inc, 1000000)
rs: Int => Int = <function1> //<- we're happy here
scala> rs(1) //<- this blows up the stack!
As an aside, you can rewrite
i=>continuation(func(i))
as
continuation compose func
for the sake of greater readability
I'm looking for a solution for rewriting URLs in lift using a list declared outside the scope of LiftRules.statelessRewrite.append
// Rewrite a request whose parsed path is exactly List("abc") to "index".
LiftRules.statelessRewrite.append {
case RewriteRequest(ParsePath("abc" :: Nil, _ , _ , _ ), _ , _ ) =>
RewriteResponse("index" :: Nil)
}
I'd like to have the following code working the same as the one above:
val requestList = "abc" :: Nil
LiftRules.statelessRewrite.append {
// NOTE: a lowercase identifier inside a pattern introduces a new variable,
// so `requestList` here shadows the val above and is not compared with it.
case RewriteRequest(ParsePath(requestList, _ , _ , _ ), _ , _ ) =>
RewriteResponse("index" :: Nil)
}
Could anyone write how to get such functionality with lift 2.0?
[edit]
Could you also suggest the best way to access this list's suffix as parameter. What I would like to get is similar to:
// Desired behaviour: match `requestList` followed by exactly one extra path
// segment and pass that segment on as the "someId" parameter.
// NOTE(review): `:::` is presumably pseudo-code here; it is a List method and
// not an extractor, so this pattern is not expected to compile - confirm.
LiftRules.statelessRewrite.append {
case RewriteRequest(ParsePath(`requestList` ::: List(someId), _ , _ , _ ), _ , _ ) =>
RewriteResponse("index" :: Nil, Map("someId" -> someId))
}
Any lowercased variable in a case statement will create a new variable with that name, therefore requestList is going to be shadowed. Try this:
val requestList = "abc" :: Nil
LiftRules.statelessRewrite.append {
// Bind the path to a fresh variable and compare it with the outer
// `requestList` in a guard.
case RewriteRequest(ParsePath(list, _ , _ , _ ), _ , _ ) if list == requestList =>
RewriteResponse("index" :: Nil)
}
Another approach would be to use backticks (Scala ref: ‘stable identifier patterns’):
LiftRules.statelessRewrite.append {
// Backticks make `requestList` a stable identifier pattern: the path is
// compared with the existing value, and no new variable is bound.
case RewriteRequest(ParsePath(`requestList`, _ , _ , _ ), _ , _ ) =>
RewriteResponse("index" :: Nil)
}
In your case, the second form would be the canonical one to choose, but in general the first form will be more powerful.
As a third alternative, you could also define val RequestList = requestList and match against the uppercased version, though I would advise against this unless you have a good reason for creating a capitalised RequestList.