Pyspark Streaming - pyspark

I wrote this code for streaming Iris classification on PySpark, but I get the error "'RDD' object has no attribute '_jdf'". When I tried converting the RDD to a DataFrame, I got "RDD is not iterable" instead. Please help me solve it.
Many thanks.
Here is my code:
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.ml import PipelineModel, Pipeline
from pyspark.sql import Row, DataFrame
from pyspark.sql.types import *
from pyspark.sql.functions import *

conf = SparkConf().setMaster("local[2]").setAppName("NetworkWordCount")
sc = SparkContext.getOrCreate(conf=conf)
ssc = StreamingContext(sc, 1)
lines = ssc.socketTextStream("localhost", 8889)

# Load ML model
sameModel = PipelineModel.load("g:/Demo/DecisionTree_Model1")

# Predict the type of iris from features
result = lines.foreachRDD(lambda rdd: sameModel.transform(rdd))

ssc.start()
ssc.awaitTermination()
And the error: 'RDD' object has no attribute '_jdf'
Py4JJavaError Traceback (most recent call last)
<ipython-input-6-18f3db416f1c> in <module>()
1 ssc.start()
----> 2 ssc.awaitTermination()
E:\Spark\spark\python\pyspark\streaming\context.py in awaitTermination(self, timeout)
204 """
205 if timeout is None:
--> 206 self._jssc.awaitTermination()
207 else:
208 self._jssc.awaitTerminationOrTimeout(int(timeout * 1000))
E:\Spark\spark\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
E:\Spark\spark\python\pyspark\sql\utils.py in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
E:\Spark\spark\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling o35.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
File "E:\Spark\spark\python\pyspark\streaming\util.py", line 65, in call
r = self.func(t, *rdds)
File "E:\Spark\spark\python\pyspark\streaming\dstream.py", line 159, in
<lambda>
func = lambda t, rdd: old_func(rdd)
File "<ipython-input-5-64e27204db5a>", line 1, in <lambda>
result = lines.foreachRDD(lambda rdd: sameModel.transform(rdd))
File "E:\Spark\spark\python\pyspark\ml\base.py", line 173, in transform
return self._transform(dataset)
File "E:\Spark\spark\python\pyspark\ml\pipeline.py", line 262, in _transform
dataset = t.transform(dataset)
File "E:\Spark\spark\python\pyspark\ml\base.py", line 173, in transform
return self._transform(dataset)
File "E:\Spark\spark\python\pyspark\ml\wrapper.py", line 305, in _transform
return DataFrame(self._java_obj.transform(dataset._jdf), dataset.sql_ctx)
AttributeError: 'RDD' object has no attribute '_jdf'
at org.apache.spark.streaming.api.python.TransformFunction.callPythonTransformFunction(PythonDStream.scala:95)
at org.apache.spark.streaming.api.python.TransformFunction.apply(PythonDStream.scala:78)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.api.python.PythonDStream$$anonfun$callForeachRDD$1.apply(PythonDStream.scala:179)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)

The model's transform() expects a DataFrame, not an RDD, which is why the DStream approach fails. The code below shows how to load a pre-trained model, start a Structured Streaming query with a socket source, apply the model's transform to it, and then sink the result to the console.
from random import Random

import pyspark.sql.functions as f
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("transform ml") \
    .getOrCreate()

# Load the pre-trained pipeline model
model = PipelineModel.load("./model")

# Read lines from a socket source as a streaming DataFrame
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Attach an id column and rename the socket's value column to "text"
random = Random()
words = lines.select(f.lit(random.randint(1, 10000)).alias("id"),
                     lines.value.alias("text"))

# Apply the loaded model to the streaming DataFrame
prediction = model.transform(words)

# Sink the predictions to the console
query = prediction \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

query.awaitTermination()
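Note that the saved Iris pipeline presumably expects the four numeric feature columns it was trained on rather than a single text column, so a parsing step would be needed before transform. A rough sketch, assuming the socket delivers comma-separated values, and with hypothetical column names sepal_length, sepal_width, petal_length and petal_width that would have to match the training schema:

from pyspark.sql.functions import split

# Hypothetical parsing step: split each socket line into the four numeric
# feature columns the saved pipeline is assumed to expect.
parts = split(lines.value, ",")
features = lines.select(
    parts.getItem(0).cast("double").alias("sepal_length"),
    parts.getItem(1).cast("double").alias("sepal_width"),
    parts.getItem(2).cast("double").alias("petal_length"),
    parts.getItem(3).cast("double").alias("petal_width"),
)
prediction = model.transform(features)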

Related

Pyspark error on Dataproc while creating dataframe with the schema details

I have a Dataproc cluster with Anaconda. I've created a virtual environment, my-env, inside Anaconda because I need to install the open-source RDKit there, and hence I've installed PySpark again (not the pre-installed one). Now, with the code below, I get an error inside my-env but not outside of it.
Code:
from pyspark.sql.types import StructField, StructType, StringType, LongType
from pyspark.sql import SparkSession
from py4j.protocol import Py4JJavaError

spark = SparkSession.builder.appName("test").getOrCreate()

fields = [StructField("col0", StringType(), True),
          StructField("col1", StringType(), True),
          StructField("col2", StringType(), True),
          StructField("col3", StringType(), True)]
schema = StructType(fields)
chem_info = spark.createDataFrame([], schema)
This is the error I'm getting:
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/sql/session.py", line 749, in createDataFrame
    jrdd = self._jvm.SerDeUtil.toJavaArray(rdd._to_java_object_rdd())
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/rdd.py", line 2297, in _to_java_object_rdd
    rdd = self._pickled()
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/rdd.py", line 196, in _pickled
    return self._reserialize(AutoBatchedSerializer(PickleSerializer()))
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/rdd.py", line 594, in _reserialize
    self = self.map(lambda x: x, preservesPartitioning=True)
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/rdd.py", line 325, in map
    return self.mapPartitionsWithIndex(func, preservesPartitioning)
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/rdd.py", line 365, in mapPartitionsWithIndex
    return PipelinedRDD(self, f, preservesPartitioning)
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/rdd.py", line 2514, in __init__
    self.is_barrier = prev._is_barrier() or isFromBarrier
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/rdd.py", line 2414, in _is_barrier
    return self._jrdd.rdd().isBarrier()
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/pyspark/sql/utils.py", line 63, in deco
    return f(*a, **kw)
File "/home/.conda/envs/my-env/lib/python3.6/site-packages/py4j/protocol.py", line 332, in get_return_value
    format(target_id, ".", name, value))
py4j.protocol.Py4JError: An error occurred while calling o57.isBarrier. Trace:
py4j.Py4JException: Method isBarrier([]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:274)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Can you help me resolve it?
As mentioned in the pyspark: Method isBarrier([]) does not exist question, this error is caused by an incompatibility between the Spark version installed on the Dataproc cluster and the PySpark version you manually installed in your conda environment.
To solve this issue, check the Spark version on the cluster and install the matching version of PySpark:
$ spark-submit --version
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/
Using Scala version 2.12.10, OpenJDK 64-Bit Server VM, 1.8.0_232
$ conda install pyspark==2.4.4
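If you want to double-check from inside the environment that the installed PySpark now matches the cluster, a quick sanity check (assuming the 2.4.4 cluster shown above):

# Should print the same version as `spark-submit --version` reports on the cluster.
import pyspark
print(pyspark.__version__)  # expected: 2.4.4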

Apply transformations on an RDD column while selecting other columns in Pyspark

I want to be able to select multiple columns of an RDD while applying a transformation to one of the values. I am able to
- select specific columns
- apply transformations to one of the columns
but I am unable to apply both of them together.
1) Selecting specific columns
from pyspark import SparkContext

logFile = "/FileStore/tables/tendulkar.csv"
rdd = sc.textFile(logFile)
rdd.map(lambda line: (line.split(",")[0], line.split(",")[1], line.split(",")[2])).take(4)
[('Runs', 'Mins', 'BF'),
('15', '28', '24'),
('DNB', '-', '-'),
('59', '254', '172')]
2) Apply transformations to the 1st column
df = (rdd.map(lambda line: line.split(",")[0])
         .filter(lambda x: x != "DNB")
         .filter(lambda x: x != "TDNB")
         .filter(lambda x: x != "absent")
         .map(lambda x: x.replace("*", "")))
df.take(4)
['Runs', '15', '59', '8']
I tried to do them together as follows
rdd.map(lambda line: ((line.split(",")[0]).filter(lambda x: x != "DNB"), line.split(",")[1], line.split(",")[2])).count()
I get an error
Py4JJavaError Traceback (most recent call last)
<command-2766458519992264> in <module>()
10 .map(lambda x: x.replace("*","")))
11
---> 12 rdd.map(lambda line: ( (line.split(",")[0]).filter(lambda x:x!="DNB"),line.split(",")[1],line.split(",")[2])).count()
/databricks/spark/python/pyspark/rdd.py in count(self)
1067 3
1068 """
-> 1069 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1070
1071 def stats(self):
Please help
Just apply the filter on the first element of each row, after the map in which you select all the columns you want. (In the combined attempt, .filter is called on line.split(",")[0], which is a plain string and has no filter method, hence the failure.)
rdd.map(lambda line: line.split(",")[:3]) \
.filter(lambda x: x[0] not in ["DNB", "TDNB", "absent"])
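If you also want the replace("*", "") step from part 2 applied to the first column, a sketch along the same lines:

# Keep the first three columns, drop DNB/TDNB/absent rows,
# then strip "*" from the Runs column only.
cleaned = (rdd.map(lambda line: line.split(",")[:3])
              .filter(lambda x: x[0] not in ["DNB", "TDNB", "absent"])
              .map(lambda x: [x[0].replace("*", "")] + x[1:]))
cleaned.take(4)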

pyspark streaming word count example no console output

I want to use PySpark Streaming to count words for files placed in /predix/test/ and save the output in /predix/output/, while the console prints the word counts, e.g. {hello: 5}.
Below is the code, but the console never shows output like {hello: 5}. Can someone point out where my errors are?
Thanks.
import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]")
    sc = SparkContext(appName='streamingWordsCount', conf=conf)
    ssc = StreamingContext(sc, 5)  # batch interval in seconds: 5

    lines = ssc.textFileStream("/predix/test")
    words = lines.flatMap(lambda line: line.split(" "))
    pairs = words.map(lambda word: (word, 1))
    wordCounts = pairs.reduceByKey(lambda x, y: x + y)

    wordCounts.pprint()
    wordCounts.saveASTextFile("/predix/output")

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
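One thing worth checking in the snippet above: DStream has no saveASTextFile method, only saveAsTextFiles (plural, taking a path prefix), so the job most likely fails with an AttributeError before any batch reaches the console. A minimal sketch of the corrected sink, assuming the /predix/output prefix is what is wanted:

# saveAsTextFiles takes a prefix; each batch is written to
# /predix/output-<batch timestamp> rather than a single directory.
wordCounts.pprint()
wordCounts.saveAsTextFiles("/predix/output")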

Error when calling UDF using broadcasted objects in PySpark

I am trying to invoke a UDF that uses a broadcasted object in PySpark.
Here is a minimal example that reproduces the situation and error:
import pyspark.sql.functions as sf
from pyspark.sql.types import LongType

class SquareClass:
    def compute(self, n):
        return n ** 2

square = SquareClass()
square_sc = sc.broadcast(square)

def f(n):
    return square_sc.value.compute(n)

numbers = sc.parallelize([{'id': i} for i in range(10)]).toDF()
f_udf = sf.udf(f, LongType())
numbers.select(f_udf(numbers.id)).show(10)
The stacktrace and error message that this snippet produces:
Traceback (most recent call last)
<ipython-input-75-6e38c014e4b2> in <module>()
13 f_udf = sf.udf(f, LongType())
14
---> 15 numbers.select(f_udf(numbers.id)).show(10)
/usr/hdp/current/spark-client/python/pyspark/sql/dataframe.py in show(self, n, truncate)
255 +---+-----+
256 """
--> 257 print(self._jdf.showString(n, truncate))
258
259 def __repr__(self):
/usr/local/lib/python3.5/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id,
<snip>
An error occurred while calling o938.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 49.0 failed 1 times, most recent failure: Lost task 1.0 in stage 49.0 (TID 587, localhost): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
When you call the attributes of square_sc, you're referencing the class SquareClass, which is not present on the workers.
If you want to use a Python package, class, or function in a UDF, the workers need access to it. You can achieve this by putting the code in a Python script and deploying it with --py-files when running spark-submit or pyspark.
One thing you can do is keep the class in a separate module and add that module to the SparkContext:
class_module.py
class SquareClass:
    def compute(self, n):
        return n ** 2
pyspark-shell
import pyspark.sql.functions as sf
from pyspark.sql.types import LongType
from class_module import SquareClass

sc.addFile('class_module.py')

square = SquareClass()
square_sc = sc.broadcast(square)

def f(n):
    return square_sc.value.compute(n)

f_udf = sf.udf(f, LongType())
numbers = sc.parallelize([{'id': i} for i in range(10)]).toDF()
numbers.select(f_udf(numbers.id)).show(10)
+-----+
|f(id)|
+-----+
| 0|
| 1|
| 4|
| 9|
| 16|
| 25|
| 36|
| 49|
| 64|
| 81|
+-----+
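If addFile alone doesn't make the module importable on the executors in your setup, SparkContext.addPyFile is the variant that also puts the file on the workers' import path; a minimal alternative sketch:

# addPyFile ships class_module.py to every executor and adds it to their
# PYTHONPATH, so the import also resolves inside the UDF.
sc.addPyFile('class_module.py')
from class_module import SquareClass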

spark UDF Java Error: Method col([class java.util.ArrayList]) does not exist

I have a python dict as:
fileClass = {'a1' : ['a','b','c','d'], 'b1':['a','e','d'], 'c1': ['a','c','d','f','g']}
and a list of tuples as:
C = [('a','b'), ('c','d'),('e')]
I want to finally create a spark dataframe as:
Name (a,b) (c,d) (e)
a1 2 2 0
b1 1 1 1
c1 1 2 0
which simply contains, for each item in the dict fileClass, the count of elements from each tuple that appear in its value list.
To do this I create a dict mapping each element to a column index:
classLoc = {'a':0,'b':0,'c':1,'d':1,'e':2}
Then I define the conversion with a UDF:
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType

def convertDictToDF(v, classLoc, length):
    R = np.zeros((1, length))
    for c in v:
        try:
            loc = classLoc[c]
            R[loc] += 1
        except:
            pass
    return R

udfConvertDictToDF = udf(convertDictToDF, ArrayType(IntegerType()))

df = sc.parallelize([
    [k] + list(udfConvertDictToDF(v, classLoc, len(C)))
    for k, v in fileClass.items()]).toDF(['Name'] + C)
Then I got this error message:
---------------------------------------------------------------------------
Py4JError Traceback (most recent call last)
<ipython-input-40-ab668a12838a> in <module>()
1 df = sc.parallelize([
2 [k] + list(udfConvertDictToDF(v,classLoc, len(C)))
----> 3 for k, v in fileClass.items()]).toDF(['Name'] + C)
4
5 df.show()
/home/yizhng/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/functions.pyc in __call__(self, *cols)
1582 def __call__(self, *cols):
1583 sc = SparkContext._active_spark_context
-> 1584 jc = self._judf.apply(_to_seq(sc, cols, _to_java_column))
1585 return Column(jc)
1586
/home/yizhng/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/column.pyc in _to_seq(sc, cols, converter)
58 """
59 if converter:
---> 60 cols = [converter(c) for c in cols]
61 return sc._jvm.PythonUtils.toSeq(cols)
62
/home/yizhng/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/column.pyc in _to_java_column(col)
46 jcol = col._jc
47 else:
---> 48 jcol = _create_column_from_name(col)
49 return jcol
50
/home/yizhng/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/column.pyc in _create_column_from_name(name)
39 def _create_column_from_name(name):
40 sc = SparkContext._active_spark_context
---> 41 return sc._jvm.functions.col(name)
42
43
/home/yizhng/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/home/yizhng/spark-1.6.0-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/home/yizhng/spark-1.6.0-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
310 raise Py4JError(
311 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
--> 312 format(target_id, ".", name, value))
313 else:
314 raise Py4JError(
Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.col. Trace:
py4j.Py4JException: Method col([class java.util.ArrayList]) does not exist
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:335)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:360)
at py4j.Gateway.invoke(Gateway.java:254)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
I don't understand what is wrong with my UDF that leads to that error message. Please help.
I think it has to do with the way you are using this line at the bottom:
[k] + list(udfConvertDictToDF(v, classLoc, len(C)))
When I do a plain Python version of it, I get an error as well.
import numpy as np

C = [('a','b'), ('c','d'), ('e')]
classLoc = {'a': 0, 'b': 0, 'c': 1, 'd': 1, 'e': 2}

def convertDictToDF(v, classLoc, length):
    # I also got rid of (1, length) in favor of (length,)
    # b/c pandas .from_dict() method handles this for me
    R = np.zeros(length)
    for c in v:
        try:
            loc = classLoc[c]
            R[loc] += 1
        except:
            pass
    return R

[[k] + convertDictToDF(v, classLoc, len(C))
 for k, v in fileClass.items()]
which produces this error:
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('S32') dtype('S32') dtype('S32')
If you were to change the list comprehension to a dict comprehension, you could get it to work.
dict = {k: convertDictToDF(v, classLoc, len(C))
        for k, v in fileClass.items()}
the output of which looks like this
> {'a1': array([ 2., 2., 0.]), 'c1': array([ 1., 2., 0.]), 'b1': array([ 1., 1., 1.])}
Without knowing what your end use case is, I'm going to get you to the output you requested, but using a slightly different approach, which may not scale how you'd like, so I'm sure there's a better way.
The following code will get you the rest of the way to a dataframe:
import pandas as pd

df = pd.DataFrame.from_dict(data=dict, orient='index').sort_index()
df.columns = C
which produces your desired output
(a, b) (c, d) e
a1 2.0 2.0 0.0
b1 1.0 1.0 1.0
c1 1.0 2.0 0.0
And this will get you a Spark dataframe
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
df_s = sqlContext.createDataFrame(df)
df_s.show()
+----------+----------+---+
|('a', 'b')|('c', 'd')| e|
+----------+----------+---+
| 2.0| 2.0|0.0|
| 1.0| 1.0|1.0|
| 1.0| 2.0|0.0|
+----------+----------+---+
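The desired output in the question also keeps the row names (a1, b1, c1); one way to carry them into the Spark dataframe, assuming the pandas frame df built above (and that createDataFrame stringifies the tuple column names as it did here), is to promote the index to a Name column first:

# Promote the pandas index to a regular "Name" column so Spark keeps the row labels.
df_named = df.reset_index().rename(columns={"index": "Name"})
df_s = sqlContext.createDataFrame(df_named)
df_s.show()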