I need to retrieve data from Kafka using PySpark, but I keep getting "py4j.protocol.Py4JError: An error occurred while calling o26.createStream". I'm totally new to Kafka and PySpark; any help would be appreciated. My code is as follows:
import sys
import os
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.10:2.2.1 pyspark-shell'

if __name__ == "__main__":
    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 60)
    print("spark context set")
    zkQuorum, topic = 'localhost:2181', 'near_line'
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "console-consumer-68081", {topic: 1})
    print("connection set")
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()
    ssc.start()
    ssc.awaitTermination()
I'm getting the following error:
Exception in thread "Thread-2" java.lang.NoClassDefFoundError: kafka/common/TopicAndPartition
at java.lang.Class.getDeclaredMethods0(Native Method)
at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
at java.lang.Class.privateGetPublicMethods(Class.java:2902)
at java.lang.Class.getMethods(Class.java:1615)
at py4j.reflection.ReflectionEngine.getMethodsByNameAndLength(ReflectionEngine.java:345)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:305)
at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
at py4j.Gateway.invoke(Gateway.java:272)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: kafka.common.TopicAndPartition
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 12 more
ERROR:root:Exception while sending command.
Traceback (most recent call last):
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/java_gateway.py", line 1035, in send_command
raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/java_gateway.py", line 883, in send_command
response = connection.send_command(command)
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/java_gateway.py", line 1040, in send_command
"Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
Traceback (most recent call last):
File "/home/nayanam/PycharmProjects/recommendation_engine/derivation/kafka_consumer_test.py", line 37, in <module>
kvs = KafkaUtils.createStream(ssc, zkQuorum, "console-consumer-68081", {topic: 1})
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/pyspark/streaming/kafka.py", line 70, in createStream
jstream = helper.createStream(ssc._jssc, kafkaParams, topics, jlevel)
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/java_gateway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/home/nayanam/anaconda3/lib/python3.5/site-packages/py4j/protocol.py", line 327, in get_return_value
format(target_id, ".", name))
py4j.protocol.Py4JError: An error occurred while calling o26.createStream
My problem: how can I call a function inside another function in a class using a PySpark UDF?
I am trying to write a PySpark UDF using a method from a class called Anomalie in the file devAM_hive.py:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType
import re

spark = SparkSession.builder.getOrCreate()

class Anomalie():
    def __init__(self):
        self.Anomalie_udf = F.udf(Anomalie.aux, ArrayType(StringType()))

    def aux(texte):
        code_utilisateur = re.findall(r'[\s]*\d{2}.\d{2}.\d{4}[\s]*\d{2}.\d{2}.\d{2}\s(\w?\.?\s?.*)\s\(', texte)
        return code_utilisateur

    def auto_test(self, df):
        df = df.withColumn("name", self.Anomalie_udf(F.col("Description")))
        return df
When I call this from the main file, I get the error "No module named 'devAM_hive'", even though the module in which I defined the class is imported:
from devAM_hive import *
A=Anomalie()
df=A.auto_test(row_data)
df.select("name").show(50)
The error message:
22/04/09 14:30:58 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 5)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 588, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 249, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 69, in read_command
command = serializer._read_with_length(file)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
return self.loads(obj)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'devAM_hive'
When I call this from the main file, I get the error "No module named 'devAM_hive'". But the module in which I defined the class is imported.
Importing works because you import the module on the driver, where the file is available (sitting next to your main file). But running the UDF fails because your executors don't have it. What you want is to distribute that file using --py-files; that way the class ends up on the executors' path.
spark = (SparkSession
         .builder
         .appName('Test App')
         .config('spark.submit.pyFiles', '/path/to/devAM_hive.py')
         .getOrCreate()
         )
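If you don't want to bake the path into the session config, the same file can be shipped at runtime with SparkContext.addPyFile (or with spark-submit --py-files /path/to/devAM_hive.py). A minimal sketch, with the path to devAM_hive.py assumed:
# Sketch: distribute the module file to the executors at runtime,
# then import and use the class as in the question.
spark.sparkContext.addPyFile('/path/to/devAM_hive.py')

from devAM_hive import Anomalie

A = Anomalie()
A.auto_test(row_data).select("name").show(50)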
I am trying to save a CSV file using a Windows path (with "\" instead of "/"). I think it does not work because of the Windows path.
Is that why the code does not work?
Is there a workaround?
The code:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row

def init_spark(appname):
    spark = SparkSession.builder.appName(appname).getOrCreate()
    sc = spark.sparkContext
    return spark, sc

def run_on_configs_spark():
    spark, sc = init_spark(appname="bucket_analysis")
    p_configs_RDD = sc.parallelize([1, 4, 5])
    p_configs_RDD = p_configs_RDD.map(mul)
    schema = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
    df = spark.createDataFrame(p_configs_RDD, schema)
    df.write.saveAsTable(r"C:\Users\yuvalr\Desktop\example_csv", format="csv")

def mul(x):
    return (x, x**2)

run_on_configs_spark()
The error code:
Traceback (most recent call last):
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 426, in <module>
analysis()
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 408, in analysis
run_CDH()
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 420, in run_CDH
max_prob_for_extension=None, max_base_size_B=4096,OP_arr=[0.2],
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 173, in settings_print
dic=get_map_of_worst_seq(params)
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 245, in get_map_of_worst_seq
run_over_settings_spark_test(info_obj)
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 239, in run_over_settings_spark_test
run_on_configs_spark(configs)
File "C:\Users\yuvalr\Desktop\Git_folder\algo_sim\Bucket_analysis\Set_multiple_configurations\spark_parallelized_configs.py", line 17, in run_on_configs_spark
df.write.saveAsTable(r"C:\Users\yuvalr\Desktop\example_csv",format="csv")
File "C:\Users\yuvalr\Desktop\spark\Spark\python\pyspark\sql\readwriter.py", line 868, in saveAsTable
self._jwrite.saveAsTable(name)
File "C:\Users\yuvalr\venv\lib\site-packages\py4j\java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Users\yuvalr\Desktop\spark\Spark\python\pyspark\sql\utils.py", line 137, in deco
raise_from(converted)
File "<string>", line 3, in raise_from
pyspark.sql.utils.ParseException:
mismatched input ':' expecting {<EOF>, '.', '-'}(line 1, pos 1)
== SQL ==
C:\Users\yuvalr\Desktop\example_csv
-^^^
As I see it, the problem is with your output line. Try this instead:
df.write.csv("file:///C:/Users/yuvalr/Desktop/example_csv.csv")
Yes, I know you're on Windows and expecting backslashes, but PySpark isn't: forward slashes work fine.
Keep in mind that Spark writes a directory of part files at that path either way; the .csv extension just makes it obvious what the folder contains.
You don't need a raw r"" string for this.
The file:/// prefix makes it explicit that this is a local file path rather than a table name.
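For completeness, a minimal end-to-end sketch (assuming the same data and schema as in the question); Spark creates a directory of part files at the target path:
# Sketch: build the same small DataFrame and write it as CSV to a local Windows path.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType

spark = SparkSession.builder.appName("bucket_analysis").getOrCreate()
schema = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
df = spark.createDataFrame([(x, x**2) for x in [1, 4, 5]], schema)

# Spark writes a directory named example_csv.csv containing part-*.csv files.
df.write.mode("overwrite").csv("file:///C:/Users/yuvalr/Desktop/example_csv.csv")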
As you can see, saveAsTable() expects a table name; the table is written under the directory configured by spark.sql.warehouse.dir.
saveAsTable(name, format=None, mode=None, partitionBy=None, **options)
Parameters
name – the table name
format – the format used to save
mode – one of append, overwrite, error, errorifexists, ignore (default: error)
partitionBy – names of partitioning columns
options – all other string options
Source: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter
Workaround (mind the doubled backslashes on Windows, e.g. C:\\):
Set spark.sql.warehouse.dir to point at the destination directory, as below.
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row

def init_spark(appname):
    spark = SparkSession.builder \
        .config("spark.sql.warehouse.dir", "C:\\Users\\yuvalr\\Desktop") \
        .appName(appname).getOrCreate()
    sc = spark.sparkContext
    return spark, sc

def run_on_configs_spark():
    spark, sc = init_spark(appname="bucket_analysis")
    p_configs_RDD = sc.parallelize([1, 4, 5])
    p_configs_RDD = p_configs_RDD.map(mul)
    schema = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
    df = spark.createDataFrame(p_configs_RDD, schema)
    df.write.saveAsTable("example_csv", format="csv", mode="overwrite")

def mul(x):
    return (x, x**2)

run_on_configs_spark()
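To verify the result, the managed table can be read back by name (a quick sketch, reusing the same SparkSession; the underlying CSV files sit under the configured spark.sql.warehouse.dir):
# Sketch: read the saved managed table back and inspect it.
spark, sc = init_spark(appname="bucket_analysis")
spark.read.table("example_csv").show()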
Edit 1:
If you want an external table (an external path where the underlying files are stored), you can use the path option as below:
#df.write.option("path", "C:\\Users\\yuvalr\\Desktop").saveAsTable("example_csv", format="csv", mode="overwrite")
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row

def init_spark(appname):
    spark = SparkSession.builder \
        .appName(appname).getOrCreate()
    sc = spark.sparkContext
    return spark, sc

def run_on_configs_spark():
    spark, sc = init_spark(appname="bucket_analysis")
    p_configs_RDD = sc.parallelize([1, 4, 5])
    p_configs_RDD = p_configs_RDD.map(mul)
    schema = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
    df = spark.createDataFrame(p_configs_RDD, schema)
    df.write.option("path", "C:\\Users\\yuvalr\\Desktop").saveAsTable("example_csv", format="csv", mode="overwrite")

def mul(x):
    return (x, x**2)

run_on_configs_spark()
I'm facing a problem trying to include com.databricks:spark-xml_2.10:0.4.1 in my PySpark code in PyCharm:
import os
import pyspark
from pyspark.shell import sc
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *

sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell"
)

if __name__ == '__main__':
    df = sqlContext.read.format('org.apache.spark.sql.xml') \
        .option('rowTag', 'lei:Extension') \
        .load('C:\\Users\\Consultant\\Desktop\\20170501-gleif-concatenated-file'
              '-lei2.xml')
    df.show()
but it returns:
Exception in thread "main" org.apache.spark.SparkException: Cannot load main class from JAR file:/C:/spark-2.4.5-bin-hadoop2.7/python/dependency
at org.apache.spark.deploy.SparkSubmitArguments.error(SparkSubmitArguments.scala:657)
at org.apache.spark.deploy.SparkSubmitArguments.loadEnvironmentArguments(SparkSubmitArguments.scala:221)
at org.apache.spark.deploy.SparkSubmitArguments.<init>(SparkSubmitArguments.scala:116)
at org.apache.spark.deploy.SparkSubmit$$anon$2$$anon$1.<init>(SparkSubmit.scala:907)
at org.apache.spark.deploy.SparkSubmit$$anon$2.parseArguments(SparkSubmit.scala:907)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:81)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Traceback (most recent call last):
File "C:/spark-2.4.5-bin-hadoop2.7/python/test.py", line 2, in <module>
from pyspark.shell import sc
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\shell.py", line 38, in <module>
SparkContext._ensure_initialized()
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\context.py", line 316, in _ensure_initialized
SparkContext._gateway = gateway or launch_gateway(conf)
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\java_gateway.py", line 46, in launch_gateway
return _launch_gateway(conf)
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\java_gateway.py", line 108, in _launch_gateway
raise Exception("Java gateway process exited before sending its port number")
Exception: Java gateway process exited before sending its port number
I'd like to add the external jar directly in PyCharm. Is this possible?
Thanks in advance.
You should set your environment variable as the first step of your script:
import os
os.environ["PYSPARK_SUBMIT_ARGS"] = (
"--packages com.databricks:spark-xml_2.10:0.4.1"
)
import pyspark
...
Then, if you want to do this for every script you run, use PyCharm's Run Configurations. You can add a template following these steps:
Go to Edit Configurations
In Templates, edit the Python template
Add an environment variable like PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-xml_2.10:0.4.1"
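Alternatively, the package coordinate can be declared on the SparkSession itself via spark.jars.packages, which avoids relying on the environment variable entirely. A sketch, assuming a fresh session and the file path from the question:
# Sketch: pull the spark-xml package through the session config instead of PYSPARK_SUBMIT_ARGS.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("spark-xml example")
         .config("spark.jars.packages", "com.databricks:spark-xml_2.10:0.4.1")
         .getOrCreate())

df = (spark.read.format("com.databricks.spark.xml")
      .option("rowTag", "lei:Extension")
      .load("C:\\Users\\Consultant\\Desktop\\20170501-gleif-concatenated-file-lei2.xml"))
df.show()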
Hope it helps.
I use PySpark streaming with checkpoints enabled.
The first launch succeeds, but on restart it crashes with the following error:
INFO scheduler.DAGScheduler: ResultStage 6 (runJob at PythonRDD.scala:441) failed in 1,160 s due to Job aborted due to stage failure: Task 0 in stage 6.0 failed 4 times, most recent failure: Lost task 0.3 in stage 6.0 (TID 86, h-1.e-contenta.com, executor 2): org.apache.spark.api.python.PythonException:
Traceback (most recent call last):
File"/data1/yarn/nm/usercache/appcache/application_1481115309392_0229/container_1481115309392_0229_01_000003/pyspark.zip/pyspark/worker.py", line 163, in main
func, profiler, deserializer, serializer = read_command(pickleSer, infile)
File"/data1/yarn/nm/usercache/appcache/application_1481115309392_0229/container_1481115309392_0229_01_000003/pyspark.zip/pyspark/worker.py", line 56, in read_command
command = serializer.loads(command.value)
File"/data1/yarn/nm/usercache/appcache/application_1481115309392_0229/container_1481115309392_0229_01_000003/pyspark.zip/pyspark/serializers.py", line 431, in loads return pickle.loads(obj, encoding=encoding)
ImportError: No module named ...
The Python modules are added via the SparkContext's addPyFile():
def create_streaming():
    """
    Create streaming context and processing functions
    :return: StreamingContext
    """
    sc = SparkContext(conf=spark_config)
    zip_path = zip_lib(PACKAGES, PY_FILES)
    sc.addPyFile(zip_path)

    ssc = StreamingContext(sc, BATCH_DURATION)
    stream = KafkaUtils.createStream(ssc=ssc,
                                     zkQuorum=','.join(ZOOKEEPER_QUORUM),
                                     groupId='new_group',
                                     topics={topic: 1})
    stream.checkpoint(BATCH_DURATION)
    stream = stream \
        .map(lambda x: process(ujson.loads(x[1]), geo_data_bc_value)) \
        .foreachRDD(lambda_log_writer(topic, schema_bc_value))
    ssc.checkpoint(STREAM_CHECKPOINT)
    return ssc

if __name__ == '__main__':
    ssc = StreamingContext.getOrCreate(STREAM_CHECKPOINT, lambda: create_streaming())
    ssc.start()
    ssc.awaitTermination()
Sorry, it was my mistake. Try this:
if __name__ == '__main__':
    ssc = StreamingContext.getOrCreate(STREAM_CHECKPOINT, lambda: create_streaming())
    # When the context is recovered from the checkpoint, create_streaming() is not
    # called again, so add the zipped modules to the recovered context as well.
    ssc.sparkContext.addPyFile(zip_lib(PACKAGES, PY_FILES))
    ssc.start()
    ssc.awaitTermination()
Please guide me through the steps to connect to and read data from MS SQL Server using PySpark.
Below is my code and the error message I get when trying to load data from MS SQL Server. Please guide me.
import urllib
import findspark
findspark.init()
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
APP_NAME = 'My Spark Application'
conf = SparkConf().setAppName("APP_NAME").setMaster("local[4]")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)
jdbcDF = sqlcontext.read.format("jdbc")\
.option("url", "jdbc:sqlserver:XXXX:1433")\
.option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")\
.option("dbtable", "dbo.XXXX")\
.option("user", "XXXX")\
.option("password", "XXX")\
.load()
******************************ERROR***************************************
teway.py", line 1133, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\spark-2.0.1-bin-hadoop2.6\python\pyspark\sql\utils.py", line 63, in d
eco
return f(*a, **kw)
File "C:\spark-2.0.1-bin-hadoop2.6\python\lib\py4j-0.10.3-src.zip\py4j\protoco
l.py", line 319, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o66.load.
: java.lang.NullPointerException
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:167)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:117)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:53)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:330)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:122)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Thread.java:745)
The following solution worked for me:
Include the mssql-jdbc-7.0.0.jre8.jar file in the jars sub-folder (e.g. C:\spark\spark-2.2.2-bin-hadoop2.7\jars), or whichever driver jar matches your setup.
Then use the following command to connect to MS SQL Server and create the Spark DataFrame:
dbData = spark.read.jdbc("jdbc:sqlserver://servername;databaseName=ExampleDB;user=username;password=password", "tablename")
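If you prefer to keep the credentials out of the URL, the same read can take a properties dict (a sketch; the server name, table and credentials are placeholders):
# Sketch: spark.read.jdbc with connection properties passed separately from the URL.
jdbc_url = "jdbc:sqlserver://servername;databaseName=ExampleDB"
connection_properties = {
    "user": "username",
    "password": "password",
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}
dbData = spark.read.jdbc(url=jdbc_url, table="dbo.tablename", properties=connection_properties)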
Download mssql-jdbc-x.x.x.jrex.jar file (https://learn.microsoft.com/en-us/sql/connect/jdbc/download-microsoft-jdbc-driver-for-sql-server?view=sql-server-ver15)
Run the following code:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf, SQLContext
appName = "PySpark SQL Server Example - via JDBC"
master = "local[*]"
conf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("spark.driver.extraClassPath","path/to/mssql-jdbc-x.x.x.jrex.jar")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = SQLContext(sc)
spark = sqlContext.sparkSession
database = "mydatabase"
table = "dbo.mytable"
user = "username"
password = "password"
jdbcDF = spark.read.format("jdbc") \
.option("driver" , "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
.option("url", f"jdbc:sqlserver://serverip:1433;databaseName={database}") \
.option("dbtable", "mytable") \
.option("user", user) \
.option("password", password) \
.load()
jdbcDF.show()
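If you only need part of the table, a query can be pushed down to SQL Server through the dbtable option by wrapping it as a derived table (a sketch; the query itself is just an example):
# Sketch: push a query down to the database instead of loading the whole table.
pushdown_query = "(SELECT TOP 100 * FROM dbo.mytable) AS t"
jdbcDF_subset = spark.read.format("jdbc") \
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
    .option("url", f"jdbc:sqlserver://serverip:1433;databaseName={database}") \
    .option("dbtable", pushdown_query) \
    .option("user", user) \
    .option("password", password) \
    .load()
jdbcDF_subset.show()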