including external jar into pyspark using pycharm - import

I'm facing a problem trying to include com.databricks:spark-xml_2.10:0.4.1 in my PySpark code in PyCharm:
import pyspark
from pyspark.shell import sc
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import SparkSession
import os
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)
os.environ["PYSPARK_SUBMIT_ARGS"] = (
"--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell"
)
if __name__ == '__main__':
df = sqlContext.read.format('org.apache.spark.sql.xml') \
.option('rowTag', 'lei:Extension')
.load('C:\\Users\\Consultant\\Desktop\\20170501-gleif-concatenated-file'
'-lei2.xml')
df.show()
but what it returns is
Exception in thread "main" org.apache.spark.SparkException: Cannot load main class from JAR file:/C:/spark-2.4.5-bin-hadoop2.7/python/dependency
at org.apache.spark.deploy.SparkSubmitArguments.error(SparkSubmitArguments.scala:657)
at org.apache.spark.deploy.SparkSubmitArguments.loadEnvironmentArguments(SparkSubmitArguments.scala:221)
at org.apache.spark.deploy.SparkSubmitArguments.<init>(SparkSubmitArguments.scala:116)
at org.apache.spark.deploy.SparkSubmit$$anon$2$$anon$1.<init>(SparkSubmit.scala:907)
at org.apache.spark.deploy.SparkSubmit$$anon$2.parseArguments(SparkSubmit.scala:907)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:81)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Traceback (most recent call last):
File "C:/spark-2.4.5-bin-hadoop2.7/python/test.py", line 2, in <module>
from pyspark.shell import sc
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\shell.py", line 38, in <module>
SparkContext._ensure_initialized()
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\context.py", line 316, in _ensure_initialized
SparkContext._gateway = gateway or launch_gateway(conf)
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\java_gateway.py", line 46, in launch_gateway
return _launch_gateway(conf)
File "C:\spark-2.4.5-bin-hadoop2.7\python\pyspark\java_gateway.py", line 108, in _launch_gateway
raise Exception("Java gateway process exited before sending its port number")
Exception: Java gateway process exited before sending its port number
I'd like to add the external jar directly in PyCharm. Is this possible?
Thanks in advance.

You should set your environment variable as the first step of your script:
import os

# Set this before any SparkContext is created (importing pyspark.shell creates
# one), because the JVM gateway is launched with these arguments.
# spark-submit needs a primary resource after its options; for a gateway
# started from Python that resource is the literal "pyspark-shell".
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages com.databricks:spark-xml_2.10:0.4.1 pyspark-shell"
)
import pyspark
...
Then, if you want to do this for any script you run, use Run Configurations of pycharm. You can add a template following these steps:
Go to Edit Configurations
In Templates, edit the python template
Add an Environment value like PYSPARK_SUBMIT_ARGS="--packages com.databricks:spark-xml_2.10:0.4.1"
Hope it helps.

Related

udf using class method pyspark

My problem: how can I call a function inside another function in a class using a PySpark UDF?
I am trying to write a pyspark udf using a method from a class called Anomalie in the file devAM_hive.py
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
import re
class Anomalie():
    """Bundle a regex extractor with the Spark UDF that applies it to a column."""

    def __init__(self):
        # Wrap the plain function as a UDF whose result type is array<string>.
        self.Anomalie_udf = F.udf(Anomalie.aux, ArrayType(StringType()))

    @staticmethod
    def aux(texte):
        """Return every regex capture found in ``texte``.

        The pattern looks for two digit groups shaped like ``dd?dd?dddd`` and
        ``dd?dd?dd`` (``?`` being any single character) and captures the text
        that follows them up to the last whitespace + ``(`` on the line.

        Args:
            texte: The string to scan, or ``None``.

        Returns:
            A list of captured strings (empty when nothing matches), or
            ``None`` when ``texte`` is ``None`` so that a missing value does
            not raise inside the UDF.
        """
        if texte is None:
            return None
        code_utilisateur = re.findall(r'[\s]*\d{2}.\d{2}.\d{4}[\s]*\d{2}.\d{2}.\d{2}\s(\w?\.?\s?.*)\s\(', texte)
        return code_utilisateur

    def auto_test(self, df):
        """Return ``df`` with a "name" column computed from its "Description" column."""
        return df.withColumn("name", self.Anomalie_udf(F.col("Description")))
When I call this from the main file, I get the error "No module named 'devAM_hive'", even though the module in which I defined the class is imported:
from devAM_hive import *
A=Anomalie()
df=A.auto_test(row_data)
df.select("name").show(50)
The error message:
22/04/09 14:30:58 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 5)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 588, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 249, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/worker.py", line 69, in read_command
command = serializer._read_with_length(file)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
return self.loads(obj)
File "/opt/mapr/spark/spark-3.1.2/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'devAM_hive'
When i call this from the main file. I am getting an error named " No module named 'devAM_hive'". But my module in which I defined the class is imported.
Importing works because the import happens on the driver, where the module is available (it sits next to your main file). Running fails because your executors don't have it. What you want to do is distribute that module using --py-files; that way it will be on the executors' Python path.
spark = (SparkSession
.builder
.appName('Test App')
.config('spark.submit.pyFiles', '/path/to/devAM_hive.py')
.getOrCreate()
)

Access home directory within spark task node

Directory: /home/hadoop/
module.py
def incr(value):
    """Return ``value + 1`` converted to ``int``.

    ``None`` is passed through unchanged so the function can be wrapped in a
    UDF over a column that contains missing values without raising.
    """
    if value is None:
        return None
    return int(value + 1)
main.py
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
import sys
sys.path.append('/home/hadoop/')
import module
if __name__ == '__main__':
df = spark.createDataFrame([['a', 1], ['b', 2]], schema=['id', 'value'])
df.show()
print(module.incr(5)) #this works
# this throws module not found error
incr_udf = F.udf(lambda val: module.incr(val), T.IntegerType())
df = df.withColumn('new_value', incr_udf('value'))
df.show()
Spark task nodes do not have access to /home/hadoop/
How do I import module.py from within spark task nodes?
If you are submitting the Spark job to YARN, the task processes are launched by the user 'yarn' on the worker nodes and will not have permission to access /home/hadoop/.
You can add --py-files module.py to your spark-submit command; then you can call the functions in module.py directly by adding from module import *, since the file is shipped into the container.

Pyspark: SaveTable in windows cannot handle windows path

I am trying to save a CSV file using a Windows path (with "\" instead of "/"). I think it does not work because of the Windows path.
Is this the reason the code does not work?
Is there a workaround for the problem?
The code:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
def init_spark(appname):
    """Get or create a SparkSession named ``appname``.

    Returns:
        A ``(spark, sc)`` tuple: the session and its ``sparkContext``.
    """
    spark = SparkSession.builder.appName(appname).getOrCreate()
    return spark, spark.sparkContext
# Reproduction from the question: maps three integers through mul and tries to
# save the resulting two-column DataFrame as CSV.
def run_on_configs_spark():
spark,sc = init_spark(appname="bucket_analysis")
p_configs_RDD = sc.parallelize([1,4,5])
# mul turns each x into the pair (x, x**2), matching the two-field schema below.
p_configs_RDD=p_configs_RDD.map(mul)
schema = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
df=spark.createDataFrame(p_configs_RDD,schema)
# NOTE(review): the first argument of saveAsTable is parsed as a table name;
# this Windows path is the text the ParseException in the traceback points at.
df.write.saveAsTable(r"C:\Users\yuvalr\Desktop\example_csv",format="csv")
def mul(x):
    """Return the pair ``(x, x**2)``."""
    return (x, x ** 2)
run_on_configs_spark()
The error code:
Traceback (most recent call last):
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 426, in <module>
analysis()
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 408, in analysis
run_CDH()
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 420, in run_CDH
max_prob_for_extension=None, max_base_size_B=4096,OP_arr=[0.2],
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 173, in settings_print
dic=get_map_of_worst_seq(params)
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 245, in get_map_of_worst_seq
run_over_settings_spark_test(info_obj)
File "C:/Users/yuvalr/Desktop/Git_folder/algo_sim/Bucket_analysis/Set_multiple_configurations/run_multiple_configurations.py", line 239, in run_over_settings_spark_test
run_on_configs_spark(configs)
File "C:\Users\yuvalr\Desktop\Git_folder\algo_sim\Bucket_analysis\Set_multiple_configurations\spark_parallelized_configs.py", line 17, in run_on_configs_spark
df.write.saveAsTable(r"C:\Users\yuvalr\Desktop\example_csv",format="csv")
File "C:\Users\yuvalr\Desktop\spark\Spark\python\pyspark\sql\readwriter.py", line 868, in saveAsTable
self._jwrite.saveAsTable(name)
File "C:\Users\yuvalr\venv\lib\site-packages\py4j\java_gateway.py", line 1305, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "C:\Users\yuvalr\Desktop\spark\Spark\python\pyspark\sql\utils.py", line 137, in deco
raise_from(converted)
File "<string>", line 3, in raise_from
pyspark.sql.utils.ParseException:
mismatched input ':' expecting {<EOF>, '.', '-'}(line 1, pos 1)
== SQL ==
C:\Users\yuvalr\Desktop\example_csv
-^^^
As I see it the problem is with your output line:
Try this instead:
df.write.csv("file:///C:/Users/yuvalr/Desktop/example_csv.csv")
Yes, I know you're on Windows so you're expecting backslashes, but PySpark isn't
Windows is very sensitive to file extensions - without the .csv, you'll probably just make a folder called example_csv
You don't need a raw r"" string for this
Using the file:/// doubly-confirms that this is a file we're talking about
As you can see, saveAsTable() expects a table name to be provided; the table is then written to the
directory given by spark.sql.warehouse.dir
saveAsTable(name, format=None, mode=None, partitionBy=None, **options)
Parameters
name – the table name
format – the format used to save
mode – one of append, overwrite, error, errorifexists, ignore (default: error)
partitionBy – names of partitioning columns
options – all other string options
Source: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter
Workaround (on Windows, mind the escaped backslashes, e.g. C:\\):
Set spark.sql.warehouse.dir to point to the destination directory, as below:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
def init_spark(appname):
    """Get or create a SparkSession whose warehouse directory is the Desktop.

    Args:
        appname: Application name given to the session builder.

    Returns:
        A ``(spark, sc)`` tuple: the session and its ``sparkContext``.
    """
    spark = (
        SparkSession.builder
        # Every backslash is escaped: "\y" and "\D" are not valid escape
        # sequences. The resulting path string is unchanged.
        .config("spark.sql.warehouse.dir", "C:\\Users\\yuvalr\\Desktop")
        .appName(appname)
        .getOrCreate()
    )
    return spark, spark.sparkContext
def run_on_configs_spark():
    """Save the ``(x, x**2)`` pairs for 1, 4 and 5 as the CSV table ``example_csv``.

    An existing table of that name is overwritten.
    """
    spark, sc = init_spark(appname="bucket_analysis")
    pairs = sc.parallelize([1, 4, 5]).map(mul)
    schema = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
    df = spark.createDataFrame(pairs, schema)
    # saveAsTable takes a table name, not a filesystem path.
    df.write.saveAsTable("example_csv", format="csv", mode="overwrite")
def mul(x):
    """Return the pair ``(x, x**2)``."""
    return (x, x ** 2)
run_on_configs_spark()
Edit 1:
If it is an external table (external path where underlying file is stored), you can use below
#df.write.option("path","C:\\Users\yuvalr\Desktop").saveAsTable("example_csv",format="csv",mode="overwrite")
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import Row
def init_spark(appname):
    """Get or create a SparkSession named ``appname``.

    Returns:
        A ``(spark, sc)`` tuple: the session and its ``sparkContext``.
    """
    spark = SparkSession.builder.appName(appname).getOrCreate()
    return spark, spark.sparkContext
def run_on_configs_spark():
    """Save the ``(x, x**2)`` pairs for 1, 4 and 5 as the CSV table ``example_csv``.

    The writer's "path" option is set to the Desktop directory, and an
    existing table of that name is overwritten.
    """
    spark, sc = init_spark(appname="bucket_analysis")
    pairs = sc.parallelize([1, 4, 5]).map(mul)
    schema = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
    df = spark.createDataFrame(pairs, schema)
    # Every backslash is escaped: "\y" and "\D" are not valid escape sequences.
    # The resulting path string is unchanged.
    writer = df.write.option("path", "C:\\Users\\yuvalr\\Desktop")
    writer.saveAsTable("example_csv", format="csv", mode="overwrite")
def mul(x):
    """Return the pair ``(x, x**2)``."""
    return (x, x ** 2)
run_on_configs_spark()

'module' object has no attribute 'analyse' when using jieba

My PySpark job fails, and the error says: 'module' object has no attribute 'analyse'. But I have already imported jieba.analyse in the script, and a similar script runs successfully locally in the VM. I am not sure why the job fails.
Part of my code is as follows:
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import jieba
from jieba import analyse
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)
text_file = sc.textFile("gs://xxx")
def process_uinfo(line):
    """Replace the title field of a ``title<TAB>content`` line with its keywords.

    Args:
        line: One input record holding exactly two tab-separated fields.

    Returns:
        ``"<tags>\\t<content>"`` where ``<tags>`` is a comma-separated list of
        ``word:weight`` entries for up to 20 keywords extracted from the title.

    Raises:
        ValueError: If the stripped line does not split into exactly two fields.
    """
    # Import the submodule inside the function so it is loaded in whichever
    # process runs this function, not only where the script's imports ran.
    import jieba.analyse

    title, content = line.strip().split('\t')
    l_title = jieba.analyse.extract_tags(title, topK=20, withWeight=True)
    # With withWeight=True the result is a list of (word, weight) pairs, which
    # str.join cannot take directly, so render each pair as text first.
    tags = ",".join("%s:%s" % (word, weight) for word, weight in l_title)
    return "\t".join([tags, content])
out_rdd = text_file.map(process_uinfo)
And the error "'module' object has no attribute 'analyse'" occurs in the following line:
l_title = jieba.analyse.extract_tags(title, topK=20, withWeight=True)

Using magellan geospatial library with apache spark for standalone applications

I am trying to run simple test code in pyspark for printing points using magellan library like from the github repository, but I have problem of undefined sc context.
If I run it from command line with proposed command $SPARK_HOME/bin/spark-submit --packages harsha2010:magellan:1.0.2-s_2.10 everything works because sc is imported automatically but if I run it as a standalone application from eclipse it does not recognize sc.
I have tried all combinations for its initialization including this piece of code:
from pyspark import SparkConf,SparkContext
from magellan.types import Point
from pyspark.sql import Row, SQLContext
#from magellan-master.python.magellan.context import sc
sc = SparkContext(appName="MyGeoFencing")
#sql = SQLContext(sc)
#from magellan.context import sc
#from magellan.context import sc
#from magellan.context import SQLContext
PointRecord = Row("id", "point")
#sparkConf = SparkConf().setAppName("MyGeoFencing")
#sc = SparkContext(conf=sparkConf)
#sql = SQLContext(sc)
sqlCont = SQLContext(sc)
points = sqlCont.parallelize([
(0, Point(-1.0, -1.0)),
(1, Point(-1.0, 1.0)),
(2, Point(1.0, -1.0))]).map(lambda x: PointRecord(*x)).toDF()
points.show()
The problem here is that sqlCont does not have a parallelize method.
I even tried importing sc directly from magellan.context, but that does not work either.
The same problem stands when I use scala!
Do you have some idea how this should work?
Thanks!
This works for me:
sc = spark.sparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
sqlContext = SQLContext(sc)