sedona error : java.lang.NoClassDefFoundError: org/opengis/referencing/FactoryException - geospark

/usr/share/spark-3.0/bin/pyspark --queue=szsc \
--master=yarn \
--packages org.apache.sedona:sedona-core-3.0_2.12:1.0.0-incubating,org.apache.sedona:sedona-sql-3.0_2.12:1.0.0-incubating,org.apache.sedona:sedona-viz-3.0_2.12:1.0.0-incubating,org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating \
--driver-memory 4g \
--num-executors 100 \
--executor-memory 8g \
--conf spark.driver.memoryOverhead=5G \
--conf spark.executor.memoryOverhead=5G
spark-sql:
sql5="""
select
'aoi' as type,
b.shipment_id,
b.order_type,
b.sub_order_type,
b.buyer_geo_lat,
b.buyer_geo_lng,
a.aoi_id as region_id,
100 as region_level
from tmp_aoi_polygon_tab a, tmp_buyer_pin_tab b
where ST_Contains(a.aoi_polygon, b.point)
"""
df5=spark.sql(sql5)
df5.count()
error log:
21/05/25 23:31:20 INFO FileSourceScanExec: Planning scan with bin packing, max size: 134217728 bytes, open cost is considered as scanning 4194304 bytes.
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/share/spark-3.0/python/pyspark/sql/dataframe.py", line 585, in count
return int(self._jdf.count())
File "/usr/share/spark-3.0/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/usr/share/spark-3.0/python/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/usr/share/spark-3.0/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o92.count.
: java.lang.NoClassDefFoundError: org/opengis/referencing/FactoryException
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRdd(TraitJoinQueryExec.scala:169)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRdd$(TraitJoinQueryExec.scala:166)
at org.apache.spark.sql.sedona_sql.strategy.join.RangeJoinExec.toSpatialRdd(RangeJoinExec.scala:37)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRddPair(TraitJoinQueryExec.scala:164)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRddPair$(TraitJoinQueryExec.scala:160)
at org.apache.spark.sql.sedona_sql.strategy.join.RangeJoinExec.toSpatialRddPair(RangeJoinExec.scala:37)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.doExecute(TraitJoinQueryExec.scala:65)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.doExecute$(TraitJoinQueryExec.scala:56)
at org.apache.spark.sql.sedona_sql.strategy.join.RangeJoinExec.doExecute(RangeJoinExec.scala:37)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:47)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.buildBuffers(InMemoryRelation.scala:89)
at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.cachedColumnBuffers(InMemoryRelation.scala:65)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.filteredCachedBatches(InMemoryTableScanExec.scala:310)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD$lzycompute(InMemoryTableScanExec.scala:135)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD(InMemoryTableScanExec.scala:124)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.doExecute(InMemoryTableScanExec.scala:341)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:162)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:106)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:106)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture$lzycompute(ShuffleExchangeExec.scala:110)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture(ShuffleExchangeExec.scala:109)
at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.$anonfun$doMaterialize$1(QueryStageExec.scala:160)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.doMaterialize(QueryStageExec.scala:160)
at org.apache.spark.sql.execution.adaptive.QueryStageExec.$anonfun$materialize$1(QueryStageExec.scala:79)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.adaptive.QueryStageExec.materialize(QueryStageExec.scala:79)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$4(AdaptiveSparkPlanExec.scala:175)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$4$adapted(AdaptiveSparkPlanExec.scala:173)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:173)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:159)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:255)
at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:2981)
at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:2980)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.count(Dataset.scala:2980)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.opengis.referencing.FactoryException
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 87 more

The same thing happened to me about two days ago, and I finally found the solution: add and import these libraries.
For Scala (sbt dependencies):
"org.datasyslab" % "geotools-wrapper" % "geotools-24.1"
"org.locationtech.jts" % "jts-core" % "1.17.0"
import org.datasyslab
And for PySpark you need to include the datasyslab geotools wrapper (for the ST SQL functions) and JTS.
This happens because Sedona no longer bundles the dependencies for its SQL functions. I hope this helps you.
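In PySpark the equivalent is to put the same artifacts on spark.jars.packages when the session is created. A minimal sketch, assuming the coordinates from this answer (adjust the versions to your Sedona release):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("sedona-app")  # hypothetical app name
    .config(
        "spark.jars.packages",
        # Sedona's python adapter plus the geotools-wrapper that supplies the org.opengis classes
        "org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating,"
        "org.datasyslab:geotools-wrapper:geotools-24.1",
    )
    .getOrCreate()
)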

For the Python solution, I use pyspark within a virtualenv. I added the missing jars into the virtualenv's Spark directory, $DIR_VIRTUAL_ENV/lib/python3.8/site-packages/pyspark/jars, as follows:
wget https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/1.1.0-25.2/geotools-wrapper-1.1.0-25.2.jar
wget https://repo1.maven.org/maven2/org/apache/sedona/sedona-python-adapter-3.0_2.12/1.2.0-incubating/sedona-python-adapter-3.0_2.12-1.2.0-incubating.jar
wget https://repo1.maven.org/maven2/org/apache/sedona/sedona-viz-3.0_2.12/1.2.0-incubating/sedona-viz-3.0_2.12-1.2.0-incubating.jar
Alternatively, you can download them manually and place them in the aforementioned directory.
Afterwards, restart the pyspark shell; there is no need to import anything else explicitly.
Partially based on https://sedona.apache.org/setup/databricks/.
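Once the shell restarts, a quick sanity check is to register the functions and evaluate one of them. A sketch, assuming the apache-sedona 1.2.0 Python API:

from sedona.register import SedonaRegistrator

SedonaRegistrator.registerAll(spark)  # registers the ST_* SQL functions on the running session
spark.sql("SELECT ST_Point(1.0, 2.0)").show()  # simple smoke test for the spatial functions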
Actual Python environment:
anytree==2.8.0
apache-sedona==1.2.0
astroid==1.3.2
attrs==21.4.0
certifi==2021.10.8
click==8.1.2
click-plugins==1.1.1
cligj==0.7.2
cycler==0.11.0
Fiona==1.8.21
fonttools==4.32.0
geopandas==0.10.2
importlib-metadata==4.11.3
joblib==1.1.0
jts==0.0.3
kiwisolver==1.4.2
logilab-common==1.9.2
mapclassify==2.4.3
matplotlib==3.5.1
munch==2.5.0
mypy-extensions==0.4.3
networkx==2.8
numpy==1.22.3
packaging==21.3
pandas==1.4.2
Pillow==9.1.0
py2puml==0.5.4
py4j==0.10.9.3
pyarrow==7.0.0
pydoop==2.0.0
pylint==1.4.0
pypandoc==1.7.4
pyparsing==3.0.8
pyproj==3.3.0
pyspark==3.2.1
python-dateutil==2.8.2
pytz==2022.1
scikit-learn==1.0.2
scipy==1.8.0
Shapely==1.8.1.post1
six==1.16.0
threadpoolctl==3.1.0
typing-extensions==4.1.1
venv-pack==0.2.0
xlrd==2.0.1
zipp==3.7.0
Disclaimer: I don't have enough reputation to comment in answers.

Related

Getting error while writing the spark dataframe as CSV file

23/01/10 14:51:56 ERROR FileFormatWriter: Aborting job 18d74180-1f1e-44ea-80dc-caa5a2fe0525.
java.io.IOException: Failed to rename DeprecatedRawLocalFileStatus{path=file:/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/_temporary/0/task_202301102021531054031999327673221_0005_m_000000/part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv; isDirectory=false; length=13520; replication=1; blocksize=33554432; modification_time=1673362315593; access_time=1673362315593; owner=; group=; permission=rw-rw-rw-; isSymlink=false; hasAcl=false; isEncrypted=false; isErasureCoded=false} to file:/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:477)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:490)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:405)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$25(FileFormatWriter.scala:267)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:642)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:267)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
23/01/10 14:51:56 WARN FileUtil: Failed to delete file or dir [/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/_temporary/0/task_202301102021531054031999327673221_0005_m_000000/.part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv.crc]: it still exists.
23/01/10 14:51:56 WARN FileUtil: Failed to delete file or dir [/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/_temporary/0/task_202301102021531054031999327673221_0005_m_000000/part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv]: it still exists.
Traceback (most recent call last):
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/manage.py", line 21, in \<module\>
main()
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/manage.py", line 17, in main
execute_from_command_line(sys.argv)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/__init__.py", line 419, in execute_from_command_line
utility.execute()
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/__init__.py", line 413, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/base.py", line 354, in run_from_argv
self.execute(*args, **cmd_options)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/base.py", line 398, in execute
output = self.handle(*args, **options)
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 26, in handle
step.start()
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 45, in start
self.prepare_agent_address_updates()
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 141, in prepare_agent_address_updates
self._load_and_normalize(file_glob, {
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 127, in _load_and_normalize
df.write.option("header", True).csv('test')
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/pyspark/sql/readwriter.py", line 1240, in csv
self._jwrite.csv(path)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/py4j/java_gateway.py", line 1321, in __call__
return_value = get_return_value(
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/pyspark/sql/utils.py", line 190, in deco
return f(*a, **kw)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o298.csv.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:651)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:278)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.IOException: Failed to rename DeprecatedRawLocalFileStatus{path=file:/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/_temporary/0/task
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:490)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:405)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$25(FileFormatWriter.scala:267)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:642)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:267)
... 42 more
As you can see, the above error occurred while trying to write the Spark dataframe as a CSV file:
sparkContext = SparkContext("spark://DESKTOP-1L1BM8L.localdomain:7077", "fcd_spark_session")
spark_configuration = sparkContext._conf.setAll(
    [("spark.shuffle.service.enabled", "false"), ("spark.dynamicAllocation.enabled", "false"),
     ("spark.executor.memory", "2g"), ("spark.executor.instances", 2)])
sparkContext.stop()
self.spark_session = SparkSession.builder.appName("fcd_spark_session").config(
    conf=spark_configuration) \
    .master('spark://DESKTOP-1L1BM8L.localdomain:7077').getOrCreate()
def _load_and_normalize(self, glob_paths, renames=None, columns=[], processed_columns=[],
                        remove_duplicates=[], ):
    renames = renames or {}
    files = sorted(glob.glob(glob_paths))
    for filepath in files:
        file_name = basename(filepath)
        logger.info(f'adding {basename(filepath)}')
        file_write_path = self.csv_fullpath(self.cleaned_folder, "NCOA_address", file_name)
        print(file_write_path)
        if not os.path.exists(file_write_path):
            df = self.read_csv(filepath)
            df = df[columns].copy()
            df = df[:100]
            df = df.fillna('').astype('str')
            df = df.apply(tuple, axis=1).tolist()
            df = self.spark_session.createDataFrame(df, columns)
            df = self.add_procuredate(df, file_name)
            df = self.uppercase_and_trim_all_columns(df)
            for rename_columns in renames:
                df = df.withColumnRenamed(rename_columns, renames[rename_columns])
            all_cols_except_procure = [col for col in df.schema.names if col != 'procure_date']
            df = df.dropDuplicates(all_cols_except_procure)
            df = self.get_normalized_address(df)
            df = self.get_normalized_address(df, col_name='orig_normalized_address',
                                             full_address_col='orig_address', city_col='orig_city',
                                             state_col='orig_state',
                                             zip_col='orig_zip')
            df = df.where((df.full_address != '') & (df.normalized_address != ''))
            df = df.select(processed_columns)
            df = df.dropDuplicates(remove_duplicates)
            df.write.option("header", True).format("csv").csv('test')
            gc.collect()
        else:
            logger.info(f'{basename(filepath)} file is already available in cleaned folder')
Any suggestions? Please comment if you need any additional info regarding the code and config.

"org.apache.spark.SparkException: Writing job aborted" using Databricks and spark connector

I have used Databricks to ingest my knowledge graph (stored as nodes/edges in Delta tables (Parquet)) into Neo4j:
FORMAT = "org.neo4j.spark.DataSource"
BATCH_SIZE = 25000
URL = "***://***"
USERNAME = "***"
PASSWORD = "***"
DATABASE = "***"
neo4jconfig = {
    "url": URL,
    "database": DATABASE,
    "authentication.type": "basic",
    "authentication.basic.username": USERNAME,
    "authentication.basic.password": PASSWORD,
    "batch.size": BATCH_SIZE,
    "relationship.source.save.mode": "Overwrite",
    "relationship.target.save.mode": "Overwrite",
    "schema.optimization.type": "NODE_CONSTRAINTS",
    "relationship.save.strategy": "keys",
    "relationship.source.node.keys": "source:id",
    "relationship.target.node.keys": "target:id",
    "transaction.retries": "20",
    "transaction.retry.timeout": 1000,
}
I already ingested the nodes, and now I'm trying to ingest the edges; it took 8 hours.
done_edges = ["people_friends.delta/"]

def writeEdgeIntoNeo4j(edge: str, input_path: str):
    print(edge)
    edge_df = a.read(input_path + f"edges/{edge.strip('/')}")
    labels = edge_df.select('label').distinct().collect()
    for label in labels:
        filtered_df = edge_df.filter((F.col("label") == label[0]))
        print(f"label : {label[0]}")
        for relation in filtered_df.select('source_label', 'target_label').distinct().collect():
            print(f"source : {relation['source_label']} :: target : {relation['target_label']}")
            neo4jconfig["relationship"] = label[0]
            neo4jconfig["relationship.source.labels"] = f":{relation['source_label']}"
            neo4jconfig["relationship.target.labels"] = f":{relation['target_label']}"
            filtered_df.repartition(1).write.format(FORMAT).mode('overwrite').options(**neo4jconfig).save()

for edge in ["people_friends.delta/"]:
    if "association" not in edge and edge not in done_edges:
        writeEdgeIntoNeo4j(edge, input_path)
and it gives me this org.apache.spark.SparkException error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-4276563430066569> in <module>
1 for edge in ["people_friends.delta/"]:
2 if "association" not in edge:
----> 3 write_edge_into_neo4j(edge, input_path)
<command-4276563430066568> in write_edge_into_neo4j(edge, input_path)
11 neo4jconfig["relationship.source.labels"] = f":{relation['source_label']}"
12 neo4jconfig["relationship.target.labels"] = f":{relation['target_label']}"
---> 13 filtered_df.repartition(1).write.format(FORMAT).mode('overwrite').options(**neo4jconfig).save()
/databricks/spark/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
1132 self.format(format)
1133 if path is None:
-> 1134 self._jwrite.save()
1135 else:
1136 self._jwrite.save(path)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o1319.save.
: org.apache.spark.SparkException: Writing job aborted.
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:423)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:371)
at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.writeWithV2(WriteToDataSourceV2Exec.scala:264)
at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.run(WriteToDataSourceV2Exec.scala:279)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:41)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:41)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.doExecute(V2CommandExec.scala:58)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:213)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:209)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:167)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:166)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:1080)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:130)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:273)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:854)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:223)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:1080)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:400)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:312)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1054.1 failed 4 times, most recent failure: Lost task 0.3 in stage 1054.1 (TID 24864) (10.139.64.35 executor 35): org.neo4j.driver.exceptions.TransientException: Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742
at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:144)
at org.neo4j.driver.internal.InternalTransaction.run(InternalTransaction.java:60)
at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:37)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:64)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.write(BaseDataWriter.scala:45)
at org.neo4j.spark.writer.Neo4jDataWriter.write(Neo4jDataWriter.scala:9)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$1(WriteToDataSourceV2Exec.scala:451)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1680)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:487)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:395)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:150)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:91)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:813)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1646)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:816)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:672)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Suppressed: org.neo4j.driver.internal.util.ErrorUtil$InternalExceptionCause
at org.neo4j.driver.internal.util.ErrorUtil.newNeo4jError(ErrorUtil.java:103)
at org.neo4j.driver.internal.async.inbound.InboundMessageDispatcher.handleFailureMessage(InboundMessageDispatcher.java:122)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.unpackFailureMessage(CommonMessageReader.java:83)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.read(CommonMessageReader.java:59)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:83)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:35)
at org.neo4j.driver.internal.shaded.io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:299)
at org.neo4j.driver.internal.async.inbound.MessageDecoder.channelRead(MessageDecoder.java:47)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:314)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:435)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:279)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:722)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:658)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:584)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:496)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
at org.neo4j.driver.internal.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
... 1 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2828)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2775)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2769)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2769)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1305)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1305)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1305)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3036)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2977)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2965)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1067)
at org.apache.spark.SparkContext.runJobInternal(SparkContext.scala:2477)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2460)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:392)
... 34 more
Caused by: org.neo4j.driver.exceptions.TransientException: Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742
at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:144)
at org.neo4j.driver.internal.InternalTransaction.run(InternalTransaction.java:60)
at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:37)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:64)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.write(BaseDataWriter.scala:45)
at org.neo4j.spark.writer.Neo4jDataWriter.write(Neo4jDataWriter.scala:9)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$1(WriteToDataSourceV2Exec.scala:451)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1680)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:487)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:395)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:150)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:91)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:813)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1646)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:816)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:672)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Suppressed: org.neo4j.driver.internal.util.ErrorUtil$InternalExceptionCause
at org.neo4j.driver.internal.util.ErrorUtil.newNeo4jError(ErrorUtil.java:103)
at org.neo4j.driver.internal.async.inbound.InboundMessageDispatcher.handleFailureMessage(InboundMessageDispatcher.java:122)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.unpackFailureMessage(CommonMessageReader.java:83)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.read(CommonMessageReader.java:59)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:83)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:35)
at org.neo4j.driver.internal.shaded.io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:299)
at org.neo4j.driver.internal.async.inbound.MessageDecoder.channelRead(MessageDecoder.java:47)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:314)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:435)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:279)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:722)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:658)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:584)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:496)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
at org.neo4j.driver.internal.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
... 1 more
I have read about what the cause of the "org.apache.spark.SparkException: Writing job aborted" error could be (I may be wrong). This error sometimes occurs if your JAR version is incompatible with other JARs.
Other times, the error occurs when there is a memory-intensive operation and not enough memory.
But I'm not sure; maybe there is another source of the issue that I couldn't see.
Error - Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742
The error says Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742. This is a common error for Neo4j:
there is a version mismatch between the driver and the database. You need to update the database and the driver to the latest version.
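Before upgrading, it can help to confirm which server version you are actually talking to. A hypothetical sketch using the official neo4j Python driver (URL, USERNAME, PASSWORD, and DATABASE are the placeholders from the question):

from neo4j import GraphDatabase  # official Neo4j Python driver, assumed installed

driver = GraphDatabase.driver(URL, auth=(USERNAME, PASSWORD))
with driver.session(database=DATABASE) as session:
    # dbms.components() reports the server version the Spark connector must be compatible with
    for record in session.run("CALL dbms.components() YIELD name, versions, edition"):
        print(record["name"], record["versions"], record["edition"])
driver.close()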

pyflink with kafka java.lang.RuntimeException: Failed to create stage bundle factory

・Python 3.8
・JDK 11
I've started learning PyFlink and wrote some code following the official guide at https://nightlies.apache.org/flink/flink-docs-master/docs/dev/python/datastream/intro_to_datastream_api/
Here is my code:
from pyflink.common.serialization import JsonRowDeserializationSchema, JsonRowSerializationSchema
from pyflink.common import WatermarkStrategy, Row
from pyflink.common.serialization import Encoder
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer

def streaming():
    env = StreamExecutionEnvironment.get_execution_environment()
    deserialization_schema = JsonRowDeserializationSchema.builder().type_info(
        type_info=Types.ROW([Types.INT(), Types.STRING()])).build()
    kafka_consumer = FlinkKafkaConsumer(
        topics='test',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'})
    ds = env.add_source(kafka_consumer)
    ds = ds.map(lambda a: Row(a % 4, 1),
                output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
        .key_by(lambda a: a[0]) \
        .reduce(lambda a, b: Row(a[0], a[1] + b[1]))
    serialization_schema = JsonRowSerializationSchema.builder().with_type_info(
        type_info=Types.ROW([Types.LONG(), Types.LONG()])).build()
    kafka_sink = FlinkKafkaProducer(
        topic='test_sink_topic',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092',
                         'group.id': 'test_group'})
    ds.add_sink(kafka_sink)
    env.execute('datastream_api_demo')

if __name__ == '__main__':
    streaming()
At first it told me to specify the jar files, so I downloaded the flink-connector-kafka and kafka-clients jar files from https://mvnrepository.com/artifact/org.apache.flink and put them into the pyflink/lib directory.
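An alternative to copying jars into pyflink/lib is to register them on the execution environment. A minimal sketch, assuming a Flink 1.14.4 / Scala 2.11 setup like the one in the log below (the jar path is hypothetical):

from pyflink.datastream import StreamExecutionEnvironment

env = StreamExecutionEnvironment.get_execution_environment()
# add_jars expects file:// URLs; this path is a hypothetical download location
env.add_jars("file:///C:/work/pyflink_demo/jars/flink-sql-connector-kafka_2.11-1.14.4.jar")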
And now I'm at the next step, getting this error:
(pyflink_demo) C:\work\pyflink_demo>python Kafka_stream_Kafka.py
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.flink.api.java.ClosureCleaner (file:/C:/work/pyflink_demo/Lib/site-packages/pyflink/lib/flink-dist_2.11-1.14.4.jar) to field java.util.Properties.serialVersionUID
WARNING: Please consider reporting this to the maintainers of org.apache.flink.api.java.ClosureCleaner
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
Traceback (most recent call last):
File "Kafka_stream_Kafka.py", line 38, in <module>
streaming()
File "Kafka_stream_Kafka.py", line 33, in streaming
env.execute('datastream_api_demo')
File "C:\work\pyflink_demo\lib\site-packages\pyflink\datastream\stream_execution_environment.py", line 691, in execute
return JobExecutionResult(self._j_stream_execution_environment.execute(j_stream_graph))
File "C:\work\pyflink_demo\lib\site-packages\py4j\java_gateway.py", line 1285, in __call__
return_value = get_return_value(
File "C:\work\pyflink_demo\lib\site-packages\pyflink\util\exceptions.py", line 146, in deco
return f(*a, **kw)
File "C:\work\pyflink_demo\lib\site-packages\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o0.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:137)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:642)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$1(AkkaInvocationHandler.java:258)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1389)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$1.onComplete(AkkaFutureUtils.java:47)
at akka.dispatch.OnComplete.internal(Future.scala:300)
at akka.dispatch.OnComplete.internal(Future.scala:297)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:224)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:221)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
at org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$DirectExecutionContext.execute(AkkaFutureUtils.java:65)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68)
at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:284)
at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:284)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:284)
at akka.pattern.PromiseActorRef.$bang(AskSupport.scala:621)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:24)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:23)
at scala.concurrent.Future.$anonfun$andThen$1(Future.scala:532)
at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:29)
at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:29)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:63)
at akka.dispatch.BatchingExecutor$BlockableBatch.$anonfun$run$1(BatchingExecutor.scala:100)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:81)
at akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:100)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:49)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:48)
at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:82)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:252)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:242)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:233)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:684)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:79)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:444)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRpcInvocation$1(AkkaRpcActor.java:316)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:83)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:314)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:217)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:78)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:163)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at akka.actor.Actor.aroundReceive(Actor.scala:537)
at akka.actor.Actor.aroundReceive$(Actor.scala:535)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
at akka.actor.ActorCell.invoke(ActorCell.scala:548)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
at akka.dispatch.Mailbox.run(Mailbox.scala:231)
at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
... 5 more
Caused by: java.lang.RuntimeException: Failed to create stage bundle factory!
INFO:root:Initializing Python harness: C:\work\pyflink_demo\lib\site-packages\pyflink\fn_execution\beam\beam_boot.py --id=4-1 --provision_endpoint=localhost:51794
INFO:root:Starting up Python harness in loopback mode.
at org.apache.flink.streaming.api.runners.python.beam.BeamPythonFunctionRunner.createStageBundleFactory(BeamPythonFunctionRunner.java:566)
at org.apache.flink.streaming.api.runners.python.beam.BeamPythonFunctionRunner.open(BeamPythonFunctionRunner.java:255)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.open(AbstractPythonFunctionOperator.java:131)
at org.apache.flink.streaming.api.operators.python.AbstractOneInputPythonFunctionOperator.open(AbstractOneInputPythonFunctionOperator.java:116)
at org.apache.flink.streaming.api.operators.python.PythonProcessOperator.open(PythonProcessOperator.java:59)
at org.apache.flink.streaming.runtime.tasks.RegularOperatorChain.initializeStateAndOpenOperators(RegularOperatorChain.java:110)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:711)
at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$SynchronizedStreamTaskActionExecutor.call(StreamTaskActionExecutor.java:100)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreInternal(StreamTask.java:687)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:654)
at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(Task.java:958)
at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:927)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:766)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:575)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.UncheckedExecutionException: java.lang.IllegalStateException: Process died with exit code 0
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2050)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.get(LocalCache.java:3952)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3974)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4958)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LocalLoadingCache.getUnchecked(LocalCache.java:4964)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$SimpleStageBundleFactory.<init>(DefaultJobBundleFactory.java:451)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$SimpleStageBundleFactory.<init>(DefaultJobBundleFactory.java:436)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory.forStage(DefaultJobBundleFactory.java:303)
at org.apache.flink.streaming.api.runners.python.beam.BeamPythonFunctionRunner.createStageBundleFactory(BeamPythonFunctionRunner.java:564)
... 14 more
Caused by: java.lang.IllegalStateException: Process died with exit code 0
at org.apache.beam.runners.fnexecution.environment.ProcessManager$RunningProcess.isAliveOrThrow(ProcessManager.java:75)
at org.apache.beam.runners.fnexecution.environment.ProcessEnvironmentFactory.createEnvironment(ProcessEnvironmentFactory.java:112)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$1.load(DefaultJobBundleFactory.java:252)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$1.load(DefaultJobBundleFactory.java:231)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3528)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2277)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2154)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2044)
... 22 more
I tried to figure out what's going on and found a very similar question: What's wrong with my Pyflink setup that Python UDFs throw py4j exceptions?
That question was caused by a network proxy problem: the JVM and Python communicate over a local socket, so local traffic must be exempted from the proxy.
I set the environment variable "no_proxy", but it doesn't work; what I tried is sketched below.
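For reference, this is roughly what I tried (a minimal sketch; the host list is just the usual loopback entries):
import os
# Exempt local loopback traffic from the proxy before creating the
# StreamExecutionEnvironment; this did not fix the error in my case.
os.environ["no_proxy"] = "localhost,127.0.0.1"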
Could anyone provide a solution for this?
There is no useful information in the exception stack to help identify the problem. This should be caused by a known issue (FLINK-26543, already fixed, but not yet released). The issue only occurs in loopback mode, which is enabled by default when executing the job locally.
For now, you could try to force the job to run in process mode instead of loopback mode by setting the environment variable _python_worker_execution_mode to process, as in the sketch below. After doing this, you should see the root cause of the failure.
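A minimal sketch of that workaround, assuming the variable is set in the driver script before the job is built (it can equally be exported in the shell before launching the script):
import os
# Force PyFlink to start Python workers in process mode rather than loopback
# mode, so the real failure surfaces in the worker logs.
os.environ["_python_worker_execution_mode"] = "process"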
Besides, there is also a small issue in your code. I guess you meant ds.map(lambda a: Row(a[0] % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) instead of ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])), as the % operation is not supported on a Row object; see the corrected call below.
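For clarity, the corrected call, assuming the stream elements are one-field Rows as in the original script:
from pyflink.common import Row
from pyflink.common.typeinfo import Types

# Index into the incoming Row first; `%` is defined on its long field,
# not on the Row object itself.
ds = ds.map(
    lambda a: Row(a[0] % 4, 1),
    output_type=Types.ROW([Types.LONG(), Types.LONG()]),
)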
I have tried the script. I am not quite sure what caused the error. Try starting Kafka and creating the topics before running the script, or start Kafka and run the script a second time after the first failure.

pyspark issue while connecting

I am new to PySpark.
I was trying to initialize a PySpark session, but I am getting the error below. I am running the pyspark2 command on my local machine.
When I first tried it with Scala, the Spark session was created correctly. It is only when I invoke PySpark that I get the error. Please let me know how I can get past this error.
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/08 22:55:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/08 22:55:41 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:833)
C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\bin\..\python\pyspark\shell.py:42: UserWarning: Failed to initialize Spark session.
warnings.warn("Failed to initialize Spark session.")
Traceback (most recent call last):
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\bin\..\python\pyspark\shell.py", line 38, in <module>
spark = SparkSession._create_shell_session() # type: ignore
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\sql\session.py", line 553, in _create_shell_session
return SparkSession.builder.getOrCreate()
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\sql\session.py", line 228, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 392, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 146, in __init__
self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 209, in _do_init
self._jsc = jsc or self._initialize_context(self._conf._jconf)
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 329, in _initialize_context
return self._jvm.JavaSparkContext(jconf)
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\lib\py4j-0.10.9.3-src.zip\py4j\java_gateway.py", line 1585, in __call__
return_value = get_return_value(
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\lib\py4j-0.10.9.3-src.zip\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.storage.StorageUtils$
at org.apache.spark.storage.BlockManagerMasterEndpoint.<init>(BlockManagerMasterEndpoint.scala:110)
at org.apache.spark.SparkEnv$.$anonfun$create$9(SparkEnv.scala:348)
at org.apache.spark.SparkEnv$.registerOrLookupEndpoint$1(SparkEnv.scala:287)
at org.apache.spark.SparkEnv$.create(SparkEnv.scala:336)
at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:191)
at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:277)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:460)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:833)
C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\bin>SUCCESS: The process with PID 21928 (child process of PID 14900) has been terminated.
SUCCESS: The process with PID 14900 (child process of PID 31720) has been terminated.
SUCCESS: The process with PID 31720 (child process of PID 10468) has been terminated.

Spark to read Cassandra

Hi, I am trying to extract data from Cassandra using AWS Glue, writing the job in PySpark. Below is the code, which gave me the error that follows. Please suggest how I can import the required classes/drivers.
I want to extract from Cassandra and create files in S3 buckets.
import sys
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sparkContext = SparkContext()
glueContext = GlueContext(sparkContext)
sparkSession = glueContext.spark_session

# Alternative: use the CData JDBC driver to read Cassandra data from the Customer
# table into a DataFrame (note the populated JDBC URL and driver class name):
#source_df = sparkSession.read.format("jdbc").option("url", "jdbc:cassandra:RTK=5246...;Database=MyCassandraDB;Port=7000;Server=db-datastax02c-dc2.stage.impello.co.uk;").option("dbtable", "reads_by_received_date").option("driver", "cdata.jdbc.cassandra.CassandraDriver").load()
#df = glueContext.read.format("jdbc").option("driver", jdbc_driver_name).option("url", db_url).option("dbtable", table_name).option("user", db_username).option("password", db_password).load()

glueJob = Job(glueContext)
glueJob.init(args['JOB_NAME'], args)

# Read the Cassandra table via the spark-cassandra-connector data source.
# Note: no trailing backslash after .load() -- a line continuation followed
# by a comment line is a syntax error.
testdf = sparkSession.read.format("org.apache.spark.sql.cassandra") \
    .option("spark.cassandra.connection.host", "server") \
    .options(table="reads_by_received_date", keyspace="keyspace") \
    .option("spark.cassandra.auth.username", "username") \
    .option("spark.cassandra.auth.password", "password") \
    .load()
    #.select(*)
    #.where("received_year in (2020)")
    #.cache()

# Convert the DataFrame to AWS Glue's DynamicFrame.
dynamic_dframe = DynamicFrame.fromDF(testdf, glueContext, "dynamic_df")

# Write the DynamicFrame as CSV files to a folder in an S3 bucket.
datatransfer = glueContext.write_dynamic_frame.from_options(
    frame=dynamic_dframe,
    connection_type="s3",
    connection_options={"path": "s3://bucket/"},
    format="csv",
    transformation_ctx="datasink4",
)
glueJob.commit()
Error:
Aug 28, 2020, 4:43:27 PM Pending execution
Traceback (most recent call last):
File "/tmp/CassandraToS3", line 27, in <module>
.option("spark.cassandra.auth.password", "password") \
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 172, in load
return self._df(self._jreader.load())
File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
answer, self.gateway_client, self.target_id, self.name)
File "/opt/amazon/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 63, in deco
return f(*a, **kw)
File "/opt/amazon/spark/python/lib/py4j-0.10.7-src.zip/py4j/protocol.py", line 328, in get_return_value
format(target_id, ".", name), value)
py4j.protocol.Py4JJavaError: An error occurred while calling o75.load.
: java.io.IOException: Failed to open native connection to Cassandra at {} :: Could not reach any contact point, make sure you've provided valid addresses (showing first 1 nodes, use getAllErrors() for more): Node(endPoint=/127.0.0.1:9042, hostId=null, hashCode=4f522a41): [com.datastax.oss.driver.api.core.connection.ConnectionInitException: [s0|control|connecting...] Protocol initialization request, step 1 (OPTIONS): failed to send request (java.nio.channels.ClosedChannelException)]
at com.datastax.spark.connector.cql.CassandraConnector$.com$datastax$spark$connector$cql$CassandraConnector$$createSession(CassandraConnector.scala:181)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$3.apply(CassandraConnector.scala:169)
at com.datastax.spark.connector.cql.CassandraConnector$$anonfun$3.apply(CassandraConnector.scala:169)
at com.datastax.spark.connector.cql.RefCountedCache.createNewValueAndKeys(RefCountedCache.scala:32)
at com.datastax.spark.connector.cql.RefCountedCache.syncAcquire(RefCountedCache.scala:69)
at com.datastax.spark.connector.cql.RefCountedCache.acquire(RefCountedCache.scala:57)
at com.datastax.spark.connector.cql.CassandraConnector.openSession(CassandraConnector.scala:89)
at com.datastax.spark.connector.cql.CassandraConnector.withSessionDo(CassandraConnector.scala:111)
at com.datastax.spark.connector.rdd.partitioner.dht.TokenFactory$.forSystemLocalPartitioner(TokenFactory.scala:98)
at org.apache.spark.sql.cassandra.CassandraSourceRelation$.apply(CassandraSourceRelation.scala:680)
at org.apache.spark.sql.cassandra.DefaultSource.createRelation(DefaultSource.scala:57)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:318)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:223)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:167)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: com.datastax.oss.driver.api.core.AllNodesFailedException: Could not reach any contact point, make sure you've provided valid addresses (showing first 1 nodes, use getAllErrors() for more): Node(endPoint=/127.0.0.1:9042, hostId=null, hashCode=4f522a41): [com.datastax.oss.driver.api.core.connection.ConnectionInitException: [s0|control|connecting...] Protocol initialization request, step 1 (OPTIONS): failed to send request (java.nio.channels.ClosedChannelException)]
at com.datastax.oss.driver.api.core.AllNodesFailedException.copy(AllNodesFailedException.java:141)
at com.datastax.oss.driver.internal.core.util.concurrent.CompletableFutures.getUninterruptibly(CompletableFutures.java:149)
at com.datastax.oss.driver.api.core.session.SessionBuilder.build(SessionBuilder.java:633)
at com.datastax.spark.connector.cql.DefaultConnectionFactory$.createSession(CassandraConnectionFactory.scala:144)
at com.datastax.spark.connector.cql.CassandraConnector$.com$datastax$spark$connector$cql$CassandraConnector$$createSession(CassandraConnector.scala:175)
... 25 more
Suppressed: com.datastax.oss.driver.api.core.connection.ConnectionInitException: [s0|control|connecting...] Protocol initialization request, step 1 (OPTIONS): failed to send request (java.nio.channels.ClosedChannelException)
at com.datastax.oss.driver.internal.core.channel.ProtocolInitHandler$InitRequest.fail(ProtocolInitHandler.java:342)
at com.datastax.oss.driver.internal.core.channel.ChannelHandlerRequest.writeListener(ChannelHandlerRequest.java:87)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:577)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:551)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:490)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.addListener(DefaultPromise.java:183)
at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPromise.addListener(DefaultChannelPromise.java:95)
at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPromise.addListener(DefaultChannelPromise.java:30)
at com.datastax.oss.driver.internal.core.channel.ChannelHandlerRequest.send(ChannelHandlerRequest.java:76)
at com.datastax.oss.driver.internal.core.channel.ProtocolInitHandler$InitRequest.send(ProtocolInitHandler.java:183)
at com.datastax.oss.driver.internal.core.channel.ProtocolInitHandler.onRealConnect(ProtocolInitHandler.java:118)
at com.datastax.oss.driver.internal.core.channel.ConnectInitHandler.lambda$connect$0(ConnectInitHandler.java:57)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListener0(DefaultPromise.java:577)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListeners0(DefaultPromise.java:570)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListenersNow(DefaultPromise.java:549)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.notifyListeners(DefaultPromise.java:490)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.setValue0(DefaultPromise.java:615)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.setFailure0(DefaultPromise.java:608)
at com.datastax.oss.driver.shaded.netty.util.concurrent.DefaultPromise.tryFailure(DefaultPromise.java:117)
at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.fulfillConnectPromise(AbstractNioChannel.java:321)
at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:337)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:702)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493)
at com.datastax.oss.driver.shaded.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989)
at com.datastax.oss.driver.shaded.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at com.datastax.oss.driver.shaded.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
... 1 more
Suppressed: com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AnnotatedConnectException: Connection refused: /127.0.0.1:9042
Caused by: java.net.ConnectException: Connection refused
at sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)
at sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:714)
at com.datastax.oss.driver.shaded.netty.channel.socket.nio.NioSocketChannel.doFinishConnect(NioSocketChannel.java:330)
at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.finishConnect(AbstractNioChannel.java:334)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:702)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:650)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:576)
at com.datastax.oss.driver.shaded.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:493)
at com.datastax.oss.driver.shaded.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989)
at com.datastax.oss.driver.shaded.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at com.datastax.oss.driver.shaded.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.nio.channels.ClosedChannelException
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AbstractUnsafe.newClosedChannelException(AbstractChannel.java:957)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AbstractUnsafe.flush0(AbstractChannel.java:921)
at com.datastax.oss.driver.shaded.netty.channel.nio.AbstractNioChannel$AbstractNioUnsafe.flush0(AbstractNioChannel.java:354)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel$AbstractUnsafe.flush(AbstractChannel.java:897)
at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPipeline$HeadContext.flush(DefaultChannelPipeline.java:1372)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeFlush0(AbstractChannelHandlerContext.java:748)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeFlush(AbstractChannelHandlerContext.java:740)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.flush(AbstractChannelHandlerContext.java:726)
at com.datastax.oss.driver.shaded.netty.channel.ChannelDuplexHandler.flush(ChannelDuplexHandler.java:127)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeFlush0(AbstractChannelHandlerContext.java:748)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.invokeWriteAndFlush(AbstractChannelHandlerContext.java:763)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.write(AbstractChannelHandlerContext.java:788)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.writeAndFlush(AbstractChannelHandlerContext.java:756)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannelHandlerContext.writeAndFlush(AbstractChannelHandlerContext.java:806)
at com.datastax.oss.driver.shaded.netty.channel.DefaultChannelPipeline.writeAndFlush(DefaultChannelPipeline.java:1025)
at com.datastax.oss.driver.shaded.netty.channel.AbstractChannel.writeAndFlush(AbstractChannel.java:294)
at com.datastax.oss.driver.internal.core.channel.ChannelHandlerRequest.send(ChannelHandlerRequest.java:75)
... 20 more
AWS Glue does not provide native library support for Cassandra. You need to get the Cassandra connector and follow the steps described in ETL jobs against non-native JDBC data sources.
Once you have the jar downloaded from here, you can pass it to your job and use it in your PySpark script, for example as sketched below.
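A minimal sketch of wiring that up with boto3, assuming the connector jar has already been uploaded to S3; the bucket, key, and job name are placeholders, and the same value can also be set in the console as the job's dependent jars path (the --extra-jars special parameter):
import boto3

# Attach the connector jar to the existing Glue job through the
# --extra-jars special parameter (comma-separated S3 paths).
glue = boto3.client("glue")
job = glue.get_job(JobName="CassandraToS3")["Job"]

args = job.get("DefaultArguments", {})
args["--extra-jars"] = "s3://my-bucket/jars/spark-cassandra-connector-assembly.jar"

glue.update_job(
    JobName="CassandraToS3",
    JobUpdate={
        "Role": job["Role"],
        "Command": job["Command"],
        "DefaultArguments": args,
    },
)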