"org.apache.spark.SparkException: Writing job aborted" using Databricks and spark connector - pyspark

I have used Databricks to ingest my knowledges graph (stored as nodes/edges in delta tables (parquets)) to neo4j
FORMAT = "org.neo4j.spark.DataSource"
BATCH_SIZE = 25000
URL = "***://***"
USERNAME = "***"
PASSWORD = "***"
DATABASE = "***"
neo4jconfig = {
"url" : URL,
"database": DATABASE,
"authentication.type":"basic",
"authentication.basic.username" : USERNAME,
"authentication.basic.password" : PASSWORD,
"batch.size": BATCH_SIZE,
"relationship.source.save.mode" :"Overwrite",
"relationship.target.save.mode" :"Overwrite",
"schema.optimization.type": "NODE_CONSTRAINTS",
"relationship.save.strategy": "keys",
"relationship.source.node.keys": "source:id",
"relationship.target.node.keys": "target:id",
"transaction.retries": "20",
"transaction.retry.timeout": 1000,
}
I already ingested the nodes and now i'm trying to ingest the edges it took 8 hrs
done_edges = ["people_friends.delta/"]
def writeEdgeIntoNeo4j(edge: str, input_path: str):
print(edge)
edge_df = a.read(input_path + f"edges/{edge.strip('/')}")
labels = edge_df.select('label').distinct().collect()
for label in labels:
filtered_df = edge_df.filter((F.col("label") == label[0]))
print(f"label : {label[0]}")
for relation in filtered_df.select('source_label', 'target_label').distinct().collect():
print(f"source : {relation['source_label']} :: target : {relation['target_label']}")
neo4jconfig["relationship"] = label[0]
neo4jconfig["relationship.source.labels"] = f":{relation['source_label']}"
neo4jconfig["relationship.target.labels"] = f":{relation['target_label']}"
filtered_df.repartition(1).write.format(FORMAT).mode('overwrite').options(**neo4jconfig).save()
for edge in ["people_friends.delta/"]:
if "association" not in edge and edge not in done_edges:
writeEdgeIntoNeo4j(edge, input_path)
and it gives me this error org.apache.spark.SparkException
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-4276563430066569> in <module>
1 for edge in ["people_friends.delta/"]:
2 if "association" not in edge:
----> 3 write_edge_into_neo4j(edge, input_path)
<command-4276563430066568> in write_edge_into_neo4j(edge, input_path)
11 neo4jconfig["relationship.source.labels"] = f":{relation['source_label']}"
12 neo4jconfig["relationship.target.labels"] = f":{relation['target_label']}"
---> 13 filtered_df.repartition(1).write.format(FORMAT).mode('overwrite').options(**neo4jconfig).save()
/databricks/spark/python/pyspark/sql/readwriter.py in save(self, path, format, mode, partitionBy, **options)
1132 self.format(format)
1133 if path is None:
-> 1134 self._jwrite.save()
1135 else:
1136 self._jwrite.save(path)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
1302
1303 answer = self.gateway_client.send_command(command)
-> 1304 return_value = get_return_value(
1305 answer, self.gateway_client, self.target_id, self.name)
1306
/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
115 def deco(*a, **kw):
116 try:
--> 117 return f(*a, **kw)
118 except py4j.protocol.Py4JJavaError as e:
119 converted = convert_exception(e.java_exception)
/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
Py4JJavaError: An error occurred while calling o1319.save.
: org.apache.spark.SparkException: Writing job aborted.
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:423)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2$(WriteToDataSourceV2Exec.scala:371)
at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.writeWithV2(WriteToDataSourceV2Exec.scala:264)
at org.apache.spark.sql.execution.datasources.v2.OverwriteByExpressionExec.run(WriteToDataSourceV2Exec.scala:279)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result$lzycompute(V2CommandExec.scala:41)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.result(V2CommandExec.scala:41)
at org.apache.spark.sql.execution.datasources.v2.V2CommandExec.doExecute(V2CommandExec.scala:58)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:213)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:209)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:167)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:166)
at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:1080)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$5(SQLExecution.scala:130)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:273)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withCustomExecutionEnv$1(SQLExecution.scala:104)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:854)
at org.apache.spark.sql.execution.SQLExecution$.withCustomExecutionEnv(SQLExecution.scala:77)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:223)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:1080)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:400)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:312)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380)
at py4j.Gateway.invoke(Gateway.java:295)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:251)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 1054.1 failed 4 times, most recent failure: Lost task 0.3 in stage 1054.1 (TID 24864) (10.139.64.35 executor 35): org.neo4j.driver.exceptions.TransientException: Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742
at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:144)
at org.neo4j.driver.internal.InternalTransaction.run(InternalTransaction.java:60)
at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:37)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:64)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.write(BaseDataWriter.scala:45)
at org.neo4j.spark.writer.Neo4jDataWriter.write(Neo4jDataWriter.scala:9)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$1(WriteToDataSourceV2Exec.scala:451)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1680)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:487)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:395)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:150)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:91)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:813)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1646)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:816)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:672)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Suppressed: org.neo4j.driver.internal.util.ErrorUtil$InternalExceptionCause
at org.neo4j.driver.internal.util.ErrorUtil.newNeo4jError(ErrorUtil.java:103)
at org.neo4j.driver.internal.async.inbound.InboundMessageDispatcher.handleFailureMessage(InboundMessageDispatcher.java:122)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.unpackFailureMessage(CommonMessageReader.java:83)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.read(CommonMessageReader.java:59)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:83)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:35)
at org.neo4j.driver.internal.shaded.io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:299)
at org.neo4j.driver.internal.async.inbound.MessageDecoder.channelRead(MessageDecoder.java:47)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:314)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:435)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:279)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:722)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:658)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:584)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:496)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
at org.neo4j.driver.internal.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
... 1 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2828)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2775)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2769)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2769)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1305)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1305)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1305)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3036)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2977)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2965)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1067)
at org.apache.spark.SparkContext.runJobInternal(SparkContext.scala:2477)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2460)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.writeWithV2(WriteToDataSourceV2Exec.scala:392)
... 34 more
Caused by: org.neo4j.driver.exceptions.TransientException: Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742
at org.neo4j.driver.internal.util.Futures.blockingGet(Futures.java:144)
at org.neo4j.driver.internal.InternalTransaction.run(InternalTransaction.java:60)
at org.neo4j.driver.internal.AbstractQueryRunner.run(AbstractQueryRunner.java:37)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:64)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.writeBatch(BaseDataWriter.scala:93)
at org.neo4j.spark.writer.BaseDataWriter.write(BaseDataWriter.scala:45)
at org.neo4j.spark.writer.Neo4jDataWriter.write(Neo4jDataWriter.scala:9)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.$anonfun$run$1(WriteToDataSourceV2Exec.scala:451)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1680)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:487)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:395)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$3(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.$anonfun$runTask$1(ResultTask.scala:75)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:55)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:150)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:91)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:813)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1646)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:816)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:672)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Suppressed: org.neo4j.driver.internal.util.ErrorUtil$InternalExceptionCause
at org.neo4j.driver.internal.util.ErrorUtil.newNeo4jError(ErrorUtil.java:103)
at org.neo4j.driver.internal.async.inbound.InboundMessageDispatcher.handleFailureMessage(InboundMessageDispatcher.java:122)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.unpackFailureMessage(CommonMessageReader.java:83)
at org.neo4j.driver.internal.messaging.common.CommonMessageReader.read(CommonMessageReader.java:59)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:83)
at org.neo4j.driver.internal.async.inbound.InboundMessageHandler.channelRead0(InboundMessageHandler.java:35)
at org.neo4j.driver.internal.shaded.io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:99)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:299)
at org.neo4j.driver.internal.async.inbound.MessageDecoder.channelRead(MessageDecoder.java:47)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:327)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:314)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.callDecode(ByteToMessageDecoder.java:435)
at org.neo4j.driver.internal.shaded.io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:279)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:357)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1410)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:379)
at org.neo4j.driver.internal.shaded.io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:365)
at org.neo4j.driver.internal.shaded.io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:919)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:166)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:722)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:658)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:584)
at org.neo4j.driver.internal.shaded.io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:496)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:986)
at org.neo4j.driver.internal.shaded.io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74)
at org.neo4j.driver.internal.shaded.io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30)
... 1 more
I have read the this could be I may be wrong but what was the cause of the "org.apache.spark.SparkException: Writing job aborted" error. This error sometimes occur if your JAR version is incompatible with other JAR's
And Other times The error usually occurs when there is memory intensive operation and there is less memory
But i'm not sure maybe there is other source of the issue that i couldn't see

Error - Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742
The error says Database 'neo4j' not up to the requested version: 4768. Latest database version is 4742. This is common error for ‘neo4j’
There is a version problem between the driver and the database version. You need to update the database and driver to the latest version.

Related

Getting error while writing the spark dataframe as CSV file

23/01/10 14:51:56 ERROR FileFormatWriter: Aborting job 18d74180-1f1e-44ea-80dc-caa5a2fe0525.
java.io.IOException: Failed to rename DeprecatedRawLocalFileStatus{path=file:/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/\_temporary/0/task_2023011020
21531054031999327673221_0005_m_000000/part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv; isDirectory=false; length=13520; replication=1; blocksize=33554432; modification_time=16
73362315593; access_time=1673362315593; owner=; group=; permission=rw-rw-rw-; isSymlink=false; hasAcl=false; isEncrypted=false; isErasureCoded=false} to file:/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:477)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:490)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:405)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$25(FileFormatWriter.scala:267)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:642)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:267)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
23/01/10 14:51:56 WARN FileUtil: Failed to delete file or dir \[/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/\_temporary/0/task_202301102021531054031999327673221_0005_m_000000/.part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv.crc\]: it still exists.
23/01/10 14:51:56 WARN FileUtil: Failed to delete file or dir \[/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/\_temporary/0/task_202301102021531054031999327673221_0005_m_000000/part-00000-250cb7ce-e146-4cfd-b9f1-f810af4630f2-c000.csv\]: it still exists.
Traceback (most recent call last):
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/manage.py", line 21, in \<module\>
main()
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/manage.py", line 17, in main
execute_from_command_line(sys.argv)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/__init__.py", line 419, in execute_from_command_line
utility.execute()
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/__init__.py", line 413, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/base.py", line 354, in run_from_argv
self.execute(\*args, \*\*cmd_options)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/django/core/management/base.py", line 398, in execute
output = self.handle(\*args, \*\*options)
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 26, in handle
step.start()
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 45, in start
self.prepare_agent_address_updates()
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 141, in prepare_agent_address_updates
self.\_load_and_normalize(file_glob, {
File "/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/core/management/commands/prepare_ncoa_sp.py", line 127, in \_load_and_normalize
df.write.option("header", True).csv('test')
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/pyspark/sql/readwriter.py", line 1240, in csv
self.\_jwrite.csv(path)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/py4j/java_gateway.py", line 1321, in __call__
return_value = get_return_value(
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/pyspark/sql/utils.py", line 190, in deco
return f(\*a, \*\*kw)
File "/home/admin123/.virtualenvs/fcd/lib/python3.10/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o298.csv.
: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.errors.QueryExecutionErrors$.jobAbortedError(QueryExecutionErrors.scala:651)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:278)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:186)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.IOException: Failed to rename DeprecatedRawLocalFileStatus{path=file:/d/myproject/FCD/Sparck_updated_code/first-class-data-backend/first_class/test/\_temporary/0/task
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:490)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:405)
at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$write$25(FileFormatWriter.scala:267)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:642)
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:267)
... 42 more
As you can see above error occurred while try to write the spark dataframe as csv file
sparkContext = SparkContext("spark://DESKTOP-1L1BM8L.localdomain:7077", "fcd_spark_session")
spark_configuration = sparkContext._conf.setAll(
[("spark.shuffle.service.enabled", "false"), ("spark.dynamicAllocation.enabled", "false"),
("spark.executor.memory", "2g"), ("spark.executor.instances", 2)])
sparkContext.stop()
self.spark_session = SparkSession.builder.appName("fcd_spark_session").config(
conf=spark_configuration) \
.master('spark://DESKTOP-1L1BM8L.localdomain:7077').getOrCreate()
def _load_and_normalize(self, glob_paths, renames=None, columns=[], processed_columns=[],
remove_duplicates=[], ):
renames = renames or {}
files = sorted(glob.glob(glob_paths))
for filepath in files:
file_name = basename(filepath)
logger.info(f'adding {basename(filepath)}')
file_write_path = self.csv_fullpath(self.cleaned_folder, "NCOA_address", file_name)
print(file_write_path)
if not os.path.exists(file_write_path):
df = self.read_csv(filepath)
df = df[columns].copy()
df = df[:100]
df = df.fillna('').astype('str')
df = df.apply(tuple, axis=1).tolist()
df = self.spark_session.createDataFrame(df, columns)
df = self.add_procuredate(df, file_name)
df = self.uppercase_and_trim_all_columns(df)
for rename_columns in renames:
df = df.withColumnRenamed(rename_columns, renames[rename_columns])
all_cols_except_procure = [col for col in df.schema.names if col != 'procure_date']
df = df.dropDuplicates(all_cols_except_procure)
df = self.get_normalized_address(df)
df = self.get_normalized_address(df, col_name='orig_normalized_address',
full_address_col='orig_address', city_col='orig_city',
state_col='orig_state',
zip_col='orig_zip')
df = df.where((df.full_address != '') & (df.normalized_address != ''))
df = df.select(processed_columns)
df = df.dropDuplicates(remove_duplicates)
df.write.option("header", True).format("csv").csv('test')
gc.collect()
else:
logger.info(f'{basename(filepath)} file is already available in cleaned folder')
Any suggestions and please comment if you need any additional info regarding the code and config

pyflink with kafka java.lang.RuntimeException: Failed to create stage bundle factory

・Python3.8
・JDK 11
I've started learning pyflink and write a code instructed by official web which is https://nightlies.apache.org/flink/flink-docs-master/docs/dev/python/datastream/intro_to_datastream_api/
And here is my code
from pyflink.common.serialization import JsonRowDeserializationSchema,JsonRowSerializationSchema
from pyflink.common import WatermarkStrategy, Row
from pyflink.common.serialization import Encoder
from pyflink.common.typeinfo import Types
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer,FlinkKafkaProducer
def streaming():
env = StreamExecutionEnvironment.get_execution_environment()
deserialization_schema =JsonRowDeserializationSchema.builder().type_info(
type_info=Types.ROW([Types.INT(), Types.STRING()])).build()
kafka_consumer = FlinkKafkaConsumer(
topics='test',
deserialization_schema=deserialization_schema,
properties={'bootstrap.servers': 'localhost:9092','group.id': 'test_group'})
ds = env.add_source(kafka_consumer)
ds = ds.map(lambda a: Row(a % 4, 1),
output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
.key_by(lambda a: a[0]) \
.reduce(lambda a, b: Row(a[0], a[1] + b[1]))
serialization_schema = JsonRowSerializationSchema.builder().with_type_info(
type_info=Types.ROW([Types.LONG(), Types.LONG()])).build()
kafka_sink = FlinkKafkaProducer(
topic='test_sink_topic',
serialization_schema=serialization_schema,
producer_config={'bootstrap.servers': 'localhost:9092',
'group.id': 'test_group'})
ds.add_sink(kafka_sink)
env.execute('datastream_api_demo')
if __name__ == '__main__':
streaming()
Firstly it said to me to specify jarfile. So I downloaded flink-connector-kafka and kafka-clients jarfile for each from https://mvnrepository.com/artifact/org.apache.flink and put them into pyflink/lib directory.
And now I'm at next step getting this error;
(pyflink_demo) C:\work\pyflink_demo>python Kafka_stream_Kafka.py
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.flink.api.java.ClosureCleaner (file:/C:/work/pyflink_demo/Lib/site-packages/pyflink/lib/flink-dist_2.11-1.14.4.jar) to field java.util.P
roperties.serialVersionUID
WARNING: Please consider reporting this to the maintainers of org.apache.flink.api.java.ClosureCleaner
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
Traceback (most recent call last):
File "Kafka_stream_Kafka.py", line 38, in <module>
streaming()
File "Kafka_stream_Kafka.py", line 33, in streaming
env.execute('datastream_api_demo')
File "C:\work\pyflink_demo\lib\site-packages\pyflink\datastream\stream_execution_environment.py", line 691, in execute
return JobExecutionResult(self._j_stream_execution_environment.execute(j_stream_graph))
File "C:\work\pyflink_demo\lib\site-packages\py4j\java_gateway.py", line 1285, in __call__
return_value = get_return_value(
File "C:\work\pyflink_demo\lib\site-packages\pyflink\util\exceptions.py", line 146, in deco
return f(*a, **kw)
File "C:\work\pyflink_demo\lib\site-packages\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o0.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:137)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:642)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$1(AkkaInvocationHandler.java:258)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.util.concurrent.FutureUtils.doForward(FutureUtils.java:1389)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$null$1(ClassLoadingUtils.java:93)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:68)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.lambda$guardCompletionWithContextClassLoader$2(ClassLoadingUtils.java:92)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$1.onComplete(AkkaFutureUtils.java:47)
at akka.dispatch.OnComplete.internal(Future.scala:300)
at akka.dispatch.OnComplete.internal(Future.scala:297)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:224)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:221)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
at org.apache.flink.runtime.concurrent.akka.AkkaFutureUtils$DirectExecutionContext.execute(AkkaFutureUtils.java:65)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:68)
at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:284)
at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:284)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:284)
at akka.pattern.PromiseActorRef.$bang(AskSupport.scala:621)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:24)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:23)
at scala.concurrent.Future.$anonfun$andThen$1(Future.scala:532)
at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:29)
at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:29)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:60)
at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:63)
at akka.dispatch.BatchingExecutor$BlockableBatch.$anonfun$run$1(BatchingExecutor.scala:100)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:12)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:81)
at akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:100)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:49)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:48)
at java.base/java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:290)
at java.base/java.util.concurrent.ForkJoinPool$WorkQueue.topLevelExec(ForkJoinPool.java:1020)
at java.base/java.util.concurrent.ForkJoinPool.scan(ForkJoinPool.java:1656)
at java.base/java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1594)
at java.base/java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:183)
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:82)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:252)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:242)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:233)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:684)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:79)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:444)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.lambda$handleRpcInvocation$1(AkkaRpcActor.java:316)
at org.apache.flink.runtime.concurrent.akka.ClassLoadingUtils.runWithContextClassLoader(ClassLoadingUtils.java:83)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:314)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:217)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:78)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:163)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:24)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:20)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:20)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at akka.actor.Actor.aroundReceive(Actor.scala:537)
at akka.actor.Actor.aroundReceive$(Actor.scala:535)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:220)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:580)
at akka.actor.ActorCell.invoke(ActorCell.scala:548)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:270)
at akka.dispatch.Mailbox.run(Mailbox.scala:231)
at akka.dispatch.Mailbox.exec(Mailbox.scala:243)
... 5 more
Caused by: java.lang.RuntimeException: Failed to create stage bundle factory! INFO:root:Initializing Python harness: C:\work\pyflink_demo\lib\site-packages\pyflink\fn_execution\beam\bea
m_boot.py --id=4-1 --provision_endpoint=localhost:51794
INFO:root:Starting up Python harness in loopback mode.
at org.apache.flink.streaming.api.runners.python.beam.BeamPythonFunctionRunner.createStageBundleFactory(BeamPythonFunctionRunner.java:566)
at org.apache.flink.streaming.api.runners.python.beam.BeamPythonFunctionRunner.open(BeamPythonFunctionRunner.java:255)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.open(AbstractPythonFunctionOperator.java:131)
at org.apache.flink.streaming.api.operators.python.AbstractOneInputPythonFunctionOperator.open(AbstractOneInputPythonFunctionOperator.java:116)
at org.apache.flink.streaming.api.operators.python.PythonProcessOperator.open(PythonProcessOperator.java:59)
at org.apache.flink.streaming.runtime.tasks.RegularOperatorChain.initializeStateAndOpenOperators(RegularOperatorChain.java:110)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreGates(StreamTask.java:711)
at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$SynchronizedStreamTaskActionExecutor.call(StreamTaskActionExecutor.java:100)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restoreInternal(StreamTask.java:687)
at org.apache.flink.streaming.runtime.tasks.StreamTask.restore(StreamTask.java:654)
at org.apache.flink.runtime.taskmanager.Task.runWithSystemExitMonitoring(Task.java:958)
at org.apache.flink.runtime.taskmanager.Task.restoreAndInvoke(Task.java:927)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:766)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:575)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.UncheckedExecutionException: java.lang.IllegalStateException: Process died with exit code 0
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2050)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.get(LocalCache.java:3952)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache.getOrLoad(LocalCache.java:3974)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4958)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LocalLoadingCache.getUnchecked(LocalCache.java:4964)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$SimpleStageBundleFactory.<init>(DefaultJobBundleFactory.java:451)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$SimpleStageBundleFactory.<init>(DefaultJobBundleFactory.java:436)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory.forStage(DefaultJobBundleFactory.java:303)
at org.apache.flink.streaming.api.runners.python.beam.BeamPythonFunctionRunner.createStageBundleFactory(BeamPythonFunctionRunner.java:564)
... 14 more
Caused by: java.lang.IllegalStateException: Process died with exit code 0
at org.apache.beam.runners.fnexecution.environment.ProcessManager$RunningProcess.isAliveOrThrow(ProcessManager.java:75)
at org.apache.beam.runners.fnexecution.environment.ProcessEnvironmentFactory.createEnvironment(ProcessEnvironmentFactory.java:112)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$1.load(DefaultJobBundleFactory.java:252)
at org.apache.beam.runners.fnexecution.control.DefaultJobBundleFactory$1.load(DefaultJobBundleFactory.java:231)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3528)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2277)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2154)
at org.apache.beam.vendor.guava.v26_0_jre.com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2044)
... 22 more
I tried to figure out what's going on and found very similar question What's wrong with my Pyflink setup that Python UDFs throw py4j exceptions?
It says that was caused by network proxy problem. JVM and python uses local socket communication. So set local communication with no proxy.
I set environment valuable "no_proxy" but it doesn't work.
enter image description here
Could anyone provide solution for this?
There is no useful information in the exception stack to help to identify the problem. This should be caused by a known issue(FLINK-26543, already solved, however still not released). This issue only occurs in loopback mode which is enabled by default when executing the job locally.
For now, you could try to force the job run in process mode instead of loopback mode by setting environment variable _python_worker_execution_mode to process. After doing this, you should see the root cause of the failure.
Besides, there is also a small issue in your code. I guess you meant ds.map(lambda a: Row(a[0] % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) instead of ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) as it doesn't support % operation in Row object.
I have tried the script. I am not quite sure what caused the error. Try to start kafka first and create the topics, before running the script. Or start kafka and run the script a second time after first failure.

pyspark issue while connecting

I am new to the pyspark.
i was trying to initialize a pyspark session .
But getting the below error. I am doing the pyspark2 command in local machine .
When i tried first time using scala the spark session invokation is correct . Then i tried to invoke Pyspark that time i am getting error. Please let me know how i can come out of this error
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/08 22:55:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/08 22:55:41 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:833)
C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\bin\..\python\pyspark\shell.py:42: UserWarning: Failed to initialize Spark session.
warnings.warn("Failed to initialize Spark session.")
Traceback (most recent call last):
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\bin\..\python\pyspark\shell.py", line 38, in <module>
spark = SparkSession._create_shell_session() # type: ignore
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\sql\session.py", line 553, in _create_shell_session
return SparkSession.builder.getOrCreate()
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\sql\session.py", line 228, in getOrCreate
sc = SparkContext.getOrCreate(sparkConf)
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 392, in getOrCreate
SparkContext(conf=conf or SparkConf())
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 146, in __init__
self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 209, in _do_init
self._jsc = jsc or self._initialize_context(self._conf._jconf)
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\pyspark\context.py", line 329, in _initialize_context
return self._jvm.JavaSparkContext(jconf)
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\lib\py4j-0.10.9.3-src.zip\py4j\java_gateway.py", line 1585, in __call__
return_value = get_return_value(
File "C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\python\lib\py4j-0.10.9.3-src.zip\py4j\protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.NoClassDefFoundError: Could not initialize class org.apache.spark.storage.StorageUtils$
at org.apache.spark.storage.BlockManagerMasterEndpoint.<init>(BlockManagerMasterEndpoint.scala:110)
at org.apache.spark.SparkEnv$.$anonfun$create$9(SparkEnv.scala:348)
at org.apache.spark.SparkEnv$.registerOrLookupEndpoint$1(SparkEnv.scala:287)
at org.apache.spark.SparkEnv$.create(SparkEnv.scala:336)
at org.apache.spark.SparkEnv$.createDriverEnv(SparkEnv.scala:191)
at org.apache.spark.SparkContext.createSparkEnv(SparkContext.scala:277)
at org.apache.spark.SparkContext.<init>(SparkContext.scala:460)
at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:480)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:238)
at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:833)
C:\Spark\spark-3.2.1-bin-hadoop3.2\spark-3.2.1-bin-hadoop3.2\bin>SUCCESS: The process with PID 21928 (child process of PID 14900) has been terminated.
SUCCESS: The process with PID 14900 (child process of PID 31720) has been terminated.
SUCCESS: The process with PID 31720 (child process of PID 10468) has been terminated.

sedona error : java.lang.NoClassDefFoundError: org/opengis/referencing/FactoryException

/usr/share/spark-3.0/bin/pyspark --queue=szsc
--master=yarn
--packages org.apache.sedona:sedona-core-3.0_2.12:1.0.0-incubating,org.apache.sedona:sedona-sql-3.0_2.12:1.0.0-incubating,org.apache.sedona:sedona-viz-3.0_2.12:1.0.0-incubating,org.apache.sedona:sedona-python-adapter-3.0_2.12:1.0.0-incubating
--driver-memory 4g
--num-executors 100
--executor-memory 8g
--conf spark.driver.memoryOverhead=5G
--conf spark.executor.memoryOverhead=5G
spark-sql:
sql5="""
select
'aoi' as type,
b.shipment_id,
b.order_type,
b.sub_order_type,
b.buyer_geo_lat,
b.buyer_geo_lng,
a.aoi_id as region_id,
100 as region_level
from tmp_aoi_polygon_tab a, tmp_buyer_pin_tab b
where ST_Contains(a.aoi_polygon, b.point)
"""
df5=spark.sql(sql5)
df5.count()
error log:
21/05/25 23:31:20 INFO FileSourceScanExec: Planning scan with bin packing, max size: 134217728 bytes, open cost is considered as scanning 4194304 bytes.
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/share/spark-3.0/python/pyspark/sql/dataframe.py", line 585, in count
return int(self._jdf.count())
File "/usr/share/spark-3.0/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
File "/usr/share/spark-3.0/python/pyspark/sql/utils.py", line 128, in deco
return f(*a, **kw)
File "/usr/share/spark-3.0/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o92.count.
: java.lang.NoClassDefFoun`enter code here`dError: org/opengis/referencing/FactoryException
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRdd(TraitJoinQueryExec.scala:169)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRdd$(TraitJoinQueryExec.scala:166)
at org.apache.spark.sql.sedona_sql.strategy.join.RangeJoinExec.toSpatialRdd(RangeJoinExec.scala:37)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRddPair(TraitJoinQueryExec.scala:164)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.toSpatialRddPair$(TraitJoinQueryExec.scala:160)
at org.apache.spark.sql.sedona_sql.strategy.join.RangeJoinExec.toSpatialRddPair(RangeJoinExec.scala:37)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.doExecute(TraitJoinQueryExec.scala:65)
at org.apache.spark.sql.sedona_sql.strategy.join.TraitJoinQueryExec.doExecute$(TraitJoinQueryExec.scala:56)
at org.apache.spark.sql.sedona_sql.strategy.join.RangeJoinExec.doExecute(RangeJoinExec.scala:37)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:47)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.buildBuffers(InMemoryRelation.scala:89)
at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.cachedColumnBuffers(InMemoryRelation.scala:65)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.filteredCachedBatches(InMemoryTableScanExec.scala:310)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD$lzycompute(InMemoryTableScanExec.scala:135)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.inputRDD(InMemoryTableScanExec.scala:124)
at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.doExecute(InMemoryTableScanExec.scala:341)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:162)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:106)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:106)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture$lzycompute(ShuffleExchangeExec.scala:110)
at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.mapOutputStatisticsFuture(ShuffleExchangeExec.scala:109)
at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.$anonfun$doMaterialize$1(QueryStageExec.scala:160)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
at org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec.doMaterialize(QueryStageExec.scala:160)
at org.apache.spark.sql.execution.adaptive.QueryStageExec.$anonfun$materialize$1(QueryStageExec.scala:79)
at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
at org.apache.spark.sql.execution.adaptive.QueryStageExec.materialize(QueryStageExec.scala:79)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$4(AdaptiveSparkPlanExec.scala:175)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$4$adapted(AdaptiveSparkPlanExec.scala:173)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$getFinalPhysicalPlan$1(AdaptiveSparkPlanExec.scala:173)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.getFinalPhysicalPlan(AdaptiveSparkPlanExec.scala:159)
at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:255)
at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:2981)
at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:2980)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
at org.apache.spark.sql.Dataset.count(Dataset.scala:2980)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: org.opengis.referencing.FactoryException
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
... 87 more
The same thing happened to me about 2 days ago and I finally found the solution, try to use and import the library:
For Scala:
"org.datasyslab" % "geotools-wrapper" % "geotools-24.1"
"org.locationtech.jts" % "jts-core" % "1.17.0"
import org.datasyslab
And for pyspark you need to import datasyslab geotools (ST sql functions) and jts.
This happens because sedona no longer incorporates the dependencies for its sql functions, I hope it helps you.
For the Python solution, I use pyspark, within a virtual env. I added missing jars, into the virtual env directory of Spark $DIR_VIRTUAL_ENV/lib/python3.8/site-packages/pyspark/jars, as follows:
wget https://repo1.maven.org/maven2/org/datasyslab/geotools-wrapper/1.1.0-25.2/geotools-wrapper-1.1.0-25.2.jar
wget https://repo1.maven.org/maven2/org/apache/sedona/sedona-python-adapter-3.0_2.12/1.2.0-incubating/sedona-python-adapter-3.0_2.12-1.2.0-incubating.jar
wget https://repo1.maven.org/maven2/org/apache/sedona/sedona-viz-3.0_2.12/1.2.0-incubating/sedona-viz-3.0_2.12-1.2.0-incubating.jar
Instead, you can download them manually, and locate in the aforementioned directory.
Afterwards, exit and start over the pyspark shell, no need to import anything else explicitly.
Partially based on https://sedona.apache.org/setup/databricks/.
Actual Python environment:
anytree==2.8.0
apache-sedona==1.2.0
astroid==1.3.2
attrs==21.4.0
certifi==2021.10.8
click==8.1.2
click-plugins==1.1.1
cligj==0.7.2
cycler==0.11.0
Fiona==1.8.21
fonttools==4.32.0
geopandas==0.10.2
importlib-metadata==4.11.3
joblib==1.1.0
jts==0.0.3
kiwisolver==1.4.2
logilab-common==1.9.2
mapclassify==2.4.3
matplotlib==3.5.1
munch==2.5.0
mypy-extensions==0.4.3
networkx==2.8
numpy==1.22.3
packaging==21.3
pandas==1.4.2
Pillow==9.1.0
py2puml==0.5.4
py4j==0.10.9.3
pyarrow==7.0.0
pydoop==2.0.0
pylint==1.4.0
pypandoc==1.7.4
pyparsing==3.0.8
pyproj==3.3.0
pyspark==3.2.1
python-dateutil==2.8.2
pytz==2022.1
scikit-learn==1.0.2
scipy==1.8.0
Shapely==1.8.1.post1
six==1.16.0
threadpoolctl==3.1.0
typing-extensions==4.1.1
venv-pack==0.2.0
xlrd==2.0.1
zipp==3.7.0
Disclaimer: I don't have enough reputation to comment in answers.

Map and transform methods with using serializable class on DStream elements

I am writing app that will be extract logs, so I implemented function (Lisitng 1) which is taking string as parametr and extracts valuable informations (regexs: Listing 2) from it. I wanted that this method could be send to other workers so I impelemnt serializable class.
I have problem with apply this method on DStreams. Here is my streams minning solution:
def streamMinner(): Unit = {
val ssc = new StreamingContext(sc, Seconds(2))
val logsStream = ssc.textFileStream("logs/")
// Not works
val extractLogs = logsStream.map( log => new Matcher().matchLog(log))
extractLogs.print(1)
// Works
// val words = logsStream.transform( rdd => rdd.map( log => matchLog(log)))
// words.print()
ssc.start()
ssc.awaitTermination()
}
Problem is in line where every element of logsStream is maped with new object of Matcher class (new Matcher().matchLog(log)
Apache Spark gave my below errors:
ERROR YarnScheduler: Lost executor 2 on host1: Container marked as failed: container_e743_1499728610705_0043_01_000003 on host: host1. Exit status: 50. Diagnostics: Exception from container-launch.
Container id: container_e743_1499728610705_0043_01_000003
Exit code: 50
Stack trace: ExitCodeException exitCode=50:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:600)
at org.apache.hadoop.util.Shell.run(Shell.java:511)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:783)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:212)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:303)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Container exited with a non-zero exit code 50
ERROR YarnScheduler: Lost executor 5 on host2: Container marked as failed: container_e743_1499728610705_0043_01_000006 on host: host2. Exit status: 50. Diagnostics: Exception from container-launch.
Container id: container_e743_1499728610705_0043_01_000006
Exit code: 50
Stack trace: ExitCodeException exitCode=50:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:600)
at org.apache.hadoop.util.Shell.run(Shell.java:511)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:783)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:212)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:303)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Container exited with a non-zero exit code 50
...
ERROR YarnScheduler: Lost executor 6 ...
ERROR TaskSetManager: Task 0 in stage 0.0 failed 4 times; aborting job
17/07/11 09:41:09 ERROR JobScheduler: Error running job streaming job 1499758850000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, host1): ExecutorLostFailure (executor 6 exited caused by one of the running tasks) Reason: Container marked as failed: container_e743_1499728610705_0043_01_000007 on host: host1. Exit status: 50. Diagnostics: Exception from container-launch.
Container id: container_e743_1499728610705_0043_01_000007
Exit code: 50
Stack trace: ExitCodeException exitCode=50:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:600)
at org.apache.hadoop.util.Shell.run(Shell.java:511)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:783)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:212)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:303)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Container exited with a non-zero exit code 50
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1433)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1421)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1420)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1420)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:801)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1642)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1601)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1590)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:622)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1882)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1335)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.take(RDD.scala:1309)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$5$1.apply(DStream.scala:768)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$5$1.apply(DStream.scala:767)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:226)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3 in stage 0.0 (TID 3, host1): ExecutorLostFailure (executor 6 exited caused by one of the running tasks) Reason: Container marked as failed: container_e743_1499728610705_0043_01_000007 on host: host1. Exit status: 50. Diagnostics: Exception from container-launch.
Container id: container_e743_1499728610705_0043_01_000007
Exit code: 50
Stack trace: ExitCodeException exitCode=50:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:600)
at org.apache.hadoop.util.Shell.run(Shell.java:511)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:783)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:212)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:303)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Container exited with a non-zero exit code 50
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1433)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1421)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1420)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1420)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:801)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1642)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1601)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1590)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:622)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1882)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1335)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.take(RDD.scala:1309)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$5$1.apply(DStream.scala:768)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$5$1.apply(DStream.scala:767)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:426)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:49)
at scala.util.Try$.apply(Try.scala:161)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:227)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:57)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:226)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
scala> 17/07/11 09:41:11 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from host3/11.11.11.11:11111 is closed
17/07/11 09:41:11 ERROR YarnScheduler: Lost executor 4 on host3: Slave lost
17/07/11 09:41:12 ERROR TransportClient: Failed to send RPC 7741519719369750843 to host3/11.11.11.11:11111: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
17/07/11 09:41:12 ERROR YarnScheduler: Lost executor 1 on host2: Slave lost
17/07/11 09:41:12 ERROR TransportClient: Failed to send RPC 7734757459881277232 to host3//11.11.11.11:11111: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
17/07/11 09:41:12 ERROR YarnScheduler: Lost executor 3 on host4: Slave lost
17/07/11 09:41:12 ERROR TransportClient: Failed to send RPC 5816053641531447955 to host3//11.11.11.11:11111: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
17/07/11 09:41:12 ERROR YarnScheduler: Lost executor 7 on host2: Slave lost
17/07/11 09:41:13 ERROR TransportClient: Failed to send RPC 8774007142277591342 to host3/11.11.11.11:11111: java.nio.channels.ClosedChannelException
java.nio.channels.ClosedChannelException
17/07/11 09:41:13 ERROR YarnScheduler: Lost executor 8 on host1: Slave lost
17/07/11 09:41:19 ERROR YarnScheduler: Lost executor 1 on host3: Container marked as failed: container_e743_1499728610705_0043_02_000002 on host: host3. Exit status: 50. Diagnostics: Exception from container-launch.
Container id: container_e743_1499728610705_0043_02_000002
Exit code: 50
Stack trace: ExitCodeException exitCode=50:
at org.apache.hadoop.util.Shell.runCommand(Shell.java:600)
at org.apache.hadoop.util.Shell.run(Shell.java:511)
at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:783)
at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:212)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:303)
at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
at java.util.concurrent.FutureTask.run(FutureTask.java:262)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Container exited with a non-zero exit code 50
When I comment lines:
val extractLogs = logsStream.map( log => new Matcher().matchLog(log))
extractLogs.print(1)
and I uncomment lines:
// val words = logsStream.transform( rdd => rdd.map( log => matchLog(log)))
// words.print()
Everything works fine. My question is why? I'm afraid that solution that works may not be parallelized on cluster because method matchLog is not serializable. Someone has a similar problem or know how to deal with it?
Lisitng 1:
case class logValues2(time_stamp: String, action: String, protocol: String, connection_id: String, src_ip: String, dst_ip: String, src_port: String, dst_port: String, duration: String, bytes: String, user: String) extends Serializable
class Matcher extends Serializable {
def matchLog(x: String): logValues2 = {
var dst_ip = " "
var dst_port = " "
var time_stamp = time_stamp_reg.findAllIn(x).mkString(",")
var action = action_reg.findAllIn(x).mkString(",")
var protocol = protocol_reg.findAllIn(x).mkString(",")
var connection_id = connection_id_reg.findAllIn(x).mkString(",")
var ips = ips_reg.findAllIn(x).mkString(" ").split(""" """)
var src_ip = ips(0)
if (ips.length > 1) {
dst_ip = ips(1)
} else {
dst_ip = " "
}
var ports = ports_reg.findAllIn(x).mkString(" ").split(""" """)
var src_port = ports(0)
if (ports.length > 1) {
dst_port = ports(1)
} else {
dst_port = " "
}
var duration = duration_reg.findAllIn(x).mkString(",")
var bytes = bytes_reg.findAllIn(x).mkString(",")
var user = user_reg.findAllIn(x).mkString(",")
var logObject = logValues2(time_stamp, action, protocol, connection_id, src_ip, dst_ip, src_port, dst_port, duration, bytes, user)
return logObject
}
Above method is implemented also separately (no within Matcher class).
UPDATE:
My Regular expressions: Listing 2:
val time_stamp_reg = """^.*?(?=\s\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s%)""".r
val action_reg = """((?<=:\s)\w{4,10}(?=\s\w{2})|(?<=\w\s)(\w{7,9})(?=\s[f]))""".r
val protocol_reg = """(?<=[\w:]\s)(\w+)(?=\s[cr])""".r
val connection_id_reg = """(?<=\w\s)(\d+)(?=\sfor)""".r
val ips_reg = """(?<=[\d\w][:\s])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?=\/\d+|\z| \w)""".r
val ports_reg = """(?<=\d\/)(\d{1,6})(?=\z|[\s(])""".r
val duration_reg = """(?<=duration\s)(\d{1,2}:\d{1,2}:\d{1,2})(?=\s|\z)""".r
val bytes_reg = """(?<=bytes\s)(\d+)(?=\s|\z)""".r
val user_reg = """(?<=\\\\)(\d+)(?=\W)""".r