I am trying to run a simple Flink streaming job on AWS EMR. The purpose is very simple for now:
Consume data from Kafka in Flink.
Load it into another Kafka topic.
I am using the following dependencies:
scalaVersion := "2.11.8"

val flinkVersion = "1.11.1"

libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-scala" % flinkVersion,
  "org.apache.flink" %% "flink-streaming-scala" % flinkVersion,
  "org.apache.flink" %% "flink-connector-kafka" % flinkVersion
)
The Flink code I am using is:
private val serdeSchema = new SimpleStringSchema

val env = StreamExecutionEnvironment.getExecutionEnvironment

val stream = env
  .addSource(createKafkaConsumer(kafkaInputTopic,
    kafkaBrokers, kafkaConfig("consumerGroupId").toString,
    kafkaConfig("defaultReset").toString))

stream
  .map((s: String) => s)
  .addSink(createKafkaProducer(kafkaOutputTopic, kafkaBrokers))

env.execute(jobConfig("jobName").toString)

def createKafkaProducer(kafkaTopic: String, kafkaBrokers: String): FlinkKafkaProducer[String] = {
  val producer = new FlinkKafkaProducer[String](kafkaBrokers, kafkaTopic, serdeSchema)
  producer
}

def createKafkaConsumer(kafkaInputTopic: String,
                        kafkaBrokers: String,
                        consumerGroup: String,
                        defaultReset: String): FlinkKafkaConsumer[String] = {
  val properties = new Properties()
  properties.setProperty("bootstrap.servers", kafkaBrokers)
  properties.setProperty("group.id", consumerGroup)
  properties.setProperty("enable.auto.commit", "false")
  properties.setProperty("auto.offset.reset", defaultReset)
  val consumer = new FlinkKafkaConsumer[String](kafkaInputTopic, serdeSchema, properties)
  consumer
}
I generate an assembly jar using sbt. I use the following command to run the job on EMR:
/bin/flink run -c com.example.FlinkConsumer flink/target/scala-2.11/flink-assembly-0.1.jar
Below is the stack trace:
Caused by: org.apache.flink.client.program.ProgramInvocationException: Job failed (JobID: c458520153e875811c46c386b9ec605e)
at org.apache.flink.client.deployment.ClusterClientJobClientAdapter.lambda$null$6(ClusterClientJobClientAdapter.java:112)
at java.util.concurrent.CompletableFuture.uniApply(CompletableFuture.java:616)
at java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:591)
at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
at java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
at org.apache.flink.client.program.rest.RestClusterClient.lambda$pollResourceAsync$21(RestClusterClient.java:565)
at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
at java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:1975)
at org.apache.flink.runtime.concurrent.FutureUtils.lambda$retryOperationWithDelay$8(FutureUtils.java:291)
at java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:774)
at java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:750)
at java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:488)
at java.util.concurrent.CompletableFuture.postFire(CompletableFuture.java:575)
at java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:943)
at java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:456)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:147)
at org.apache.flink.client.deployment.ClusterClientJobClientAdapter.lambda$null$6(ClusterClientJobClientAdapter.java:110)
... 19 more
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:110)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:76)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:192)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:186)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:180)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:484)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:380)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:279)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:194)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:74)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:152)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at akka.actor.Actor$class.aroundReceive(Actor.scala:517)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
at akka.actor.ActorCell.invoke(ActorCell.scala:561)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
at akka.dispatch.Mailbox.run(Mailbox.scala:225)
at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: java.lang.NoSuchMethodError: org.apache.flink.api.common.serialization.SerializationSchema.open(Lorg/apache/flink/api/common/serialization/SerializationSchema$InitializationContext;)V
at org.apache.flink.streaming.connectors.kafka.internals.KafkaSerializationSchemaWrapper.open(KafkaSerializationSchemaWrapper.java:61)
at org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.open(FlinkKafkaProducer.java:808)
at org.apache.flink.api.common.functions.util.FunctionUtils.openFunction(FunctionUtils.java:36)
at org.apache.flink.streaming.api.operators.AbstractUdfStreamOperator.open(AbstractUdfStreamOperator.java:102)
at org.apache.flink.streaming.api.operators.StreamSink.open(StreamSink.java:48)
at org.apache.flink.streaming.runtime.tasks.StreamTask.initializeStateAndOpen(StreamTask.java:1007)
at org.apache.flink.streaming.runtime.tasks.StreamTask.lambda$beforeInvoke$0(StreamTask.java:454)
at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$SynchronizedStreamTaskActionExecutor.runThrowing(StreamTaskActionExecutor.java:94)
at org.apache.flink.streaming.runtime.tasks.StreamTask.beforeInvoke(StreamTask.java:449)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:461)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:707)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:532)
at java.lang.Thread.run(Thread.java:748)
It looks like a version issue, but I have tried various versions and I don't see this open method anywhere; I think it is called during serializer setup and cannot be found. Can someone please help? I am new to Flink.
If you're using EMR's Flink support, then most Flink libraries should be flagged as "provided" so that they're not in your jar, as they're on the classpath from the Flink installation that EMR is providing. You'll still need to explicitly include anything that's not provided by EMR (e.g. flink-connector-kafka).
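For example, a build.sbt along these lines (a sketch, assuming the cluster provides Flink 1.11.x on its classpath) keeps the core Flink libraries out of the assembly jar while still bundling the Kafka connector:

scalaVersion := "2.11.8"

val flinkVersion = "1.11.1"

libraryDependencies ++= Seq(
  // provided by the Flink installation on EMR, so excluded from the fat jar
  "org.apache.flink" %% "flink-scala" % flinkVersion % "provided",
  "org.apache.flink" %% "flink-streaming-scala" % flinkVersion % "provided",
  // not part of EMR's Flink distribution, so it must be bundled
  "org.apache.flink" %% "flink-connector-kafka" % flinkVersion
)

With the core libraries marked provided, classes like SerializationSchema are loaded from the cluster's own Flink, which avoids clashing with a different Flink version baked into your jar.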
I'm trying to follow this example: https://blog.knoldus.com/a-quick-demo-kafka-to-flink-to-cassandra/
I'm trying to parse my ShippingOrder JSON messages from Kafka into objects, and then group them by some properties, but I get an error at the flatMap step.
My sbt file:
import Dependencies._

scalaVersion := "2.13.4"
version := "0.1.0-SNAPSHOT"
organization := "com.example"
organizationName := "example"

lazy val root = (project in file("."))
  .settings(
    name := "KafkaTest",
    libraryDependencies += scalaTest % Test,
    libraryDependencies += "org.apache.flink" % "flink-streaming-scala_2.12" % "1.12.1" % "provided",
    libraryDependencies += "org.apache.flink" % "flink-connector-kafka_2.12" % "1.12.1",
    libraryDependencies += "org.apache.flink" % "flink-clients_2.12" % "1.12.1",
    libraryDependencies += "org.json4s" %% "json4s-native" % "3.6.10",
  )
My main file:
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala._
import org.json4s.native.JsonMethods
import java.util.Properties
object Kafka {
  def main(args: Array[String]) {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val properties = new Properties()
    implicit lazy val formats = org.json4s.DefaultFormats
    properties.setProperty("bootstrap.servers", "broker:9092")
    properties.setProperty("group.id", "Flink")
    implicit val typeInfo = TypeInformation.of(classOf[(String)])
    implicit val typeInfo_2 = TypeInformation.of(classOf[(String, Int)])
    implicit val typeInfo_3 = TypeInformation.of(classOf[(org.json4s.JsonAST.JValue)])
    implicit val typeInfo_4 = TypeInformation.of(classOf[(ShippingOrder)])

    val consumer = new FlinkKafkaConsumer[String]("ShippingOrders", new SimpleStringSchema(), properties)
    consumer.setCommitOffsetsOnCheckpoints(true)
    consumer.setStartFromEarliest()

    val stream = env.addSource(consumer)
      .flatMap(JsonMethods.parse(_).toOption)
      .map(_.extract[ShippingOrder])

    stream.print()
    env.execute("Flink Kafka Example")
  }
}
My order object:
import scala.tools.nsc.doc.model.Trait

class ShippingOrder(
  Old: Data,
  New: Data,
)

class Data(
  ID: String,
  Action: String,
  ClientID: Int,
  Data: Trait,
  ToLocation: Location,
  ToName: String,
  ToPhone: String,
  Log: List[Log],
  IsPartialReturn: Boolean,
  Items: List[Item],
)

class Log(
  Reason: String,
  ReasonCode: String,
  Status: String,
  // UpdatedDate: java.sql.Date,
)

class Item(
  Code: String,
  Name: String,
  Quantity: Int,
)

class Location(
  // Coordinates: Trait,
  Type: String,
)
I got an error when running this job:
WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.flink.api.java.ClosureCleaner (file:/usr/local/Cellar/apache-flink/1.12.1/libexec/lib/flink-dist_2.12-1.12.1.jar) to field java.util.Properties.serialVersionUID
WARNING: Please consider reporting this to the maintainers of org.apache.flink.api.java.ClosureCleaner
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
Job has been submitted with JobID 391088d1b7233806d15cd10da73f8660
------------------------------------------------------------
The program finished with the following exception:
org.apache.flink.client.program.ProgramInvocationException: The main method caused an error: org.apache.flink.client.program.ProgramInvocationException: Job failed (JobID: 391088d1b7233806d15cd10da73f8660)
at org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:360)
at org.apache.flink.client.program.PackagedProgram.invokeInteractiveModeForExecution(PackagedProgram.java:213)
at org.apache.flink.client.ClientUtils.executeProgram(ClientUtils.java:114)
at org.apache.flink.client.cli.CliFrontend.executeProgram(CliFrontend.java:816)
at org.apache.flink.client.cli.CliFrontend.run(CliFrontend.java:248)
at org.apache.flink.client.cli.CliFrontend.parseAndRun(CliFrontend.java:1058)
at org.apache.flink.client.cli.CliFrontend.lambda$main$10(CliFrontend.java:1136)
at org.apache.flink.runtime.security.contexts.NoOpSecurityContext.runSecured(NoOpSecurityContext.java:28)
at org.apache.flink.client.cli.CliFrontend.main(CliFrontend.java:1136)
Caused by: java.util.concurrent.ExecutionException: org.apache.flink.client.program.ProgramInvocationException: Job failed (JobID: 391088d1b7233806d15cd10da73f8660)
at java.base/java.util.concurrent.CompletableFuture.reportGet(CompletableFuture.java:395)
at java.base/java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1999)
at org.apache.flink.client.program.StreamContextEnvironment.getJobExecutionResult(StreamContextEnvironment.java:123)
at org.apache.flink.client.program.StreamContextEnvironment.execute(StreamContextEnvironment.java:80)
at org.apache.flink.streaming.api.environment.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.java:1782)
at org.apache.flink.streaming.api.scala.StreamExecutionEnvironment.execute(StreamExecutionEnvironment.scala:746)
at Kafka$.main(Kafka.scala:34)
at Kafka.main(Kafka.scala)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.flink.client.program.PackagedProgram.callMainMethod(PackagedProgram.java:343)
... 8 more
Caused by: org.apache.flink.client.program.ProgramInvocationException: Job failed (JobID: 391088d1b7233806d15cd10da73f8660)
at org.apache.flink.client.deployment.ClusterClientJobClientAdapter.lambda$null$6(ClusterClientJobClientAdapter.java:125)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:642)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.client.program.rest.RestClusterClient.lambda$pollResourceAsync$22(RestClusterClient.java:665)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.concurrent.FutureUtils.lambda$retryOperationWithDelay$9(FutureUtils.java:394)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.postFire(CompletableFuture.java:610)
at java.base/java.util.concurrent.CompletableFuture$UniCompose.tryFire(CompletableFuture.java:1085)
at java.base/java.util.concurrent.CompletableFuture$Completion.run(CompletableFuture.java:478)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
at org.apache.flink.client.deployment.ClusterClientJobClientAdapter.lambda$null$6(ClusterClientJobClientAdapter.java:123)
... 18 more
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:118)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:80)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:233)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:224)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:215)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:665)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:89)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:447)
at jdk.internal.reflect.GeneratedMethodAccessor109.invoke(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:306)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:213)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:159)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
at scala.PartialFunction.applyOrElse(PartialFunction.scala:123)
at scala.PartialFunction.applyOrElse$(PartialFunction.scala:122)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:172)
at akka.actor.Actor.aroundReceive(Actor.scala:517)
at akka.actor.Actor.aroundReceive$(Actor.scala:515)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
at akka.actor.ActorCell.invoke(ActorCell.scala:561)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
at akka.dispatch.Mailbox.run(Mailbox.scala:225)
at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: java.lang.NoSuchMethodError: 'scala.collection.immutable.List scala.collection.immutable.List.map(scala.Function1)'
at org.json4s.ParserUtil$Buffer.substring(ParserUtil.scala:139)
at org.json4s.ParserUtil$.unquote(ParserUtil.scala:98)
at org.json4s.native.JsonParser$Parser.parseString$1(JsonParser.scala:243)
at org.json4s.native.JsonParser$Parser.nextToken(JsonParser.scala:282)
at org.json4s.native.JsonParser$.$anonfun$astParser$1(JsonParser.scala:188)
at org.json4s.native.JsonParser$.$anonfun$astParser$1$adapted(JsonParser.scala:145)
at org.json4s.native.JsonParser$.parse(JsonParser.scala:133)
at org.json4s.native.JsonParser$.parse(JsonParser.scala:71)
at org.json4s.native.JsonMethods.parse(JsonMethods.scala:10)
at org.json4s.native.JsonMethods.parse$(JsonMethods.scala:9)
at org.json4s.native.JsonMethods$.parse(JsonMethods.scala:63)
at Kafka$.$anonfun$main$1(Kafka.scala:30)
at org.apache.flink.streaming.api.scala.DataStream$$anon$6.flatMap(DataStream.scala:681)
at org.apache.flink.streaming.api.operators.StreamFlatMap.processElement(StreamFlatMap.java:47)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:71)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:46)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:26)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:50)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:28)
at org.apache.flink.streaming.api.operators.StreamSourceContexts$ManualWatermarkContext.processAndCollectWithTimestamp(StreamSourceContexts.java:322)
at org.apache.flink.streaming.api.operators.StreamSourceContexts$WatermarkContext.collectWithTimestamp(StreamSourceContexts.java:426)
at org.apache.flink.streaming.connectors.kafka.internals.AbstractFetcher.emitRecordsWithTimestamps(AbstractFetcher.java:365)
at org.apache.flink.streaming.connectors.kafka.internals.KafkaFetcher.partitionConsumerRecordsHandler(KafkaFetcher.java:183)
at org.apache.flink.streaming.connectors.kafka.internals.KafkaFetcher.runFetchLoop(KafkaFetcher.java:142)
at org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumerBase.run(FlinkKafkaConsumerBase.java:826)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:110)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:66)
at org.apache.flink.streaming.runtime.tasks.SourceStreamTask$LegacySourceFunctionThread.run(SourceStreamTask.java:241)
I have no idea about this error.
Please explain and help me fix this.
I am pretty sure this is because you are not building a fat jar, and thus the Scala library is missing from your cluster. You should take a look at how to create fat jars in sbt (e.g. with the sbt-assembly plugin).
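A minimal sketch of that setup, assuming the sbt-assembly plugin (the version numbers are illustrative):

// project/plugins.sbt
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.15.0")

// build.sbt: resolve files that appear in more than one dependency jar
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case _                             => MergeStrategy.first
}

Then build with sbt assembly and submit the assembly jar from target/ instead of the plain package jar, so the Scala library and the Kafka connector travel with the job.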
I'm working on a Kafka and Spark integration project, and I'm trying to read a Kafka topic using Spark 2.4.5, Scala 2.12.11 and Kafka 2.5.0.
My sbt file is:
name := "Test"
version := "1.0"
scalaVersion := "2.12.11"
libraryDependencies ++= Seq(
  "org.apache.spark" % "spark-sql_2.12" % "2.4.5",
  "org.apache.spark" % "spark-sql-kafka-0-10_2.12" % "2.4.5",
  "org.apache.spark" % "spark-streaming-kafka-0-10-assembly_2.12" % "2.4.5",
  "org.apache.kafka" % "kafka-clients" % "2.5.0"
)
My code is:
object Test {
  def main(args: Array[String]) = {
    import org.apache.spark.sql.SparkSession

    val spark = SparkSession
      .builder()
      .appName("SparkTest")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    val df = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "test")
      .option("startingOffsets", "earliest")
      .load()

    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]

    df.printSchema()
  }
}
After creating the topic on Kafka and starting ZooKeeper and Kafka itself, I launch the code with:
./spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:2.4.5 --class Test /home/luca/Projects/Test/target/scala-2.12/test_2.12-1.0.jar
I run into the following error:
20/05/06 15:40:29 INFO StateStoreCoordinatorRef: Registered StateStoreCoordinator endpoint
Exception in thread "main" java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.kafka010.KafkaSourceProvider could not be instantiated
at java.util.ServiceLoader.fail(ServiceLoader.java:232)
at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:43)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.filterImpl(TraversableLike.scala:247)
at scala.collection.TraversableLike$class.filter(TraversableLike.scala:259)
at scala.collection.AbstractTraversable.filter(Traversable.scala:104)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:630)
at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:161)
at Test$.main(projectfile.scala:24)
at Test.main(projectfile.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:845)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NoSuchMethodError: org.apache.spark.internal.Logging.$init$(Lorg/apache/spark/internal/Logging;)V
at org.apache.spark.sql.kafka010.KafkaSourceProvider.<init>(KafkaSourceProvider.scala:44)
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at java.lang.Class.newInstance(Class.java:442)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:380)
... 26 more
Can someone help me out with this?
The kafka-clients version can be one of the reasons. Otherwise, try earlier versions, for example Spark 2.4.0 with Scala 2.12. It looks like a compatibility issue.
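One way to avoid suffix mismatches in the first place is to let sbt derive the Scala binary suffix via %% instead of hard-coding _2.12 (a sketch; the versions are illustrative):

scalaVersion := "2.12.11"

val sparkVersion = "2.4.0"

libraryDependencies ++= Seq(
  // %% appends the _2.12 suffix derived from scalaVersion automatically
  "org.apache.spark" %% "spark-sql" % sparkVersion,
  "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVersion
)

Note that spark-sql-kafka-0-10 already pulls in a matching kafka-clients, so pinning kafka-clients 2.5.0 separately can give the connector a newer client than the one it was built against.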
I'm running a very simple Scala job on Apache Spark 2.4.5, and when I try to iterate over the columns in a DataFrame and print their names I get the following stack trace, pointing at the line where I call the foreach.
Exception in thread "main" java.lang.NoSuchMethodError: scala.Predef$.refArrayOps([Ljava/lang/Object;)[Ljava/lang/Object;
at SimpleApp$.main(SimpleApp.scala:10)
at SimpleApp.main(SimpleApp.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:845)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
I am running Apache Spark in Docker using this image: bde2020/spark-master:2.4.5-hadoop2.7
I am compiling my app using scalaVersion := "2.12.11"
Full application code is:
import org.apache.spark.sql.{Row, SparkSession}

object SimpleApp {
  def main(args: Array[String]) {
    val file = "/spark/jobs/job1/data/test.json"
    val spark = SparkSession.builder.appName("Simple Application Scala").getOrCreate()
    val testData = spark.read.json(file)
    println("prints fine")
    testData.columns.foreach(x => println(x))
    spark.stop()
  }
}
My build.sbt file is:
name := "spark-scala"
version := "0.1"
scalaVersion := "2.12.11"
libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.4.5"
I am at a loss. I have checked and checked that I am running the correct versions of things, but I suspect I must have missed something!
After much head banging I discovered the image actually uses Scala 2.11.12, which is deprecated with Spark 2.4.5! Obvious in hindsight; all working now.
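A quick way to confirm which Scala version a cluster is actually running (a sketch; drop it into any job or paste it into spark-shell):

// prints the Scala version of the JVM process executing the job,
// which you can compare against the scalaVersion your jar was compiled with
println(scala.util.Properties.versionString)

spark-shell also prints both the Spark and Scala versions in its startup banner.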
You are not setting spark-core in your dependencies.
This seems like a very odd and specific issue which has me stumped.
When using a DataFrame built by a spark.sql("select * from table") query on a Hive table, I get a timeout exception whenever I try to use an HTTP client in a transform or action step on that DataFrame.
Example:
import scalaj.http._
import org.apache.spark.sql.SparkSession
object Example {
  def postDoc(doc: String): Unit = {
    val resp = Http("https://example.com/endpoint")
      .postData(doc)
      .header("content-type", "application/json")
      .asString
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().enableHiveSupport().getOrCreate()
    import spark.implicits._

    val df = spark.sql("select id, json_doc from some_table")
    df.map(r => r.getAs[String]("json_doc")).foreach(postDoc _)
  }
}
I can, however, hit the service through a DataFrame I create manually, i.e. Seq((1, "{\"a\": 1}")).toDF("id", "json_doc").foreach(postDoc _).
I've also tried creating temp tables and using spark.sql to select from them; which works on a DataFrame that I create manually but not on ones sourced from a Hive table.
My partial build.sbt:
scalaVersion := "2.11.12"
libraryDependencies ++= {
  val sparkVersion = "2.1.3"
  Seq(
    "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
    "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
    "org.scalaj" %% "scalaj-http" % "2.4.2"
  )
}
Stack trace:
java.net.SocketTimeoutException: Read timed out
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1950)
at sun.net.www.protocol.http.HttpURLConnection$10.run(HttpURLConnection.java:1945)
at java.security.AccessController.doPrivileged(Native Method)
at sun.net.www.protocol.http.HttpURLConnection.getChainedException(HttpURLConnection.java:1944)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1514)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498)
at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:480)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getResponseCode(HttpsURLConnectionImpl.java:352)
at scalaj.http.HttpRequest.scalaj$http$HttpRequest$$doConnection(Http.scala:367)
at scalaj.http.HttpRequest.exec(Http.scala:343)
at scalaj.http.HttpRequest.asString(Http.scala:492)
at com.gm.avalanche.collect.Collector$.postDoc(Collector.scala:34)
at com.gm.avalanche.collect.Collector$$anonfun$sqlTest$2.apply(Collector.scala:71)
at com.gm.avalanche.collect.Collector$$anonfun$sqlTest$2.apply(Collector.scala:71)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.rdd.RDD$$anonfun$foreach$1$$anonfun$apply$28.apply(RDD.scala:918)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1954)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:100)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(SocketInputStream.java:116)
at java.net.SocketInputStream.read(SocketInputStream.java:171)
at java.net.SocketInputStream.read(SocketInputStream.java:141)
at sun.security.ssl.InputRecord.readFully(InputRecord.java:465)
at sun.security.ssl.InputRecord.read(InputRecord.java:503)
at sun.security.ssl.SSLSocketImpl.readRecord(SSLSocketImpl.java:975)
at sun.security.ssl.SSLSocketImpl.readDataRecord(SSLSocketImpl.java:933)
at sun.security.ssl.AppInputStream.read(AppInputStream.java:105)
at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:735)
at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:678)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream0(HttpURLConnection.java:1593)
at sun.net.www.protocol.http.HttpURLConnection.getInputStream(HttpURLConnection.java:1498)
at sun.net.www.protocol.https.HttpsURLConnectionImpl.getInputStream(HttpsURLConnectionImpl.java:268)
at scalaj.http.HttpRequest.scalaj$http$HttpRequest$$doConnection(Http.scala:365)
... 17 more
It turns out the socket was being closed remotely by the ingress controller running in front of the Kubernetes environment where the Elasticsearch instance runs. It was set to its default one-minute timeout.
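If you want to rule the client out while diagnosing this kind of thing, scalaj-http lets you set the timeouts explicitly (a sketch; its defaults are 1000 ms connect and 5000 ms read):

def postDoc(doc: String): Unit = {
  val resp = Http("https://example.com/endpoint")
    .postData(doc)
    .header("content-type", "application/json")
    // raise the read timeout above the suspected server-side limit;
    // a remote close then surfaces as a closed connection rather than a read timeout
    .timeout(connTimeoutMs = 5000, readTimeoutMs = 120000)
    .asString
}

With a read timeout longer than the ingress's one-minute limit, the remote close shows up clearly instead of a generic client-side timeout.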
I am new to big data development and S3 in particular. I have a Scala project (Play Framework) which has been using HDFS so far. Now I am trying to switch to S3. I have a very simple piece of code which reads a file from S3:
val sparkSession: SparkSession = SparkSession.builder
  .master("local[*]")
  .appName("SparkSessionZipsExample")
  .config("spark.sql.warehouse.dir", "file:///C:/projects/scala101")
  .getOrCreate()

val sonnets: String = sparkSession.sparkContext
  .wholeTextFiles("s3a://<awsAccessKey>:<awsASecretccessKey>#logs/Results.json")
  .take(1)(0)._2

println(sonnets)
In my build.sbt I have added the recommended dependencies:
libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.3" % "provided"
libraryDependencies += "com.amazonaws" % "aws-java-sdk-s3" % "1.11.307"
libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "3.0.1"
Yet I am getting the following error:
[error] application -
! #77fbmca9f - Internal server error, for (GET) [/] ->
play.api.http.HttpErrorHandlerExceptions$$anon$1: Execution exception[[RuntimeException: java.lang.NoClassDefFoundError: org/apache/hadoop/util/BlockingThreadPoolExecutorService]]
at play.api.http.HttpErrorHandlerExceptions$.throwableToUsefulException(HttpErrorHandler.scala:255)
at play.api.http.DefaultHttpErrorHandler.onServerError(HttpErrorHandler.scala:180)
at play.core.server.AkkaHttpServer$$anonfun$3.applyOrElse(AkkaHttpServer.scala:310)
at play.core.server.AkkaHttpServer$$anonfun$3.applyOrElse(AkkaHttpServer.scala:308)
at scala.concurrent.Future$$anonfun$recoverWith$1.apply(Future.scala:346)
at scala.concurrent.Future$$anonfun$recoverWith$1.apply(Future.scala:345)
at scala.concurrent.impl.CallbackRunnable.run$$$capture(Promise.scala:36)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala)
at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:91)
Caused by: java.lang.RuntimeException: java.lang.NoClassDefFoundError: org/apache/hadoop/util/BlockingThreadPoolExecutorService
at play.api.mvc.ActionBuilder$$anon$2.apply(Action.scala:424)
at play.api.mvc.Action$$anonfun$apply$2.apply(Action.scala:96)
at play.api.mvc.Action$$anonfun$apply$2.apply(Action.scala:89)
at play.api.libs.streams.StrictAccumulator$$anonfun$mapFuture$2$$anonfun$1.apply(Accumulator.scala:174)
at play.api.libs.streams.StrictAccumulator$$anonfun$mapFuture$2$$anonfun$1.apply(Accumulator.scala:174)
at scala.util.Try$.apply(Try.scala:192)
at play.api.libs.streams.StrictAccumulator$$anonfun$mapFuture$2.apply(Accumulator.scala:174)
at play.api.libs.streams.StrictAccumulator$$anonfun$mapFuture$2.apply(Accumulator.scala:170)
at akka.stream.impl.Transform.apply(TraversalBuilder.scala:155)
at akka.stream.impl.PhasedFusingActorMaterializer.materialize(PhasedFusingActorMaterializer.scala:482)
Caused by: java.lang.NoClassDefFoundError: org/apache/hadoop/util/BlockingThreadPoolExecutorService
at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:254)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3288)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:123)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3337)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3305)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:476)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:514)
at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPaths(FileInputFormat.java:483)
at org.apache.spark.SparkContext$$anonfun$wholeTextFiles$1.apply(SparkContext.scala:877)
Caused by: java.lang.ClassNotFoundException: org.apache.hadoop.util.BlockingThreadPoolExecutorService
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at org.apache.hadoop.fs.s3a.S3AFileSystem.initialize(S3AFileSystem.java:254)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3288)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:123)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:3337)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:3305)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:476)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:361)
I appreciate any help.
Thank you,