save a spark rdd into big query table - scala

I am working with google big query platform and I want to load spark RDD using the scala google big query client
I wrote the following code :
import org.apache.spark.SparkConf
import org.apache.spark.sql
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory
import com.google.api.services.bigquery.model.TableFieldSchema
import com.google.api.services.bigquery.model.TableSchema
import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat
import com.google.cloud.hadoop.io.bigquery.GsonBigQueryInputFormat
import com.google.cloud.hadoop.io.bigquery.output.BigQueryOutputConfiguration
import com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputFormat
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import com.google.api.services.bigquery.model.TableFieldSchema
import com.google.api.services.bigquery.model.TableSchema
import java.util
object Main {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf()
val sparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
val sc = sparkSession.sparkContext
val conf = sparkSession.sparkContext.hadoopConfiguration
import sparkSession.implicits._
val testdata = sc.parallelize(Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 1)
val dfha = testdata.toDF()
// Input parameters.
val projectId = conf.get("fs.gs.project.id")
val bucket = conf.get("fs.gs.system.bucket")
conf.set(BigQueryConfiguration.PROJECT_ID_KEY, projectId)
conf.set(BigQueryConfiguration.GCS_BUCKET_KEY, bucket)
val outputTableId = projectId + ":wordcount_dataset.wordcount_output"
// Temp output bucket that is deleted upon completion of job.
val outputGcsPath = ("gs://" + bucket + "/hadoop/tmp/bigquery/wordcountoutput")
// Output configuration.
val outputTableFieldSchema = new util.ArrayList[TableFieldSchema]
outputTableFieldSchema.add(new TableFieldSchema().setName("Word").setType("STRING"))
outputTableFieldSchema.add(new TableFieldSchema().setName("Count").setType("STRING"))
val outputSchema = new TableSchema().setFields(outputTableFieldSchema)
conf.set("mapreduce.job.outputformat.class",
classOf[IndirectBigQueryOutputFormat[_, _]].getName)
conf.set(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION_KEY,
"WRITE_TRUNCATE")
BigQueryOutputConfiguration.configure(conf, outputTableId, outputSchema, outputGcsPath, BigQueryFileFormat.CSV, classOf[TextOutputFormat[_, _]])
testdata.saveAsNewAPIHadoopDataset(conf)
}
}
When submitting to dataproc I got the following errors. May you please help me with this :
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Bigquery connector version 0.10.8-hadoop2
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from default credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from given credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputFormat: Delegating functionality to 'TextOutputFormat'.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputFormat: Delegating functionality to 'TextOutputFormat'.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from default credential.
18/07/05 09:00:14 INFO com.google.cloud.hadoop.io.bigquery.BigQueryFactory: Creating BigQuery from given credential.
18/07/05 09:00:27 INFO com.google.cloud.hadoop.io.bigquery.BigQueryHelper: Importing into table 'renault-ftt:wordcount_dataset.wordcount_output' from 1 paths; path[0] is 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput/part-r-00000'; awaitCompletion: true
18/07/05 09:00:27 INFO com.google.cloud.hadoop.io.bigquery.BigQueryHelper: Using provided import schema '{fields=[{"name":"Word","type":"STRING"}, {"name":"Count","type":"STRING"}]}'.
18/07/05 09:00:41 ERROR org.apache.spark.internal.io.SparkHadoopMapReduceWriter: Aborting job job_20180705090013_0000.
java.io.IOException: Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
at com.google.cloud.hadoop.io.bigquery.BigQueryUtils.waitForJobCompletion(BigQueryUtils.java:108)
at com.google.cloud.hadoop.io.bigquery.BigQueryHelper.importFromGcs(BigQueryHelper.java:183)
at com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputCommitter.commitJob(IndirectBigQueryOutputCommitter.java:70)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:142)
at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:101)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1084)
at com.renault.ftt.example.Main$.main(Main.scala:129)
at com.renault.ftt.example.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:775)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
18/07/05 09:00:42 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputCommitter: Found GCS output data at 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput', attempting to clean up.
18/07/05 09:00:42 INFO com.google.cloud.hadoop.io.bigquery.output.ForwardingBigQueryFileOutputCommitter: Successfully deleted GCS output path 'gs://dataproc-a68f6af1-5c30-4a63-97b6-4db2526a61ef-eu/hadoop/tmp/bigquery/wordcountoutput'.
Exception in thread "main" org.apache.spark.SparkException: Job aborted.
at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:107)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1084)
at com.renault.ftt.example.Main$.main(Main.scala:129)
at com.renault.ftt.example.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:775)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:180)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:205)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:119)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.IOException: Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the error stream for more details.
at com.google.cloud.hadoop.io.bigquery.BigQueryUtils.waitForJobCompletion(BigQueryUtils.java:108)
at com.google.cloud.hadoop.io.bigquery.BigQueryHelper.importFromGcs(BigQueryHelper.java:183)
at com.google.cloud.hadoop.io.bigquery.output.IndirectBigQueryOutputCommitter.commitJob(IndirectBigQueryOutputCommitter.java:70)
at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:142)
at org.apache.spark.internal.io.SparkHadoopMapReduceWriter$.write(SparkHadoopMapReduceWriter.scala:101)
... 18 more
18/07/05 09:00:43 INFO org.spark_project.jetty.server.AbstractConnector: Stopped Spark#717cfabd{HTTP/1.1,[http/1.1]}{0.0.0.0:4040}
I can't understand the reason of this bug, can anyone help me with this

Related

Running Scala Jar with Spark-Submit

I've compiled a spark-scala script to a JAR and I want to run it with spark-submit. But I'm having this error:
2020-01-07 13:03:02,190 WARN util.Utils: Your hostname, nifi resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
2020-01-07 13:03:02,192 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
2020-01-07 13:03:03,109 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2020-01-07 13:03:03,826 WARN deploy.SparkSubmit$$anon$2: Failed to load hello.
java.lang.ClassNotFoundException: hello
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:418)
at java.lang.ClassLoader.loadClass(ClassLoader.java:351)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.spark.util.Utils$.classForName(Utils.scala:238)
at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:806)
at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:161)
at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:184)
at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:920)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:929)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
2020-01-07 13:03:03,857 INFO util.ShutdownHookManager: Shutdown hook called
2020-01-07 13:03:03,858 INFO util.ShutdownHookManager: Deleting directory /tmp/spark-a8cc1ba6-3643-4646-82a3-4b44f4487105
This is my code:
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
object hello {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setMaster("local")
.setAppName("quest9")
val sc = new SparkContext(conf)
val spark = SparkSession.builder().appName("quest9").master("local").getOrCreate()
import spark.implicits._
val zip_codes = spark.read.format("csv").option("header", "true").load("/home/hdfs/Documents/quest_9/doc/zip.csv")
val census = spark.read.format("csv").option("header", "true").load("/home/hdfs/Documents/quest_9/doc/census.csv")
census.createOrReplaceTempView("census")
zip_codes.createOrReplaceTempView("zip")
val query = census.as("census").join((zip_codes.where($"City" === "Inglewood").where($"County" === "Los Angeles").as("zip")),Seq("Zip_Code"),"inner").select($"census.Total_Males".as("male"),$"census.Total_Females".as("female")).distinct()
query.show()
val queryR = query.repartition(5)
queryR.write.parquet("/home/hdfs/Documents/population/census/IDE/census.parquet")
sc.stop()
}
}
I think my problem is that im using scala object instead of a class, but I'm not sure.
I run the spark-submit like this
spark-submit \
--class hello \
/home/hdfs/IdeaProjects/untitled/out/artifacts/quest_jar/quest.jar
Anyone solved this error before?
I think you need to specify a package name for both spark-submit and your object.
For instance :
spark-submit \
--class com.my.package.hello \
/home/hdfs/IdeaProjects/untitled/out/artifacts/quest_jar/quest.jar
and
package com.my.package
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
object hello {
...
}

Error while running a jar - Invalid signature file digest for Manifest main attributes

I have wrote small program in scala using spark 1.6.0 in intellij IDE, jar got build but its throwing error while running.
Please share your inputs to resolve this issue.
[cloudera#quickstart ~]$ spark-submit --master local --class com.sample.sample.sample /home/cloudera/Desktop/IdeaProjects.jar
Exception in thread "main" java.lang.SecurityException: Invalid signature file digest for Manifest main attributes
at sun.security.util.SignatureFileVerifier.processImpl(SignatureFileVerifier.java:286)
at sun.security.util.SignatureFileVerifier.process(SignatureFileVerifier.java:239)
at java.util.jar.JarVerifier.processEntry(JarVerifier.java:317)
at java.util.jar.JarVerifier.update(JarVerifier.java:228)
at java.util.jar.JarFile.initializeVerifier(JarFile.java:348)
at java.util.jar.JarFile.getInputStream(JarFile.java:415)
at sun.misc.JarIndex.getJarIndex(JarIndex.java:137)
at sun.misc.URLClassPath$JarLoader$1.run(URLClassPath.java:674)
at sun.misc.URLClassPath$JarLoader$1.run(URLClassPath.java:666)
at java.security.AccessController.doPrivileged(Native Method)
at sun.misc.URLClassPath$JarLoader.ensureOpen(URLClassPath.java:665)
at sun.misc.URLClassPath$JarLoader.<init>(URLClassPath.java:638)
at sun.misc.URLClassPath$3.run(URLClassPath.java:366)
at sun.misc.URLClassPath$3.run(URLClassPath.java:356)
at java.security.AccessController.doPrivileged(Native Method)
at sun.misc.URLClassPath.getLoader(URLClassPath.java:355)
at sun.misc.URLClassPath.getLoader(URLClassPath.java:332)
at sun.misc.URLClassPath.getResource(URLClassPath.java:198)
at java.net.URLClassLoader$1.run(URLClassLoader.java:358)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:270)
at org.apache.spark.util.Utils$.classForName(Utils.scala:177)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:688)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
I have tried options provided in existed thread like delete MANIFEST.MF file..etc , But no luck
package com.sample.sample
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
object sample {
def main(args: Array[String]): Unit = {
println("Hello World")
val conf = new SparkConf().setAppName("JDBC App").setMaster("local")
val sc = new SparkContext(conf)
// val sc = new SparkConf()
val sqlContext = new SQLContext(sc)
val url ="jdbc:mysql://localhost:3306/retail_db?user=root&password=cloudera"
val df = sqlContext.read.format("jdbc").option("url", url).option("dbtable", "products").load()
df.printSchema()
}
}
Following is the command I am running:
spark-submit --master local --class com.sample.sample.sample /home/cloudera/Desktop/IdeaProjects.jar
expecting this code should work

Unable to create a stream in spark-streaming using kinesis stream

I am new to kinesis and i am trying to process the kinesis stream data with spark-streaming (Pyspark) and facing the below error
Below is my code: I am pushing twitter data to my kinesis stream and trying to process using Spark-streaming. I tried including --jars with all dependencies but still facing the same issue.Spark version -2.4.3 and also 2.3.3 with appropriate spark-streaming-kinesis-asl-assembly.jar
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession
from pyspark import StorageLevel
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils,
InitialPositionInStream
spark_session = SparkSession.builder.getOrCreate()
ssc = StreamingContext(spark_session.sparkContext, 10)
sc = spark_session.sparkContext
Kinesis_app_name = "test"
Kinesis_stream_name = "python-stream"
endpoint_url = "https://kinesis.us-east-1.amazonaws.com"
region_name = "us-east-1"
data = KinesisUtils.createStream(
ssc, Kinesis_app_name, Kinesis_stream_name, endpoint_url,
region_name, InitialPositionInStream.LATEST, 10, StorageLevel.MEMORY_AND_DISK_2)
data.pprint()
ssc.start() # Start the computation
ssc.awaitTermination()
I would like to process the stream using spark-streaming but getting the below error :
File "C:\spark-2.3.3-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\streaming\kinesis.py", line 92, in createStream
File "C:\spark-2.3.3-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py", line 1257, in __call__
File "C:\spark-2.3.3-bin-hadoop2.7\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py", line 328, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o27.createStream.
: java.lang.NoClassDefFoundError: com/amazonaws/services/kinesis/model/Record
at java.lang.Class.getDeclaredMethods0(Native Method)
at java.lang.Class.privateGetDeclaredMethods(Class.java:2701)
at java.lang.Class.getDeclaredMethods(Class.java:1975)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:232)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2299)
at org.apache.spark.streaming.kinesis.KinesisUtils$.createStream(KinesisUtils.scala:127)
at org.apache.spark.streaming.kinesis.KinesisUtils$.createStream(KinesisUtils.scala:554)
at org.apache.spark.streaming.kinesis.KinesisUtilsPythonHelper.createStream(KinesisUtils.scala:616)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.amazonaws.services.kinesis.model.Record
at java.net.URLClassLoader.findClass(URLClassLoader.java:382)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 20 more
I ran into the same issue. It ended up being that I had included just the spark-streaming-kinesis-asl jar. This jar does not contain the kinesis sdk as far as I'm aware. I fixed it by removing the lone jar and then using the package manager for the dependency with the spark-submit argument --packages org.apache.spark:spark-streaming-kinesis-asl_2.12:2.4.4. If you use the package manager but do not remove the offending jar, the program will not work. I hope this helps all who come across this error in the future.
Please see the solution below:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream, StorageLevel
if __name__ == "__main__":
kinesisConf = {...} # I put all my credentials in here
batchInterval = 2000
kinesisCheckpointInterval = batchInterval
sc = SparkContext(appName="kinesis-stream")
ssc = StreamingContext(sc, batchInterval)
data = KinesisUtils.createStream(
ssc=ssc,
kinesisAppName=kinesisConf['appName'],
streamName=kinesisConf['streamName'],
endpointUrl=kinesisConf['endpointUrl'],
regionName=kinesisConf['regionName'],
initialPositionInStream=InitialPositionInStream.LATEST,
checkpointInterval=kinesisCheckpointInterval,
storageLevel=StorageLevel.MEMORY_AND_DISK_2,
awsAccessKeyId=kinesisConf['awsAccessKeyId'],
awsSecretKey=kinesisConf['awsSecretKey']
)
data.pprint()
ssc.start()
ssc.awaitTermination()
& when you run it, do it like so:
spark-submit --master local[8] --packages org.apache.spark:spark-streaming-kinesis-asl_2.12:3.0.0-preview ./streaming.py
2.12 -> refers to the scala version
3.0.0 -> refers to the spark version
Go here and make sure you select the correct params for that package

How to add any new library like spark-sftp into my Pyspark code?

When Im trying to set a package dependency "spark-sftp" in my Spark conf, I get ClassNotFoundException. But it works when i execute the script using:
spark-submit --packages com.springml:spark-sftp_2.11:1.1.1 test.py
Below is my code. Can someone tell me how can i execute my pyspark script
by without passing the package as an argument to spark-submit?
import sys
import datetime
import pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession, SQLContext, Row, HiveContext
from pyspark import SparkContext
#Create new config
conf = (pyspark.conf.SparkConf()
.set("spark.driver.maxResultSize", "16g")
.set("spark.driver.memory", "20g")
.set("spark.executor.memory", "20g")
.set("spark.executor.cores", "5")
.set("spark.shuffle.service.enabled", "true")
.set("spark.dynamicAllocation.enabled", "true")
.set("spark.dynamicAllocation.initialExecutors", "24")
.set("spark.dynamicAllocation.minExecutors", "6")
.set("spark.submit.deployMode", "client")
.set("spark.jars.packages", "com.springml:spark-sftp_2.11:1.1.1")
.set("spark.python.worker.memory", "4g")
.set("spark.default.parallelism", "960")
.set("spark.executor.memoryOverhead", "4g")
.setMaster("yarn-client"))
# Create new context
spark = SparkSession.builder.appName("AppName").config(conf=conf).enableHiveSupport().getOrCreate()
spark.sparkContext.setLogLevel("WARN")
df = spark.read.format("com.springml.spark.sftp").option("host", "HOST").option("username", "HOSTNAME").option("password", "pass").option("fileType", "csv").option("inferSchema", "true").load("/test/sample.csv")
Output:
: java.lang.ClassNotFoundException: Failed to find data source: com.springml.spark.sftp. Please find packages at http://spark.apache.org/third-party-projects.html
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:635)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:190)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:174)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ClassNotFoundException: com.springml.spark.sftp.DefaultSource
While submitting spark job, you can specify what packages to install. For this one you can specify this maven dependency as:
> $SPARK_HOME/bin/spark-shell --packages com.springml:spark-sftp_2.11:1.1.3

MQTT Structured Streaming

I am trying to set up Spark Streaming to read a MQTT source, but it launches an exception when I receive the second message.
I have the following code:
import java.sql.Timestamp
import org.apache.bahir.sql.streaming.mqtt._
import org.apache.commons.io.FileUtils
import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}
import org.apache.spark.sql.types.StructType
object App {
def main(args: Array[String]): Unit = {
val spark = SparkSession
.builder
.appName("StructuredNetworkWordCount")
.master("local[*]")
.getOrCreate()
import spark.implicits._
// Read text from socket
val lines = spark.readStream.format("org.apache.bahir.sql.streaming.mqtt.MQTTStreamSourceProvider").option("topic", "measurement").option("username", "spark").option("password", "******").load("tcp://10.0.0.129:1883").as[(String, Timestamp)]
val query = lines.writeStream.format("console").start
query.awaitTermination()
}
}
And I observe the following exception when I receive a second message:
17/01/16 16:18:33 ERROR StreamExecution: Query query-1 terminated with error
java.lang.AbstractMethodError: org.apache.bahir.sql.streaming.mqtt.MQTTTextStreamSource.commit(Lorg/apache/spark/sql/execution/streaming/Offset;)V
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1$$anonfun$apply$mcV$sp$5.apply(StreamExecution.scala:359)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1$$anonfun$apply$mcV$sp$5.apply(StreamExecution.scala:358)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at org.apache.spark.sql.execution.streaming.StreamProgress.foreach(StreamProgress.scala:25)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply$mcV$sp(StreamExecution.scala:358)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$reportTimeTaken(StreamExecution.scala:656)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply$mcZ$sp(StreamExecution.scala:219)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply(StreamExecution.scala:213)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply(StreamExecution.scala:213)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$reportTimeTaken(StreamExecution.scala:656)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:212)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:43)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:208)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:142)
Exception in thread "stream execution thread for query-1" java.lang.AbstractMethodError: org.apache.bahir.sql.streaming.mqtt.MQTTTextStreamSource.commit(Lorg/apache/spark/sql/execution/streaming/Offset;)V
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1$$anonfun$apply$mcV$sp$5.apply(StreamExecution.scala:359)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1$$anonfun$apply$mcV$sp$5.apply(StreamExecution.scala:358)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at org.apache.spark.sql.execution.streaming.StreamProgress.foreach(StreamProgress.scala:25)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply$mcV$sp(StreamExecution.scala:358)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$reportTimeTaken(StreamExecution.scala:656)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply$mcZ$sp(StreamExecution.scala:219)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply(StreamExecution.scala:213)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply(StreamExecution.scala:213)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$reportTimeTaken(StreamExecution.scala:656)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:212)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:43)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:208)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:142)
org.apache.spark.sql.streaming.StreamingQueryException: Query query-1 terminated with exception: org.apache.bahir.sql.streaming.mqtt.MQTTTextStreamSource.commit(Lorg/apache/spark/sql/execution/streaming/Offset;)V
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:248)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:142)
Caused by: java.lang.AbstractMethodError: org.apache.bahir.sql.streaming.mqtt.MQTTTextStreamSource.commit(Lorg/apache/spark/sql/execution/streaming/Offset;)V
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1$$anonfun$apply$mcV$sp$5.apply(StreamExecution.scala:359)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1$$anonfun$apply$mcV$sp$5.apply(StreamExecution.scala:358)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at org.apache.spark.sql.execution.streaming.StreamProgress.foreach(StreamProgress.scala:25)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply$mcV$sp(StreamExecution.scala:358)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch$1.apply(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$reportTimeTaken(StreamExecution.scala:656)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$constructNextBatch(StreamExecution.scala:345)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply$mcZ$sp(StreamExecution.scala:219)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply(StreamExecution.scala:213)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1$$anonfun$1.apply(StreamExecution.scala:213)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$reportTimeTaken(StreamExecution.scala:656)
at org.apache.spark.sql.execution.streaming.StreamExecution$$anonfun$org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches$1.apply$mcZ$sp(StreamExecution.scala:212)
at org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor.execute(TriggerExecutor.scala:43)
at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runBatches(StreamExecution.scala:208)
... 1 more
Did anybody have this problem?
Is your spark-MQTT jar the same version of your spark? it's possible that different versions cause these problems. I had a similar problem when using spark 1.6 from Cloudera Express. After upgrading it to the same version the problem was solved