Im using the below custom receiver to consume data from Rabbitmq in Spark-Scala.
import org.apache.spark.streaming.rabbitmq.RabbitMQUtils
import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{InputDStream, ReceiverInputDStream}
import org.apache.spark.SparkContext
import org.apache.spark.streaming._
import org.apache.spark.streaming.receiver._
import org.apache.spark.internal.Logging
import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.internal.Logging
import org.slf4j.LoggerFactory
val host = "10.*.*.*"
val port = "5**"
val queueName = "test"
val vHost = "sd"
val userName = "user1"
val password = "***"
var batchInterval = Seconds(500)
var ssc = new StreamingContext(sc, batchInterval)
val receiverStream = RabbitMQUtils.createStream(ssc, Map(
"host" -> host,
"port" -> port,
"queueName" -> queueName,
"vHost" -> vHost,
"userName" -> userName,
"password" -> password
))
receiverStream.start()
println("started receiverStream")
ssc.start()
ssc.awaitTermination()
How to print the receiverstream data and later save the file to any specific location.
Could someone please assist.
Thank you.
You should be able to print the stream data with:
receiverStream
.writeStream
.outputMode("complete")
.format("console")
.start()
Reference: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#quick-example
You can save to parquet (or other) files with:
receiverStream
.writeStream
.format("parquet") // can be "orc", "json", "csv", etc.
.option("path", "path/to/destination/dir")
.start()
See: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#output-sinks
Related
import org.apache.hadoop.fs.{Path,FileSystem}
import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions.{current_date, from_unixtime, unix_timestamp}
import spark.implicits._
spark.sql("drop table if exists Staging.test_partiton_drop")
val df = Seq(("Aravind", "DIP"), ("Karthik", "DTP")).toDF("name", "dept")
.withColumn("hdp_load_dt", current_date()).withColumn("hdp_load_hr", from_unixtime(unix_timestamp(), "HH"))
var partitionBySeq = Seq("hdp_load_dt", "hdp_load_hr")
df
.write.partitionBy(partitionBySeq:_*)
.format("parquet")
.mode(SaveMode.Append)
.option("path", "/krnoir/streaming_test")
.option("compression", "snappy")
.saveAsTable("Staging.test_partiton_drop")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val finalPath = new Path("/krnoir/streaming_test/_REPLICATION.DONE")
fs.create(finalPath,false)
When I ran the above code, /krnoir/streaming_test/ is getting overwritten in each run while I am expecting new files to be added and not overwritten.
I am pushing twitter data through NiFi to kafka and process it with spark. The results are then written to elasticsearch, retrieve username, hashtags, time and few elements and write to Elasticsearch. Where could I be wrong?
I want to covert the results to dataframe and use case class for choosing few items from twitter json.
Many Thanks
package comSpark
import org.apache.spark.streaming._
import org.apache.spark.sql.functions
import org.apache.kafka.clients.consumer._
import java.lang.IllegalArgumentException
import org.apache.spark.SparkDriverExecutionException
import org.apache.spark.sql.functions.explode
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.jdbc._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import scala.util.Try
import org.apache.spark.sql.types.DecimalType._
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.regexp_replace
import org.apache.spark.sql.types.StructType
import org.apache.kafka.clients.consumer.ConsumerRecord._
import org.apache.spark.rdd.RDD._
import org.apache.spark.rdd.RDD
import org.apache.kafka.clients.consumer._
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, LocationStrategies, KafkaUtils}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark._
object TwitterAnalytics {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("ES").setMaster("local[*]").
set("spark.driver.allowMultipleContexts","true")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
val ssc = new StreamingContext(conf,Seconds(10))
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "localhost:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "c1",
"auto.offset.reset" -> "latest",
"zookeeper.connection.timeout.ms" -> "10000",
"zookeeper.session.timeout.ms" -> "10000",
"enable.auto.commit" -> "true")
val topics = Array("kafkaelasticproject")
val stream1 = KafkaUtils
.createDirectStream[String, String](ssc,PreferConsistent,
Subscribe[String, String]
(topics,
kafkaParams)
)
stream1.print()
// })
ssc.start()
ssc.awaitTermination()
println(" stream completed" )
}
}
I am facing an issue during spark streaming. I am getting empty records after it gets streamed and passes to the "parse" method.
My code:
import spark.implicits._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Encoders
import org.apache.spark.streaming._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import spark.implicits._
import org.apache.spark.sql.types.{StructType, StructField, StringType,
IntegerType}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import spark.implicits._
import org.apache.spark.sql.types.{StructType, StructField, StringType,
IntegerType}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.storage.StorageLevel
import java.util.regex.Pattern
import java.util.regex.Matcher
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql._
val conf = new SparkConf().setAppName("streamHive").setMaster("local[*]").set("spark.driver.allowMultipleContexts", "true")
val ssc = new StreamingContext(conf, Seconds(5))
val sc=ssc.sparkContext
val lines = ssc.textFileStream("file:///home/sadr/testHive")
case class Prices(name: String, age: String,sex: String, location: String)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
def parse (rdd : org.apache.spark.rdd.RDD[String] ) =
{
var l = rdd.map(_.split(","))
val prices = l.map(p => Prices(p(0),p(1),p(2),p(3)))
val pricesDf = sqlContext.createDataFrame(prices)
pricesDf.registerTempTable("prices")
pricesDf.show()
var x = sqlContext.sql("select count(*) from prices")
x.show()}
lines.foreachRDD { rdd => parse(rdd)}
lines.print()
ssc.start()
My input file:
cat test1.csv
Riaz,32,M,uk
tony,23,M,india
manu,33,M,china
imart,34,F,AUS
I am getting this output:
lines.foreachRDD { rdd => parse(rdd)}
lines.print()
ssc.start()
scala> +----+---+---+--------+
|name|age|sex|location|
+----+---+---+--------+
+----+---+---+--------+
I am using Spark version 2.3....I AM GETTING FOLLOWING ERROR AFTER ADDING X.SHOW()
Not sure if you are actually able to read the streams.
textFileStream reads only the new files added to the directory after the program starts and not the existing ones. Was the file already there?
If yes, remove it from the directory, start the program and copy the file again?
enter image description hereI receive a error when try do select over my temp table. Somebody can help me please?
object StreamingLinReg extends java.lang.Object{
val conf = new SparkConf(true)
.set("spark.cassandra.connection.host", "127.0.0.1").setAppName("Streaming Liniar Regression")
.set("spark.cassandra.connection.port", "9042")
.set("spark.driver.allowMultipleContexts", "true")
.set("spark.streaming.receiver.writeAheadLog.enable", "true")
val sc = new SparkContext(conf);
val ssc = new StreamingContext(sc, Seconds(1));
val sqlContext = new org.apache.spark.sql.SQLContext(sc);
import sqlContext.implicits._
val trainingData = ssc.cassandraTable[String]("features","consumodata").select("consumo", "consumo_mensal", "soma_pf", "tempo_gasto").map(LabeledPoint.parse)
trainingData.toDF.registerTempTable("training")
val dstream = new ConstantInputDStream(ssc, trainingData)
val numFeatures = 100
val model = new StreamingLinearRegressionWithSGD()
.setInitialWeights(Vectors.zeros(numFeatures))
.setNumIterations(1)
.setStepSize(0.1)
.setMiniBatchFraction(1.0)
model.trainOn(dstream)
model.predictOnValues(dstream.map(lp => (lp.label, lp.features))).foreachRDD { rdd =>
val metrics = new RegressionMetrics(rdd)
val MSE = metrics.meanSquaredError //Squared error
val RMSE = metrics.rootMeanSquaredError //Squared error
val MAE = metrics.meanAbsoluteError //Mean absolute error
val Rsquared = metrics.r2
//val Explained variance = metrics.explainedVariance
rdd.toDF.registerTempTable("liniarRegressionModel")
}
}
ssc.start()
ssc.awaitTermination()
//}
}
%sql
select * from liniarRegressionModel limit 10
when I do select the temporary table I get an error message.I run first paragraph after execute the select over temp table.
org.apache.spark.sql.AnalysisException: Table not found: liniarRegressionModel; line 1 pos 14 at org.apache.spark.sql.catalyst.analysis.
package$AnalysisErrorAt.failAnalysis (package.scala:42) at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations
$.getTable (Analyzer.scala:305) at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations
$$anonfun$apply$9.applyOrElse
(Analyzer.scala:314) at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveRelations
$$anonfun$apply$9.applyOrElse(Analyzer.scala:309) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:57) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
$$anonfun$resolveOperators$1.apply(LogicalPlan.scala:57) at org.apache.spark.sql.catalyst.trees.CurrentOrigin
$.withOrigin(TreeNode.scala:69) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveOperators
(LogicalPlan.scala:56) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
$$anonfun$1.apply(LogicalPlan.scala:54) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$$anonfun$1.apply
(LogicalPlan.scala:54) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply
(TreeNode.scala:281) at scala.collection.Iterator
$$anon$11.next(Iterator.scala:328) at scala.collection.Iterator$
class.foreach(Iterator.scala:727) at scala.collection.AbstractIterator.foreach
(Iterator.scala:1157) at scala.collection.generic.Growable $class.$plus$plus$eq(Growable.scala:48) at scala.collection.mutable.ArrayBuffer.
$plus$plus$eq(ArrayBuffer.scala:103) at scala.collection.mutable.ArrayBuffer.
$plus$plus$eq(ArrayBuffer.scala:47) at scala.collection.TraversableOnce$class.to
(TraversableOnce.scala:273) at scala.collection.AbstractIterator.to
(Iterator.scala:1157) at scala.collection.TraversableOnce$class.toBuffer
(TraversableOnce.scala:265) at scala.collection.AbstractIterator.toBuffer
(Iterator.scala:1157) at scala.collection.TraversableOnce$class.toArray
(TraversableOnce.scala:252) at scala.collection.AbstractIterator.toArray
(Iterator.scala:1157)
My output after execute the code
import java.lang.Object
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._
import org.apache.spark.sql.cassandra._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.StreamingContext._
import com.datastax.spark.connector.streaming._
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.ConstantInputDStream
import org.apache.spark.mllib.evaluation.RegressionMetrics
defined module StreamingLinReg
FINISHED
Took 15 seconds
I am a newbie to Scala/Spark. In the following code, I am extracting Twitter public stream content to the HBase.
On commenting the last four lines (put commands in HBase), I am able to print content of tweet on the terminal, however unable to dump it to the HBase table.
I need help in on the following regards:
1. How can I overcome the serialilzation error?
2. Are there efficient methods (may be useing Kryo serialilzation) to overcome this error?
Caused by: java.io.NotSerializableException:
org.apache.hadoop.conf.Configuration Serialization stack:
- object not serializable (class: org.apache.hadoop.conf.Configuration, value: Configuration:
core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml,
yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml)
import twitter4j.auth._
import twitter4j.conf._
import twitter4j._
import twitter4j.json._
import scala.io.Source
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.HColumnDescriptor
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark._
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.hadoop.hbase.util.Bytes
import java.io._
import org.apache.spark.streaming.twitter.TwitterUtils
////////////////////////////
val conf = new SparkConf().setAppName("model1").setMaster("local[*]")
// val sc = new SparkContext(conf)
val TABLE_NAME = "publicrd"
val CF_USER = "user"
val CF_TWEET = "tweet"
val CF_ENTITIES = "entities"
val CF_PLACES = "places"
val hadoopConf = new Configuration
val conf = HBaseConfiguration.create(hadoopConf)
val admin = new HBaseAdmin(conf)
val tableDesc = new HTableDescriptor(Bytes.toBytes(TABLE_NAME))
// Define column family descriptor
val ColumnFamilyDesc1 = new HColumnDescriptor(Bytes.toBytes(CF_USER))
val ColumnFamilyDesc2 = new HColumnDescriptor(Bytes.toBytes(CF_TWEET))
val ColumnFamilyDesc3 = new HColumnDescriptor(Bytes.toBytes(CF_ENTITIES))
val ColumnFamilyDesc4 = new HColumnDescriptor(Bytes.toBytes(CF_PLACES))
// Add column family in table descriptor
tableDesc.addFamily(ColumnFamilyDesc1)
tableDesc.addFamily(ColumnFamilyDesc2)
tableDesc.addFamily(ColumnFamilyDesc3)
tableDesc.addFamily(ColumnFamilyDesc4)
// Check if the table exists
if (admin.tableExists(TABLE_NAME)){
print(">>>>>" + TABLE_NAME + " already exists <<<<<")
admin.disableTable(TABLE_NAME)
admin.deleteTable(TABLE_NAME)
}
// Create HBASE table
admin.createTable(tableDesc)
val table = new HTable(conf, TABLE_NAME)
/////
val timewindow = 2 // seconds
val ssc = new StreamingContext(sc, Seconds(timewindow))
val cb = new ConfigurationBuilder
val ckey = "ckey"
val csecret = "csecret"
val atoken = "atoken"
val atokensecret = "atokensecret"
cb.setDebugEnabled(true).
setOAuthConsumerKey(ckey).
setOAuthConsumerSecret(csecret).
setOAuthAccessToken(atoken).
setOAuthAccessTokenSecret(atokensecret).
setJSONStoreEnabled(true)
val auth = new OAuthAuthorization(cb.build)
val tweets = TwitterUtils.createStream(ssc,Some(auth))
val status = tweets.filter(_.getLang()=="en")
status.foreachRDD(foreachFunc = rdd => {
rdd.foreachPartition {
records => while (records.hasNext) {
var record = records.next
print("\n\n>>>>"+record)
var tweetID = record.getUser().getId().toString//.isInstanceOf[Int]
print("\ntweetID : "+tweetID)
var tweetBody = record.getText()//.toString
print("\ntweetBody : "+tweetBody)
var favoritesCount = record.getFavoriteCount()//.toInt
print("\nfavoritesCount : "+favoritesCount)
var keyrow = "t_"+tweetID //"t_"+
print("\nkeyrow : "+keyrow+"\n")
var theput= new Put(Bytes.toBytes(keyrow))
theput.add(Bytes.toBytes(CF_TWEET),Bytes.toBytes("tweetid"),Bytes.toBytes(tweetID))
theput.add(Bytes.toBytes(CF_TWEET),Bytes.toBytes("tweetid"),Bytes.toBytes(tweetBody))
theput.add(Bytes.toBytes(CF_USER),Bytes.toBytes("tweetid"),Bytes.toBytes(favoritesCount))
table.put(theput)
}
}
}
)
The code is run on the terminal via:
spark-shell --driver-class-path /opt/hadoop/hbase-1.2.1/lib/hbase-server-1.1.4.jar:/opt/hadoop/hbase-1.2.1/lib/hbase-protocol-1.0.0-cdh5.5.0.jar:/opt/hadoop/hbase-1.2.1/lib/hbase-hadoop2-compat-1.0.0-cdh5.5.0.jar:/opt/hadoop/hbase-1.2.1/lib/hbase-client-1.0.0-cdh5.5.0.jar:/opt/hadoop/hbase-1.2.1/lib/hbase-common-1.0.0-cdh5.5.0.jar:/opt/hadoop/hbase-1.2.1/lib/htrace-core-3.2.0-incubating.jar:/home/cloudera/Desktop/hbase/twitter4jJARS/guava-19.0.jar:/home/cloudera/Desktop/hbase/twitter4jJARS/spark-streaming-twitter_2.10-1.6.1.jar:/home/cloudera/Desktop/hbase/twitter4jJARS/twitter4j-async-4.0.4.jar:/home/cloudera/Desktop/hbase/twitter4jJARS/twitter4j-core-4.0.4.jar:/home/cloudera/Desktop/hbase/twitter4jJARS/twitter4j-examples-4.0.4.jar:/home/cloudera/Desktop/hbase/twitter4jJARS/twitter4j-media-support-4.0.4.jar:/home/cloudera/Desktop/hbase/twitter4jJARS/twitter4j-stream-4.0.4.jar
It says the object org.apache.hadoop.conf.Configuration is not serialisable which mean it does not implement the Serializable interface while it's required. To get rid of that add #transient keyword.
#transient val hadoopConf = new Configuration