writing SparkRDD to a HBase table using Scala - scala

I am trying to write a Spark RDD to an HBase table using Scala (which I haven't used before). The entire code is this:
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import scala.collection.JavaConverters._
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark._
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.rdd.PairRDDFunctions
import org.apache.spark.SparkContext._
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.client._
// Driver from the question: builds an RDD from three Ints, turns each element into an
// (ImmutableBytesWritable, Put) pair and saves the pairs to the HBase table "tablename"
// through the old-API (`mapred`) TableOutputFormat configured on a JobConf.
object HBaseWrite {
def main(args: Array[String]) {
// Local-mode Spark configuration with the Kryo serializer enabled.
val sparkConf = new SparkConf().setAppName("HBaseWrite").setMaster("local").set("spark.driver.allowMultipleContexts","true").set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
val sc = new SparkContext(sparkConf)
val conf = HBaseConfiguration.create()
val outputTable = "tablename"
System.setProperty("user.name", "hdfs")
System.setProperty("HADOOP_USER_NAME", "hdfs")
// HBase / ZooKeeper connection settings, all pointing at localhost.
conf.set("hbase.master", "localhost:60000")
conf.setInt("timeout", 120000)
conf.set("hbase.zookeeper.quorum", "localhost")
conf.set("zookeeper.znode.parent", "/hbase-unsecure")
conf.setInt("hbase.client.scanner.caching", 10000)
// NOTE(review): sparkConf is modified here after `sc` was already created from it;
// presumably the running context does not see this registration - confirm.
sparkConf.registerKryoClasses(Array(classOf[org.apache.hadoop.hbase.client.Result]))
// Output side: the JobConf names the output format and the target table.
val jobConfig: JobConf = new JobConf(conf,this.getClass)
jobConfig.setOutputFormat(classOf[TableOutputFormat])
jobConfig.set(TableOutputFormat.OUTPUT_TABLE,outputTable)
// Sample data: three Ints distributed as an RDD.
val x = 12
val y = 15
val z = 25
var newarray = Array(x,y,z)
val newrddtohbase = sc.parallelize(newarray)
// Builds the pair for one value: a Put whose row is Bytes.toBytes(a) and whose cell
// columnfamily:col_1 holds Bytes.toBytes(a); the pair key is built from the bytes of
// a.toString, i.e. a different encoding of `a` than the Put's row.
// NOTE(review): `convert` is a local method nested inside `main`; passing it to `map`
// below is what the reported "Task not serializable" error is attributed to.
def convert(a:Int) : Tuple2[ImmutableBytesWritable,Put] = {
val p = new Put(Bytes.toBytes(a))
p.add(Bytes.toBytes("columnfamily"),
Bytes.toBytes("col_1"), Bytes.toBytes(a))
new Tuple2[ImmutableBytesWritable,Put](new ImmutableBytesWritable(a.toString.getBytes()), p);
}
// Map every Int to its pair and write the result with the JobConf prepared above.
new PairRDDFunctions(newrddtohbase.map(convert)).saveAsHadoopDataset(jobConfig)
sc.stop()
}
}
The error I get after running HBaseWrite.main(Array()) is this:
org.apache.spark.SparkException: Task not serializable
How do I proceed to get it done?

What you are doing wrong here is defining convert inside main.
If you move convert out of main and write the code like this, it may work:
/**
 * Writes a small RDD of Ints to the HBase table "tablename" through the old-API
 * (`mapred`) TableOutputFormat.
 *
 * `convert` is a member of the object rather than a method nested in `main`, so the
 * function handed to `map` does not drag the local scope of `main` into the task closure.
 */
object HBaseWrite {

  /**
   * Entry point: configures Spark and HBase, builds the sample RDD and saves it.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Kryo classes are registered while the SparkConf is being built: the SparkContext
    // works on its own copy of the configuration, so changes made afterwards are ignored.
    val sparkConf = new SparkConf()
      .setAppName("HBaseWrite")
      .setMaster("local")
      .set("spark.driver.allowMultipleContexts", "true")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .registerKryoClasses(Array(classOf[org.apache.hadoop.hbase.client.Result]))
    val sc = new SparkContext(sparkConf)

    // Stop the context even when the HBase write fails.
    try {
      val conf = HBaseConfiguration.create()
      val outputTable = "tablename"
      System.setProperty("user.name", "hdfs")
      System.setProperty("HADOOP_USER_NAME", "hdfs")
      conf.set("hbase.master", "localhost:60000")
      conf.setInt("timeout", 120000)
      conf.set("hbase.zookeeper.quorum", "localhost")
      conf.set("zookeeper.znode.parent", "/hbase-unsecure")
      conf.setInt("hbase.client.scanner.caching", 10000)

      // The JobConf carries both the output format and the name of the target table.
      val jobConfig: JobConf = new JobConf(conf, this.getClass)
      jobConfig.setOutputFormat(classOf[TableOutputFormat])
      jobConfig.set(TableOutputFormat.OUTPUT_TABLE, outputTable)

      val newrddtohbase = sc.parallelize(Array(12, 15, 25))

      // Eta-expand the object-level method into a function value before passing it to map.
      val convertFunc = convert _
      new PairRDDFunctions(newrddtohbase.map(convertFunc)).saveAsHadoopDataset(jobConfig)
    } finally {
      sc.stop()
    }
  }

  /**
   * Builds the (key, Put) pair for one value.
   *
   * The Put's row is `Bytes.toBytes(a)` and its cell columnfamily:col_1 holds the same
   * encoding of `a`. The pair key is built from the decimal string of `a`.
   * NOTE(review): the pair key and the Put row use different encodings of `a`;
   * presumably the output format only looks at the Put - confirm.
   *
   * @param a the value to store
   * @return the pair expected by `saveAsHadoopDataset`
   */
  def convert(a: Int): Tuple2[ImmutableBytesWritable, Put] = {
    val p = new Put(Bytes.toBytes(a))
    p.add(Bytes.toBytes("columnfamily"), Bytes.toBytes("col_1"), Bytes.toBytes(a))
    // Bytes.toBytes(String) always encodes as UTF-8, independent of the JVM default charset.
    new Tuple2[ImmutableBytesWritable, Put](new ImmutableBytesWritable(Bytes.toBytes(a.toString)), p)
  }
}
P.S.: The code is not tested, but it should work!

For example, the below method takes Int as argument and returns Double
// Function value from Int to Double: returns its argument converted with `toDouble`.
var toDouble: (Int) => Double = _.toDouble
You can use toDouble(2) and it returns 2.0
The same way you can convert your method to function literal as below.
// Function-literal form of `convert`: for a given Int it builds a Put whose row and whose
// columnfamily:col_1 cell both hold Bytes.toBytes(n), and pairs it with a key made from
// the bytes of the decimal string of n.
val convert: (Int) => Tuple2[ImmutableBytesWritable,Put] = n => {
  val put = new Put(Bytes.toBytes(n))
  val family = Bytes.toBytes("columnfamily")
  val qualifier = Bytes.toBytes("col_1")
  put.add(family, qualifier, Bytes.toBytes(n))
  val key = new ImmutableBytesWritable(n.toString.getBytes())
  (key, put)
}

Related

Migrate code Scala to databricks notebook

I am trying to get this code running in a Databricks notebook. It is already tested and working in an IDE, but I cannot get it working in the notebook when I change the structure of the code.
import java.io.{BufferedReader, InputStreamReader}
import java.text.SimpleDateFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
// Snippet from the question: reads a CSV file from S3, maps every non-empty line to a
// record and is meant to write one output CSV row per record.
// NOTE(review): as posted, the snippet references several names it does not define
// (getFileSystem, readCSVFileLines, Revenue, com1Average, com2Average, filtermp,
// avgcredit1, avgcredit2), and PrintWriter / File are not among the imports shown.
object TestUnit {
// Not used anywhere in the code shown.
val dateFormat = new SimpleDateFormat("yyyyMMdd")
// NOTE(review): the only case class declared here is Averages, while the mapping below
// constructs `Revenue` - presumably one of the two names is a leftover; confirm.
case class Averages (cust: String, Num: String, date: String, credit: Double)
def main(args: Array[String]): Unit = {
val inputFile = "s3a://tfsdl-ghd-wb/raidnd/Cleartablet.csv"
val outputFile = "s3a://tfsdl-ghd-wb/raidnd/Incte_19&20.csv"
val fileSystem = getFileSystem(inputFile)
val inputData = readCSVFileLines(fileSystem, inputFile, skipHeader = true)
.toSeq
// Drop empty lines, split on commas and build a record from columns 6, 5, 0 and 8
// (column 8 is parsed as a Double).
val filtinp = inputData.filter(x => x.nonEmpty)
.map(x => x.split(","))
.map(x => Revenue(x(6), x(5), x(0), x(8).toDouble))
// Create output writer
// NOTE(review): outputFile is an s3a:// URI passed to java.io.File - presumably this is
// treated as a local path; confirm.
val writer = new PrintWriter(new File(outputFile))
// Header for output CSV file
writer.write("Date,customer,number,Credit,Average Credit/SKU\n")
// NOTE(review): `filtermp` is not defined in the snippet (presumably `filtinp`), and the
// values bound inside this loop are not used before the loop body ends.
filtinp.foreach{x =>
val (com1, avg1) = com1Average(filtermp, x)
val (com2, avg2) = com2Average(filtermp, x)
}
// Write row to output csv file
// NOTE(review): this line sits after the foreach has closed, so `x` is out of scope here;
// presumably it was meant to be inside the loop - confirm.
writer.write(s"${x.day},${x.customer},${x.number},${x.credit},${avgcredit1},${avgcredit2}\n")
writer.close() // close the writer
}
}

How to implement the Seq.grouped(size:Int): Seq[Seq[A]] for Dataset in Spark

I want to try to implement the
def grouped(size: Int): Iterator[Repr] that Seq has but for Dataset in Spark.
So the input should be ds: Dataset[A], size: Int and output Seq[Dataset[A]] where each of the Dataset[A] in the output can't be bigger than size.
How should I proceed ? I tried with repartition and mapPartitions but I am not sure where to go from there.
Thank you.
Edit: I found the glom method in RDD, but it produces an RDD[Array[A]]. How do I go from this to the other way around, i.e. an Array[RDD[A]]?
here you go, something that you want
/*
{"countries":"pp1"}
{"countries":"pp2"}
{"countries":"pp3"}
{"countries":"pp4"}
{"countries":"pp5"}
{"countries":"pp6"}
{"countries":"pp7"}
*/
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.{SparkConf, SparkContext};
/**
 * Reads `/data.json`, numbers its rows and shows them in consecutive batches of `k` rows.
 *
 * The object declares a plain `main` instead of extending `App`: mixing `App` with an
 * overridden `main` serves no purpose and invites initialization-order surprises.
 */
object SparkApp {

  /**
   * Entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Simple Application")
      .setMaster("local")
      .set("spark.ui.enabled", "false")
    val sc = new SparkContext(conf)

    // Release the context even when reading or showing the data fails.
    try {
      val sqlContext = new SQLContext(sc)
      val dataFrame: DataFrame = sqlContext.read.json("/data.json")
      val k = 3

      // Every row gets the same constant in "grouped", so the window has a single
      // partition and row_number() numbers all rows 1..n in "countries" order.
      // Caveat: a single window partition means all rows are processed together.
      val windowSpec = Window.partitionBy("grouped").orderBy("countries")
      val numbered = dataFrame
        .withColumn("grouped", lit("grouping"))
        .withColumn("row", row_number().over(windowSpec))
        .cache() // reused by count() and by every batch query below

      val totalCount = numbered.count()

      // Batch i covers the row numbers (lowLimit, lowLimit + k].
      (0L until totalCount by k.toLong).foreach { lowLimit =>
        val highLimit = lowLimit + k
        numbered.where(s"row <= $highLimit and row > $lowLimit").show(false)
      }
    } finally {
      sc.stop()
    }
  }
}
Here is the solution I found, but I am not sure whether it works reliably:
/**
 * Splits `input` into randomly assigned batches of roughly `batchSize` rows each.
 *
 * The number of batches is ceil(count / batchSize). Rows are assigned with
 * `randomSplit` using equal weights, so batch sizes are only approximately equal:
 * an individual batch may end up slightly larger than `batchSize`.
 *
 * @param input     the dataset to split
 * @param batchSize target number of rows per batch; must be positive
 * @return the batches, or an empty sequence when `input` has no rows
 * @throws IllegalArgumentException if `batchSize` is not positive
 */
override protected def batch[A](
    input: Dataset[A],
    batchSize: Int
): Seq[Dataset[A]] = {
  require(batchSize > 0, s"batchSize must be positive, got $batchSize")
  val count = input.count()
  if (count == 0L) {
    // Nothing to split; randomSplit cannot be called with an empty weight array.
    Seq.empty
  } else {
    // Integer ceiling division. A plain `count / batchSize` truncates before any rounding
    // can happen, which under-counts the batches and yields zero when count < batchSize.
    val partitionQuantity = ((count + batchSize - 1) / batchSize).toInt
    val weights = Array.fill(partitionQuantity)(1.0 / partitionQuantity)
    input.randomSplit(weights, seed = 0).toSeq
  }
}

flink sink to parquet file with AvroParquetWriter is not writing data to file

I am trying to write a Parquet file as a sink using AvroParquetWriter. The file is created, but with zero length (no data is written). Am I doing something wrong? I couldn't figure out what the problem is.
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import scala.io.Source
import org.apache.flink.streaming.api.scala._
// Snippet from the question: builds one Avro GenericRecord, creates an AvroParquetWriter
// for a file under /tmp and writes the record both directly and from a Flink sink.
// NOTE(review): the closing brace of the object is not part of the excerpt.
object Tester extends App {
val env = StreamExecutionEnvironment.getExecutionEnvironment
// `now` is a method, so the timestamp is taken when `path` below is built.
def now = System.currentTimeMillis()
val path = new Path(s"/tmp/test-$now.parquet")
// The Avro schema is loaded from the classpath resource /request_schema.avsc.
val schemaString = Source.fromURL(getClass.getResource("/request_schema.avsc")).mkString
val schema: Schema = new Schema.Parser().parse(schemaString)
val compressionCodecName = CompressionCodecName.SNAPPY
val config = ParquetWriterConfig()
// Single test record with the fields name, code and ts.
val genericReocrd: GenericRecord = new GenericData.Record(schema)
genericReocrd.put("name", "test_b")
genericReocrd.put("code", "NoError")
genericReocrd.put("ts", 100L)
val stream = env.fromElements(genericReocrd)
// The writer is created once, here in the object body, and overwrites an existing file.
val writer: ParquetWriter[GenericRecord] = AvroParquetWriter.builder[GenericRecord](path)
.withSchema(schema)
.withCompressionCodec(compressionCodecName)
.withPageSize(config.pageSize)
.withRowGroupSize(config.blockSize)
.withDictionaryEncoding(config.enableDictionary)
.withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
.withValidation(config.validating)
.build()
// The record is written once directly ...
writer.write(genericReocrd)
// ... and every stream element is written through the same writer from the sink function.
// NOTE(review): the sink closure captures `writer`, which was built outside the sink;
// presumably that requires the writer to be shipped with the function - confirm.
stream.addSink{r =>
writer.write(r)
}
// NOTE(review): writer.close() is never called in this snippet; the answer that follows
// names this as the reason the file stays empty.
env.execute()
The problem is that you don't close the ParquetWriter. This is necessary to flush pending elements to disk. You could solve the problem by defining your own RichSinkFunction where you close the ParquetWriter in the close method:
/**
 * Flink sink that writes every incoming GenericRecord to a Parquet file and closes the
 * writer when the sink is closed, so that buffered records are flushed to disk.
 *
 * The writer is created in `open`, not in the constructor, so it is never part of the
 * serialized function.
 *
 * NOTE(review): the file is opened in OVERWRITE mode at a fixed path; presumably this is
 * only safe with a single sink instance per path - confirm before raising the parallelism.
 *
 * @param path                 target file path
 * @param schema               Avro schema as a JSON string (parsed in `open`)
 * @param compressionCodecName Parquet compression codec
 * @param config               page size, row-group size, dictionary and validation settings
 */
class ParquetWriterSink(val path: String, val schema: String, val compressionCodecName: CompressionCodecName, val config: ParquetWriterConfig) extends RichSinkFunction[GenericRecord] {

  // Stays null until `open` has completed; marked transient because it is runtime-only state.
  @transient var parquetWriter: ParquetWriter[GenericRecord] = null

  /** Builds the Parquet writer from the constructor arguments. */
  override def open(parameters: Configuration): Unit = {
    parquetWriter = AvroParquetWriter.builder[GenericRecord](new Path(path))
      .withSchema(new Schema.Parser().parse(schema))
      .withCompressionCodec(compressionCodecName)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withDictionaryEncoding(config.enableDictionary)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withValidation(config.validating)
      .build()
  }

  /**
   * Closes the writer, which flushes pending records to the file.
   *
   * Safe to call when `open` never completed (the writer is still null) and safe to call
   * more than once.
   */
  override def close(): Unit = {
    if (parquetWriter != null) {
      try parquetWriter.close()
      finally parquetWriter = null
    }
  }

  /** Writes one record through the Parquet writer. */
  override def invoke(value: GenericRecord, context: SinkFunction.Context[_]): Unit = {
    parquetWriter.write(value)
  }
}

How to write RDD[List[(ImmutableBytesWritable, Put)]] to HBase using saveAsNewAPIHadoopDataset

I know that we can use saveAsNewAPIHadoopDataset with RDD[(ImmutableBytesWritable, Put)] to write to HBase table using spark.
But I have a list, i.e. an RDD[List[(ImmutableBytesWritable, Put)]], which I want to write to 2 different HBase tables.
How to do it?
Below is the code.
package com.scryAnalytics.FeatureExtractionController
import com.scryAnalytics.FeatureExtractionController.DAO.{DocumentEntitiesDAO, NLPEntitiesDAO, SegmentFeaturesDAO}
import com.scryAnalytics.NLPGeneric.{GateGenericNLP, NLPEntities}
import com.sun.xml.bind.v2.TODO
import com.vocp.ner.main.GateNERImpl
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapreduce.Job
import com.scryAnalytics.FeatureExtraction.SegmentsFeatureExtraction
import com.scryAnalytics.FeatureExtraction.DAO.VOCPEntities
import scala.collection.JavaConversions._
import gate.FeatureMap
import java.util.Map.Entry
import scala.collection.JavaConversions
import scala.util.control.Breaks.break
import scala.util.control.ControlThrowable
/**
* Created by sahil on 1/12/16.
*
* Spark driver from the question. It reads rows from the HBase table "posts", rebuilds the
* stored NLP annotations for every row, runs SegmentsFeatureExtraction on them, converts the
* result into HBase Puts and saves them through a Job configured with MultiTableOutputFormat.
*/
object Main {
def main(args: Array[String]): Unit = {
val inputTableName = "posts"
val outputTableName = "drugSegmentNew1"
val pluginHome = "/home/sahil/Voice-of-Cancer-Patients/VOCP Modules/bin/plugins"
val sc = new SparkContext(new SparkConf().setAppName("HBaseRead").setMaster("local[4]"))
val conf = HBaseConfiguration.create()
conf.set(HConstants.ZOOKEEPER_QUORUM, "localhost")
conf.set(TableInputFormat.INPUT_TABLE, inputTableName)
// Create the input table when it is not available.
// NOTE(review): no column family is added to the descriptor before createTable - confirm
// that this is accepted by the HBase version in use.
val admin = new HBaseAdmin(conf)
if (!admin.isTableAvailable(inputTableName)) {
val tableDesc = new HTableDescriptor(TableName.valueOf(inputTableName))
admin.createTable(tableDesc)
}
// The Job only serves as the carrier of the output configuration passed to the save call.
val job: Job = Job.getInstance(conf, "FeatureExtractionJob")
job.setOutputFormatClass(classOf[MultiTableOutputFormat])
// Read the input table as (row key, Result) pairs and keep only the Result.
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[ImmutableBytesWritable], classOf[Result])
val resultRDD = hBaseRDD.map(x => x._2)
// TODO: Add filters
val entity: VOCPEntities = VOCPEntities.DRUG
// Per row: read the message (p:message) and fill an NLPEntitiesDAO from the JSON stored in
// the "gen" and "ner" column families. Emits (row key, message, nlpEntities).
// NOTE(review): a single NLPEntitiesDAO is created per partition and the same instance is
// mutated and emitted for every row; presumably this relies on each tuple being consumed
// before the next row is processed - confirm.
val nlpRDD = resultRDD.mapPartitions { iter =>
val nlpEntities: NLPEntitiesDAO = new NLPEntitiesDAO
iter.map {
result =>
val message = Bytes.toString(result.getValue(Bytes.toBytes("p"), Bytes.toBytes("message")))
val row_key = Bytes.toString(result.getRow)
nlpEntities.setToken(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("token")))))
nlpEntities.setSpaceToken(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("spaceToken")))))
nlpEntities.setSentence(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("sentence")))))
nlpEntities.setVG(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("verbGroup")))))
nlpEntities.setSplit(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("split")))))
nlpEntities.setNounChunk(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("nounChunk")))))
nlpEntities.setDrugs(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("drug")))))
nlpEntities.setRegimen(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("regimen")))))
nlpEntities.setSideEffects(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("sideEffect")))))
nlpEntities.setALT_DRUG(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altDrug")))))
nlpEntities.setALT_THERAPY(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altTherapy")))))
(row_key, message, nlpEntities)
}
}
// NOTE(review): featureExtractionOld is not referenced anywhere else in the code shown.
val featureExtractionOld: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
pluginHome, entity)
// Per partition: create one SegmentsFeatureExtraction, generate the features for every row
// and copy annotations plus segment features into a fresh DocumentEntitiesDAO.
// Emits (row key, documentEntities).
val outputRDD = nlpRDD.mapPartitions { iter =>
val featureExtraction: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
pluginHome, entity)
iter.map { x =>
val featuresJson = featureExtraction.generateFeatures(x._2, Utility.objectToJson(x._3))
val segmentFeatures: SegmentFeaturesDAO = Utility.jsonToSegmentFeatures(featuresJson)
val documentEntities: DocumentEntitiesDAO = new DocumentEntitiesDAO
documentEntities.setSystemId(x._1)
documentEntities.setToken(x._3.getToken)
documentEntities.setSpaceToken(x._3.getSpaceToken)
documentEntities.setSentence(x._3.getSentence)
documentEntities.setVG(x._3.getVG)
documentEntities.setNounChunk(x._3.getNounChunk)
documentEntities.setSplit(x._3.getSplit)
documentEntities.setDRUG(x._3.getDrugs)
documentEntities.setSE(x._3.getSideEffects)
documentEntities.setREG(x._3.getRegimen)
documentEntities.setALT_DRUG(x._3.getALT_DRUG)
documentEntities.setALT_THERAPY(x._3.getALT_THERAPY)
documentEntities.setSegment(segmentFeatures.getSegment)
documentEntities.setSegmentClass(segmentFeatures.getSegmentClass)
documentEntities.setSegmentInstance(segmentFeatures.getSegmentInstance)
(x._1, documentEntities)
}
}
// convertToPut returns a List per input element, so `map` yields an
// RDD[List[(ImmutableBytesWritable, Put)]] - the element type the question asks about.
val newRDD = outputRDD.map { k => convertToPut(k) }
newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
}
// Converts one (row key, DocumentEntitiesDAO) pair into a list of (key, Put) pairs:
// Puts built from the segment instances, keyed with the bytes of "drugSegmentNew1", followed
// by one status Put for the source row, keyed with the bytes of "posts".
// NOTE(review): the pair keys hold table-name strings rather than row keys; presumably this
// is how MultiTableOutputFormat selects the target table - confirm.
def convertToPut(NlpWithRowKey: (String, DocumentEntitiesDAO)): List[(ImmutableBytesWritable, Put)] = {
val rowkey = NlpWithRowKey._1
val documentEntities = NlpWithRowKey._2
var returnList: List[(ImmutableBytesWritable, Put)] = List()
val segmentInstances = documentEntities.getSegmentInstance
val segments = documentEntities.getSegment
// NOTE(review): the null check is on `segments`, while the loop iterates
// `segmentInstances` - confirm that both are always set together.
if(segments != null) {
var count = 0
for(segment <- segmentInstances) {
// Row key of the segment Put: "<systemId>#<running index>".
val keyString: String = documentEntities.getSystemId + "#" + Integer.toString(count)
count = count + 1
val outputKey: ImmutableBytesWritable = new ImmutableBytesWritable(keyString.getBytes())
val put = new Put(outputKey.get())
val features: FeatureMap = segment.getFeatures
val it: Iterator[Entry[Object, Object]] = features.entrySet.iterator()
var sideEffect_offset = "NULL"
var entity_offset = "NULL"
// Distribute the features: the two offsets are remembered for the RelationId, "drug" and
// "sideEffect" go to family "seg", everything else to family "segFeatures".
while(it.hasNext) {
val pair = it.next()
if(pair.getKey.equals("sideEffect-offset")) {
sideEffect_offset = pair.getValue().toString()
}
else if(pair.getKey.equals("drug-offset")) {
entity_offset = pair.getValue().toString()
}
else if(pair.getKey().equals("drug") || pair.getKey().equals("sideEffect")){
put.add(Bytes.toBytes("seg"), Bytes.toBytes(pair.getKey.toString), Bytes
.toBytes(pair.getValue().toString))
}
else {
put.add(Bytes.toBytes("segFeatures"), Bytes.toBytes(pair.getKey.toString), Bytes
.toBytes(pair.getValue().toString))
}
}
// RelationId is "<systemId>-<drug offset>-<side-effect offset>"; an offset that was not
// found among the features stays "NULL".
put.add(Bytes.toBytes("seg"), Bytes.toBytes("RelationId"),
Bytes.toBytes(documentEntities.getSystemId() + "-" + entity_offset + "-" + sideEffect_offset))
// Annotation metadata of the segment instance (family "segInst") and its text (family "seg").
put.add(Bytes.toBytes("segInst"),Bytes.toBytes("id"), Bytes.toBytes(segment.getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("type"), Bytes.toBytes(segment.getType()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeId"), Bytes.toBytes(
segment.getStartNode().getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeOffset"),
Bytes.toBytes(segment.getStartNode().getOffset()))
put.add(Bytes.toBytes("segInst"),Bytes.toBytes("endNodeId"),
Bytes.toBytes(segment.getEndNode().getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("endNodeOffset"),
Bytes.toBytes(segment.getEndNode().getOffset()))
put.add(Bytes.toBytes("seg"),Bytes.toBytes("system_id"),
Bytes.toBytes(documentEntities.getSystemId()))
put.add(Bytes.toBytes("seg"), Bytes.toBytes("segmentText"),
Bytes.toBytes(segment.getAnnotatedText()))
// Look for the segment-class annotation whose "instance-id" feature equals this segment's
// id and copy its metadata into family "segClass".
for(segmentClassAnnots <- documentEntities.getSegmentClass) {
try {
if (segment.getId().equals(segmentClassAnnots.getFeatures().get("instance-id"))) {
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("id"),
Bytes.toBytes(segmentClassAnnots.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("type"),
Bytes.toBytes(segmentClassAnnots.getType()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeId"), Bytes
.toBytes(segmentClassAnnots.getStartNode()
.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeOffset"), Bytes
.toBytes(segmentClassAnnots.getStartNode()
.getOffset()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeId"), Bytes
.toBytes(segmentClassAnnots.getEndNode()
.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeOffset"), Bytes
.toBytes(segmentClassAnnots.getEndNode()
.getOffset()))
// NOTE(review): `break` is called without an enclosing `breakable` in the code shown and
// sits inside a try that catches Throwable; presumably the exception used by `break` is
// caught and printed right below instead of ending the loop - confirm.
break
}
} catch {
case t: Throwable => t.printStackTrace
}
// NOTE(review): this append is inside the loop over getSegmentClass, so the same `put`
// is added once per segment-class annotation and not at all when that collection is
// empty - confirm that this is intended.
returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("drugSegmentNew1")), put))
}
}
}
// Status Put for the source row: f:dStatus = "1", keyed with the bytes of "posts".
val PUT = new Put(Bytes.toBytes(rowkey))
PUT.add(Bytes.toBytes("f"), Bytes.toBytes("dStatus"), Bytes.toBytes("1"))
returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("posts")), PUT))
(returnList)
}
}
Just change your below line :
val newRDD = outputRDD.map { k => convertToPut(k) }
with this line:
// convertToPut returns a List[(ImmutableBytesWritable, Put)] per element; flatMap flattens
// those lists, so newRDD holds (ImmutableBytesWritable, Put) pairs instead of lists of pairs.
val newRDD = outputRDD.flatMap { k => convertToPut(k) }
Hope this helps!

how to print Map[String, Array[Float]] in scala?

I am using word2vec function which is inside mllib library of Spark. I want to print word vectors which I am getting as output to "getVectors" function
My code looks like this:
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
/**
 * Trains a Word2Vec model on a CSV file, saves it, loads it back and prints the word
 * vectors of the reloaded model.
 */
object word2vec {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("word2vec"))

    // Every line of the file becomes one sequence made of its comma-separated fields.
    val sentences = sc.textFile("file:///home/snap-01/balance.csv").map(_.split(",").toSeq)

    val model = new Word2Vec().fit(sentences)
    model.save(sc, "myModelPath")

    val reloaded = Word2VecModel.load(sc, "myModelPath")
    // Prints the vector map as it is, using the default string form of its values.
    print(reloaded.getVectors)
  }
}
I am getting "Map(Balance -> [F@2932e15f)"
Try this :
// Print one line per word. mkString renders the elements of the Array[Float]; without it
// only the array's default string form would be shown. A comma is used as the separator
// because a "-" would be indistinguishable from the sign of negative components.
vec.foreach { case (key, values) =>
  val rendered = values.mkString(", ")
  println(s"key $key - $rendered")
}
Alternatively,
// Turn every Array[Float] into a List so that the map's string form shows the elements.
// A strict `map` is used instead of `mapValues`: `mapValues` only builds a lazy view, and on
// Scala 2.13 printing that view shows "MapView(<not computed>)" instead of the contents.
println(vec.map { case (word, vector) => word -> vector.toList })
But keep an eye on the memory required to do so.