Working to get this code running using notebooks in databricks(already tested and working with an IDE), can not get this working if I change the structure of the code.
import java.io.{BufferedReader, InputStreamReader}
import java.text.SimpleDateFormat
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
object TestUnit {
val dateFormat = new SimpleDateFormat("yyyyMMdd")
case class Averages (cust: String, Num: String, date: String, credit: Double)
def main(args: Array[String]): Unit = {
val inputFile = "s3a://tfsdl-ghd-wb/raidnd/Cleartablet.csv"
val outputFile = "s3a://tfsdl-ghd-wb/raidnd/Incte_19&20.csv"
val fileSystem = getFileSystem(inputFile)
val inputData = readCSVFileLines(fileSystem, inputFile, skipHeader = true)
.toSeq
val filtinp = inputData.filter(x => x.nonEmpty)
.map(x => x.split(","))
.map(x => Revenue(x(6), x(5), x(0), x(8).toDouble))
// Create output writer
val writer = new PrintWriter(new File(outputFile))
// Header for output CSV file
writer.write("Date,customer,number,Credit,Average Credit/SKU\n")
filtinp.foreach{x =>
val (com1, avg1) = com1Average(filtermp, x)
val (com2, avg2) = com2Average(filtermp, x)
}
// Write row to output csv file
writer.write(s"${x.day},${x.customer},${x.number},${x.credit},${avgcredit1},${avgcredit2}\n")
writer.close() // close the writer`
}
}
I want to try to implement the
def grouped(size: Int): Iterator[Repr] that Seq has but for Dataset in Spark.
So the input should be ds: Dataset[A], size: Int and output Seq[Dataset[A]] where each of the Dataset[A] in the output can't be bigger than size.
How should I proceed ? I tried with repartition and mapPartitions but I am not sure where to go from there.
Thank you.
Edit: I found the glom method in RDD but it produce a RDD[Array[A]] how do I go from this to the other way around Array[RDD[A]] ?
here you go, something that you want
/*
{"countries":"pp1"}
{"countries":"pp2"}
{"countries":"pp3"}
{"countries":"pp4"}
{"countries":"pp5"}
{"countries":"pp6"}
{"countries":"pp7"}
*/
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.{SparkConf, SparkContext};
object SparkApp extends App {
override def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Simple Application").setMaster("local").set("spark.ui.enabled", "false")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
val dataFrame: DataFrame = sqlContext.read.json("/data.json")
val k = 3
val windowSpec = Window.partitionBy("grouped").orderBy("countries")
val newDF = dataFrame.withColumn("grouped", lit("grouping"))
var latestDF = newDF.withColumn("row", row_number() over windowSpec)
val totalCount = latestDF.count()
var lowLimit = 0
var highLimit = lowLimit + k
while(lowLimit < totalCount){
latestDF.where(s"row <= $highLimit and row > $lowLimit").show(false)
lowLimit = lowLimit + k
highLimit = highLimit + k
}
}
}
Here is the solution I found but I am not sure if that can works reliably:
override protected def batch[A](
input: Dataset[A],
batchSize: Int
): Seq[Dataset[A]] = {
val count = input.count()
val partitionQuantity = Math.ceil(count / batchSize).toInt
input.randomSplit(Array.fill(partitionQuantity)(1.0 / partitionQuantity), seed = 0)
}
I am trying to write a parquet file as sink using AvroParquetWriter. The file is created but with 0 length (no data is written). am I doing something wrong ? couldn't figure out what is the problem
import io.eels.component.parquet.ParquetWriterConfig
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.hadoop.metadata.CompressionCodecName
import scala.io.Source
import org.apache.flink.streaming.api.scala._
object Tester extends App {
val env = StreamExecutionEnvironment.getExecutionEnvironment
def now = System.currentTimeMillis()
val path = new Path(s"/tmp/test-$now.parquet")
val schemaString = Source.fromURL(getClass.getResource("/request_schema.avsc")).mkString
val schema: Schema = new Schema.Parser().parse(schemaString)
val compressionCodecName = CompressionCodecName.SNAPPY
val config = ParquetWriterConfig()
val genericReocrd: GenericRecord = new GenericData.Record(schema)
genericReocrd.put("name", "test_b")
genericReocrd.put("code", "NoError")
genericReocrd.put("ts", 100L)
val stream = env.fromElements(genericReocrd)
val writer: ParquetWriter[GenericRecord] = AvroParquetWriter.builder[GenericRecord](path)
.withSchema(schema)
.withCompressionCodec(compressionCodecName)
.withPageSize(config.pageSize)
.withRowGroupSize(config.blockSize)
.withDictionaryEncoding(config.enableDictionary)
.withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
.withValidation(config.validating)
.build()
writer.write(genericReocrd)
stream.addSink{r =>
writer.write(r)
}
env.execute()
The problem is that you don't close the ParquetWriter. This is necessary to flush pending elements to disk. You could solve the problem by defining your own RichSinkFunction where you close the ParquetWriter in the close method:
class ParquetWriterSink(val path: String, val schema: String, val compressionCodecName: CompressionCodecName, val config: ParquetWriterConfig) extends RichSinkFunction[GenericRecord] {
var parquetWriter: ParquetWriter[GenericRecord] = null
override def open(parameters: Configuration): Unit = {
parquetWriter = AvroParquetWriter.builder[GenericRecord](new Path(path))
.withSchema(new Schema.Parser().parse(schema))
.withCompressionCodec(compressionCodecName)
.withPageSize(config.pageSize)
.withRowGroupSize(config.blockSize)
.withDictionaryEncoding(config.enableDictionary)
.withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
.withValidation(config.validating)
.build()
}
override def close(): Unit = {
parquetWriter.close()
}
override def invoke(value: GenericRecord, context: SinkFunction.Context[_]): Unit = {
parquetWriter.write(value)
}
}
I know that we can use saveAsNewAPIHadoopDataset with RDD[(ImmutableBytesWritable, Put)] to write to HBase table using spark.
But I have a list i.e RDD[List[(ImmutableBytesWritable, Put)] which I want to write 2 different HBase Tables.
How to do it?
Below is the code.
package com.scryAnalytics.FeatureExtractionController
import com.scryAnalytics.FeatureExtractionController.DAO.{DocumentEntitiesDAO, NLPEntitiesDAO, SegmentFeaturesDAO}
import com.scryAnalytics.NLPGeneric.{GateGenericNLP, NLPEntities}
import com.sun.xml.bind.v2.TODO
import com.vocp.ner.main.GateNERImpl
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{HBaseAdmin, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{MultiTableOutputFormat, TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.mapreduce.Job
import com.scryAnalytics.FeatureExtraction.SegmentsFeatureExtraction
import com.scryAnalytics.FeatureExtraction.DAO.VOCPEntities
import scala.collection.JavaConversions._
import gate.FeatureMap
import java.util.Map.Entry
import scala.collection.JavaConversions
import scala.util.control.Breaks.break
import scala.util.control.ControlThrowable
/**
* Created by sahil on 1/12/16.
*/
object Main {
def main(args: Array[String]): Unit = {
val inputTableName = "posts"
val outputTableName = "drugSegmentNew1"
val pluginHome = "/home/sahil/Voice-of-Cancer-Patients/VOCP Modules/bin/plugins"
val sc = new SparkContext(new SparkConf().setAppName("HBaseRead").setMaster("local[4]"))
val conf = HBaseConfiguration.create()
conf.set(HConstants.ZOOKEEPER_QUORUM, "localhost")
conf.set(TableInputFormat.INPUT_TABLE, inputTableName)
val admin = new HBaseAdmin(conf)
if (!admin.isTableAvailable(inputTableName)) {
val tableDesc = new HTableDescriptor(TableName.valueOf(inputTableName))
admin.createTable(tableDesc)
}
val job: Job = Job.getInstance(conf, "FeatureExtractionJob")
job.setOutputFormatClass(classOf[MultiTableOutputFormat])
val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
classOf[ImmutableBytesWritable], classOf[Result])
val resultRDD = hBaseRDD.map(x => x._2)
// TODO: Add filters
val entity: VOCPEntities = VOCPEntities.DRUG
val nlpRDD = resultRDD.mapPartitions { iter =>
val nlpEntities: NLPEntitiesDAO = new NLPEntitiesDAO
iter.map {
result =>
val message = Bytes.toString(result.getValue(Bytes.toBytes("p"), Bytes.toBytes("message")))
val row_key = Bytes.toString(result.getRow)
nlpEntities.setToken(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("token")))))
nlpEntities.setSpaceToken(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("spaceToken")))))
nlpEntities.setSentence(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("sentence")))))
nlpEntities.setVG(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("verbGroup")))))
nlpEntities.setSplit(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("split")))))
nlpEntities.setNounChunk(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("gen"), Bytes.toBytes("nounChunk")))))
nlpEntities.setDrugs(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("drug")))))
nlpEntities.setRegimen(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("regimen")))))
nlpEntities.setSideEffects(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("sideEffect")))))
nlpEntities.setALT_DRUG(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altDrug")))))
nlpEntities.setALT_THERAPY(Utility.jsonToAnnotations(Bytes.toString(
result.getValue(Bytes.toBytes("ner"), Bytes.toBytes("altTherapy")))))
(row_key, message, nlpEntities)
}
}
val featureExtractionOld: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
pluginHome, entity)
val outputRDD = nlpRDD.mapPartitions { iter =>
val featureExtraction: SegmentsFeatureExtraction = new SegmentsFeatureExtraction(
pluginHome, entity)
iter.map { x =>
val featuresJson = featureExtraction.generateFeatures(x._2, Utility.objectToJson(x._3))
val segmentFeatures: SegmentFeaturesDAO = Utility.jsonToSegmentFeatures(featuresJson)
val documentEntities: DocumentEntitiesDAO = new DocumentEntitiesDAO
documentEntities.setSystemId(x._1)
documentEntities.setToken(x._3.getToken)
documentEntities.setSpaceToken(x._3.getSpaceToken)
documentEntities.setSentence(x._3.getSentence)
documentEntities.setVG(x._3.getVG)
documentEntities.setNounChunk(x._3.getNounChunk)
documentEntities.setSplit(x._3.getSplit)
documentEntities.setDRUG(x._3.getDrugs)
documentEntities.setSE(x._3.getSideEffects)
documentEntities.setREG(x._3.getRegimen)
documentEntities.setALT_DRUG(x._3.getALT_DRUG)
documentEntities.setALT_THERAPY(x._3.getALT_THERAPY)
documentEntities.setSegment(segmentFeatures.getSegment)
documentEntities.setSegmentClass(segmentFeatures.getSegmentClass)
documentEntities.setSegmentInstance(segmentFeatures.getSegmentInstance)
(x._1, documentEntities)
}
}
val newRDD = outputRDD.map { k => convertToPut(k) }
newRDD.saveAsNewAPIHadoopDataset(job.getConfiguration())
}
def convertToPut(NlpWithRowKey: (String, DocumentEntitiesDAO)): List[(ImmutableBytesWritable, Put)] = {
val rowkey = NlpWithRowKey._1
val documentEntities = NlpWithRowKey._2
var returnList: List[(ImmutableBytesWritable, Put)] = List()
val segmentInstances = documentEntities.getSegmentInstance
val segments = documentEntities.getSegment
if(segments != null) {
var count = 0
for(segment <- segmentInstances) {
val keyString: String = documentEntities.getSystemId + "#" + Integer.toString(count)
count = count + 1
val outputKey: ImmutableBytesWritable = new ImmutableBytesWritable(keyString.getBytes())
val put = new Put(outputKey.get())
val features: FeatureMap = segment.getFeatures
val it: Iterator[Entry[Object, Object]] = features.entrySet.iterator()
var sideEffect_offset = "NULL"
var entity_offset = "NULL"
while(it.hasNext) {
val pair = it.next()
if(pair.getKey.equals("sideEffect-offset")) {
sideEffect_offset = pair.getValue().toString()
}
else if(pair.getKey.equals("drug-offset")) {
entity_offset = pair.getValue().toString()
}
else if(pair.getKey().equals("drug") || pair.getKey().equals("sideEffect")){
put.add(Bytes.toBytes("seg"), Bytes.toBytes(pair.getKey.toString), Bytes
.toBytes(pair.getValue().toString))
}
else {
put.add(Bytes.toBytes("segFeatures"), Bytes.toBytes(pair.getKey.toString), Bytes
.toBytes(pair.getValue().toString))
}
}
put.add(Bytes.toBytes("seg"), Bytes.toBytes("RelationId"),
Bytes.toBytes(documentEntities.getSystemId() + "-" + entity_offset + "-" + sideEffect_offset))
put.add(Bytes.toBytes("segInst"),Bytes.toBytes("id"), Bytes.toBytes(segment.getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("type"), Bytes.toBytes(segment.getType()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeId"), Bytes.toBytes(
segment.getStartNode().getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("startNodeOffset"),
Bytes.toBytes(segment.getStartNode().getOffset()))
put.add(Bytes.toBytes("segInst"),Bytes.toBytes("endNodeId"),
Bytes.toBytes(segment.getEndNode().getId()))
put.add(Bytes.toBytes("segInst"), Bytes.toBytes("endNodeOffset"),
Bytes.toBytes(segment.getEndNode().getOffset()))
put.add(Bytes.toBytes("seg"),Bytes.toBytes("system_id"),
Bytes.toBytes(documentEntities.getSystemId()))
put.add(Bytes.toBytes("seg"), Bytes.toBytes("segmentText"),
Bytes.toBytes(segment.getAnnotatedText()))
for(segmentClassAnnots <- documentEntities.getSegmentClass) {
try {
if (segment.getId().equals(segmentClassAnnots.getFeatures().get("instance-id"))) {
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("id"),
Bytes.toBytes(segmentClassAnnots.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("type"),
Bytes.toBytes(segmentClassAnnots.getType()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeId"), Bytes
.toBytes(segmentClassAnnots.getStartNode()
.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("startNodeOffset"), Bytes
.toBytes(segmentClassAnnots.getStartNode()
.getOffset()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeId"), Bytes
.toBytes(segmentClassAnnots.getEndNode()
.getId()))
put.add(Bytes.toBytes("segClass"), Bytes.toBytes("endNodeOffset"), Bytes
.toBytes(segmentClassAnnots.getEndNode()
.getOffset()))
break
}
} catch {
case t: Throwable => t.printStackTrace
}
returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("drugSegmentNew1")), put))
}
}
}
val PUT = new Put(Bytes.toBytes(rowkey))
PUT.add(Bytes.toBytes("f"), Bytes.toBytes("dStatus"), Bytes.toBytes("1"))
returnList = returnList:+((new ImmutableBytesWritable(Bytes.toBytes("posts")), PUT))
(returnList)
}
}
Just change your below line :
val newRDD = outputRDD.map { k => convertToPut(k) }
with this line:
val newRDD = outputRDD.flatMap { k => convertToPut(k) }
Hope this helps!
I am using word2vec function which is inside mllib library of Spark. I want to print word vectors which I am getting as output to "getVectors" function
My code looks like this:
import org.apache.spark._
import org.apache.spark.rdd._
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.feature.{Word2Vec, Word2VecModel}
object word2vec {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("word2vec")
val sc = new SparkContext(conf)
val input = sc.textFile("file:///home/snap-01/balance.csv").map(line => line.split(",").toSeq)
val word2vec = new Word2Vec()
val model = word2vec.fit(input)
model.save(sc, "myModelPath")
val sameModel = Word2VecModel.load(sc, "myModelPath")
val vec = sameModel.getVectors
print(vec)
}
}
I am getting "Map(Balance -> [F#2932e15f)"
Try this :
vec.foreach { case (key, values) => println("key " + key + " - " + values.mkString("-")
}
Alternatively,
println(vec.mapValues(_.toList))
But keep an eye on the memory required to do so.