I am stuck on an issue. I have created a Scala Maven project where I am trying to communicate with MongoDB. The project builds successfully and I get no errors when I run it, but still no data is inserted into MongoDB.
If I insert data manually through the mongo CLI then my code can read it, but it is not inserting a single record itself.
Could you please help me find where I am making a mistake?
My MongoDB version is 3.2.10.
App.scala
package com.assignment.scala
import org.mongodb.scala._
import org.mongodb.scala.model.Aggregates._
import org.mongodb.scala.model.Filters._
import org.mongodb.scala.model.Projections._
import org.mongodb.scala.model.Sorts._
import org.mongodb.scala.model.Updates._
import org.mongodb.scala.model._
/**
 * @author ${user.name}
 */
object App {
  def main(args: Array[String]) {
    println("Calling insertDoc==")
    insertDoc()
  }

  def insertDoc() = {
    val mongoClient = MongoClient()
    val database = mongoClient.getDatabase("assignment")
    val collection = database.getCollection("links")
    println("collection find : " + collection.find())
    collection.find().subscribe(
      (user: Document) => println("document------------" + user.toJson()),
      (error: Throwable) => println(s"Query failed: ${error.getMessage}"),
      () => println("Done"))
    collection.drop()

    val bufferedSource = io.Source.fromFile("/home/impadmin/ServiceSource/ml-latest-small/links.csv")
    var firstLine = List[String]()
    var docList = List[Document]()
    for ((line, count) <- bufferedSource.getLines.zipWithIndex) {
      val cols = line.split(",").map(_.trim)
      if (firstLine.size <= 0 && cols.length > 0) {
        firstLine = cols.toList
      } else {
        var doc: Document = Document("_id" -> count)
        for ((a, b) <- firstLine zip cols) {
          doc ++= Document(a -> b)
        }
        docList = docList :+ doc
        collection.insertOne(doc)
      }
    }

    val doc3 = Document("name" -> "MongoDB", "type" -> "database",
      "count" -> 1, "info" -> Document("x" -> 203, "y" -> 102))
    collection.insertOne(doc3) // was insertOne(doc); doc is out of scope here
    collection.find().subscribe((doc1: Document) => println(doc1.toJson()),
      (e: Throwable) => println(s"There was an error: $e"))
  }
}
It worked using subscribe:
// needs: import java.util.concurrent.CountDownLatch
val insertObservable1: Observable[Completed] = collection.insertMany(docList)
val latch1 = new CountDownLatch(1)
insertObservable1.subscribe(new Observer[Completed] {

  override def onError(e: Throwable): Unit = {
    println("Error1")
    e.printStackTrace()
  }

  override def onSubscribe(subscription: Subscription): Unit = {
    subscription.request(java.lang.Long.MAX_VALUE)
    println("inserting1-------------")
  }

  override def onComplete(): Unit = {
    println("Completed1")
    latch1.countDown()
  }

  override def onNext(result: Completed): Unit = {
    println("Next1-----------")
  }
})
latch1.await()
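For completeness: the underlying cause is that the MongoDB Scala driver's Observables are deferred ("cold"), so the drop() and insertOne() calls in the original code never executed because nothing subscribed to them. Besides the explicit Observer above, a shorter option is to convert each Observable to a Future and await it. A minimal sketch, assuming the same "assignment" database and "links" collection; the documents are placeholders for the list built from links.csv:
import org.mongodb.scala._
import scala.concurrent.Await
import scala.concurrent.duration._

object InsertWithAwait extends App {
  val mongoClient = MongoClient()
  val collection = mongoClient.getDatabase("assignment").getCollection("links")

  // Placeholder documents standing in for the docList built from links.csv.
  val docList = Seq(
    Document("_id" -> 1, "movieId" -> "1", "imdbId" -> "0114709"),
    Document("_id" -> 2, "movieId" -> "2", "imdbId" -> "0113497"))

  // The driver's Observables are deferred: converting to a Future and awaiting
  // it forces the operation to actually run.
  Await.result(collection.drop().toFuture(), 10.seconds)
  Await.result(collection.insertMany(docList).toFuture(), 10.seconds)

  Await.result(collection.find().toFuture(), 10.seconds)
    .foreach(doc => println(doc.toJson()))

  mongoClient.close()
}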
The imports I used:
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{MapStateDescriptor, ValueState, ValueStateDescriptor}
import org.apache.flink.api.scala.typeutils.Types
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.BroadcastStream
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
Here's the code
val env = StreamExecutionEnvironment.getExecutionEnvironment
val properties = new Properties()
properties.setProperty("bootstrap.servers", "localhost:9092")

val patternStream = new FlinkKafkaConsumer010("patterns", new SimpleStringSchema, properties)
val patterns = env.addSource(patternStream)
var patternData = patterns.map { str =>
  val splitted_str = str.split(",")
  PatternStream(splitted_str(0).trim, splitted_str(1).trim, splitted_str(2).trim)
}

val logsStream = new FlinkKafkaConsumer010("logs", new SimpleStringSchema, properties)
// logsStream.setStartFromEarliest()
val logs = env.addSource(logsStream)
var data = logs.map { str =>
  val splitted_str = str.split(",")
  LogsTest(splitted_str.head.trim, splitted_str(1).trim, splitted_str(2).trim)
}

val keyedData: KeyedStream[LogsTest, String] = data.keyBy(_.metric)

// first type parameter is the key type, second is the value type
val bcStateDescriptor = new MapStateDescriptor[Unit, PatternStream]("patterns", Types.UNIT, Types.of[PatternStream])
val broadcastPatterns: BroadcastStream[PatternStream] = patternData.broadcast(bcStateDescriptor)

val alerts = keyedData
  .connect(broadcastPatterns)
  .process(new PatternEvaluator())

alerts.print()
// println(alerts.getClass)
// val sinkProducer = new FlinkKafkaProducer010("output", new SimpleStringSchema(), properties)

env.execute("Flink Broadcast State Job")
class PatternEvaluator()
  extends KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)] {

  private lazy val patternStateDescriptor = new MapStateDescriptor("patterns", classOf[String], classOf[String])
  private var lastMetricState: ValueState[String] = _

  override def open(parameters: Configuration): Unit = {
    val lastMetricDescriptor = new ValueStateDescriptor("last-metric", classOf[String])
    lastMetricState = getRuntimeContext.getState(lastMetricDescriptor)
  }

  override def processElement(reading: LogsTest,
                              readOnlyCtx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#ReadOnlyContext,
                              out: Collector[(String, String, String)]): Unit = {
    val metrics = readOnlyCtx.getBroadcastState(patternStateDescriptor)
    if (metrics.contains(reading.metric)) {
      val metricPattern: String = metrics.get(reading.metric)
      val metricPatternValue: String = metrics.get(reading.value)
      val lastMetric = lastMetricState.value()
      val logsMetric = reading.metric
      val logsValue = reading.value
      if (logsMetric == metricPattern) {
        if (metricPatternValue == logsValue) {
          out.collect((reading.timestamp, reading.value, reading.metric))
        }
      }
    }
  }

  override def processBroadcastElement(
      update: PatternStream,
      ctx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#Context,
      out: Collector[(String, String, String)]
  ): Unit = {
    val patterns = ctx.getBroadcastState(patternStateDescriptor)
    if (update.metric == "IP") {
      patterns.put(update.metric /*, update.operator*/ , update.value)
    }
    // else if (update.metric == "username") {
    //   patterns.put(update.metric, update.value)
    // }
    // else {
    //   println("No required data found")
    // }
  }
}
Sample data: Logs stream
"21/09/98","IP", "5.5.5.5"
Pattern stream
"IP","==","5.5.5.5"
I'm unable to get the desired result from the analysis, i.e. 21/09/98,IP,5.5.5.5.
There's no error as of now; the job just isn't matching the data.
The code is reading both streams (checked).
One common source of trouble in cases like this is that the API offers no control over the order in which the patterns and the data are ingested. It could be that processElement is being called before processBroadcastElement.
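If ordering turns out to be the problem, a common workaround is to buffer readings whose pattern has not arrived yet and re-evaluate them when the broadcast side is updated. Below is a minimal sketch of that idea, assuming the LogsTest and PatternStream case classes from the question; the "buffered-readings" ListState and the class name are illustrative, not part of the original code.
import org.apache.flink.api.common.state.{KeyedStateFunction, ListState, ListStateDescriptor, MapStateDescriptor}
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction
import org.apache.flink.util.Collector

class BufferingPatternEvaluator
  extends KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)] {

  private val patternStateDescriptor =
    new MapStateDescriptor("patterns", classOf[String], classOf[String])

  // Keyed buffer for readings that arrive before their pattern (illustrative).
  private val bufferedReadings =
    new ListStateDescriptor("buffered-readings", classOf[LogsTest])

  override def processElement(reading: LogsTest,
                              readOnlyCtx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#ReadOnlyContext,
                              out: Collector[(String, String, String)]): Unit = {
    val patterns = readOnlyCtx.getBroadcastState(patternStateDescriptor)
    if (patterns.contains(reading.metric)) {
      if (patterns.get(reading.metric) == reading.value)
        out.collect((reading.timestamp, reading.value, reading.metric))
    } else {
      // No pattern for this metric yet: keep the reading instead of dropping it.
      getRuntimeContext.getListState(bufferedReadings).add(reading)
    }
  }

  override def processBroadcastElement(update: PatternStream,
                                       ctx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#Context,
                                       out: Collector[(String, String, String)]): Unit = {
    ctx.getBroadcastState(patternStateDescriptor).put(update.metric, update.value)

    // Re-check the buffered readings of every key now that a new pattern is known.
    ctx.applyToKeyedState(bufferedReadings,
      new KeyedStateFunction[String, ListState[LogsTest]] {
        override def process(key: String, state: ListState[LogsTest]): Unit = {
          val buffered = state.get()
          if (buffered != null) {
            val remaining = new java.util.ArrayList[LogsTest]()
            val it = buffered.iterator()
            while (it.hasNext) {
              val r = it.next()
              if (r.metric == update.metric) {
                if (r.value == update.value)
                  out.collect((r.timestamp, r.value, r.metric))
              } else {
                remaining.add(r) // still waiting for its own pattern
              }
            }
            state.update(remaining)
          }
        }
      })
  }
}
Note also that the MapStateDescriptor used inside the function should match the one passed to patternData.broadcast(...) (same name and key/value types); in the code above the broadcast side is declared as [Unit, PatternStream] while the function reads it as [String, String], which is worth double-checking as well.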
I'm trying to create and download an archive file without holding it in memory (to avoid an out-of-memory exception for large files). I'm using Play Framework 2.3 with Scala. After some research I found an example, https://gist.github.com/kirked/412b5156f94419e71ce4a84ec1d54761, and made some modifications to it. My problem is that when I download the file and try to open it I get this error: "An error occurred while loading the archive."
Here is all the code:
def zip() = Action { implicit request: Request[AnyContent] =>
  val buffer = new ZipBuffer(10000)

  val writeCentralDirectory = Enumerator.generateM(Future {
    if (buffer.isClosed) {
      None
    } else {
      buffer.flush
      buffer.close
      Some(buffer.bytes)
    }
  })

  val test = Enumerator.apply(ResolvedSource2("test.txt", "helllo"))

  Ok.chunked(test &> zipeach2(buffer) andThen writeCentralDirectory >>> Enumerator.eof)
    .as(withCharset("application/zip"))
    .withHeaders(CONTENT_DISPOSITION -> s"attachment; filename=aa.zip")
}

case class ResolvedSource2(filepath: String, stream: String)

def zipeach2(buffer: ZipBuffer)(implicit ec: ExecutionContext): Enumeratee[ResolvedSource2, Array[Byte]] = {
  Enumeratee.mapConcat[ResolvedSource2] { source =>
    buffer.zipStream.putNextEntry(new ZipEntry(source.filepath))
    var done = false

    def entryDone: Unit = {
      done = true
      buffer.zipStream.closeEntry
    }

    def restOfStream: Stream[Array[Byte]] = {
      if (done) Stream.empty
      else {
        while (!done && !buffer.full) {
          try {
            val byte = source.stream
            buffer.zipStream.write(byte.getBytes)
            entryDone
          } catch {
            case e: IOException =>
              println(s"reading/zipping stream [${source.filepath}]", e)
          }
        }
        buffer.bytes #:: restOfStream
      }
    }

    restOfStream
  }
}
class ZipBuffer(capacity: Int) {
  private val buf = new ByteArrayOutputStream(capacity)
  private var closed = false
  val zipStream = new ZipOutputStream(buf)

  def close(): Unit = {
    if (!closed) {
      closed = true
      reset
      zipStream.finish()
      zipStream.close // writes central directory
    }
  }

  def flush() = {
    zipStream.flush()
  }

  def isClosed = closed

  def reset: Unit = buf.reset

  def full: Boolean = buf.size >= capacity

  def bytes: Array[Byte] = {
    val result = buf.toByteArray
    reset
    result
  }
}
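Since there is no server-side error, it can help to first establish whether ZipBuffer itself produces a well-formed archive when its bytes are emitted in the same order as in the action (entry bytes first, then the flush/close bytes), independently of Play's chunked response. A small self-test sketch, assuming the ZipBuffer class above; the entry name and content are simply the ones from the test enumerator:
import java.io.ByteArrayInputStream
import java.util.zip.{ZipEntry, ZipInputStream}
import scala.collection.mutable.ArrayBuffer

object ZipBufferSelfTest extends App {
  val buffer = new ZipBuffer(10000)
  val chunks = ArrayBuffer[Array[Byte]]()

  // Mimic what zipeach2 does for a single entry.
  buffer.zipStream.putNextEntry(new ZipEntry("test.txt"))
  buffer.zipStream.write("helllo".getBytes)
  buffer.zipStream.closeEntry()
  chunks += buffer.bytes // what the Enumeratee would emit

  // Mimic writeCentralDirectory.
  buffer.flush()
  buffer.close()
  chunks += buffer.bytes // should carry the central directory

  // Reassemble the chunks and check that java.util.zip can read them back.
  val archive: Array[Byte] = chunks.flatten.toArray
  val zin = new ZipInputStream(new ByteArrayInputStream(archive))
  var entry = zin.getNextEntry
  while (entry != null) {
    println(s"readable entry: ${entry.getName}")
    entry = zin.getNextEntry
  }
  // If nothing is printed (or an exception is thrown), the corruption is in the
  // buffer/ordering logic; if the entry shows up, the problem is more likely in
  // how the chunks are streamed by the action (headers, charset, enumerator order).
}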
I am unable to build a fat jar for my Kafka-SparkStructuredStreaming-MongoDB pipeline.
I have built StructuredStreamingProgram: it receives streaming data from Kafka topics, applies some parsing, and my intention is then to save the structured streaming data into a MongoDB collection.
I have followed this article to build my pipeline: https://learningfromdata.blog/2017/04/16/real-time-data-ingestion-with-apache-spark-structured-streaming-implementation/
I have created Helpers.scala and MongoDBForeachWriter.scala as suggested in the article for my streaming pipeline and saved them under src/main/scala/example.
When I run sbt assembly to build a fat jar, I get this error:
"[error] C:\spark_streaming\src\main\scala\example\structuredStreamApp.scala:63: class MongoDBForeachWriter is abstract; cannot be instantiated
[error] val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri,mdb_name,mdb_collection,CountAccum)"
I need guidance in making this pipeline work.
Any help will be appreciated
package example

import java.util.Calendar
import org.apache.spark.util.LongAccumulator
import org.apache.spark.sql.Row
import org.apache.spark.sql.ForeachWriter
import org.mongodb.scala._
import org.mongodb.scala.bson.collection.mutable.Document
import org.mongodb.scala.bson._
import example.Helpers._

abstract class MongoDBForeachWriter(p_uri: String,
                                    p_dbName: String,
                                    p_collectionName: String,
                                    p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {

  val mongodbURI = p_uri
  val dbName = p_dbName
  val collectionName = p_collectionName
  val messageCountAccum = p_messageCountAccum

  var mongoClient: MongoClient = null
  var db: MongoDatabase = null
  var collection: MongoCollection[Document] = null

  def ensureMongoDBConnection(): Unit = {
    if (mongoClient == null) {
      mongoClient = MongoClient(mongodbURI)
      db = mongoClient.getDatabase(dbName)
      collection = db.getCollection(collectionName)
    }
  }

  override def open(partitionId: Long, version: Long): Boolean = {
    true
  }

  override def process(record: Row): Unit = {
    val valueStr = new String(record.getAs[Array[Byte]]("value"))

    val doc: Document = Document(valueStr)
    doc += ("log_time" -> Calendar.getInstance().getTime())

    // lazy opening of MongoDB connection
    ensureMongoDBConnection()
    val result = collection.insertOne(doc).results()

    // tracks how many records I have processed
    if (messageCountAccum != null)
      messageCountAccum.add(1)
  }
}
package example

import java.util.concurrent.TimeUnit
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import org.mongodb.scala._

object Helpers {

  implicit class DocumentObservable[C](val observable: Observable[Document]) extends ImplicitObservable[Document] {
    override val converter: (Document) => String = (doc) => doc.toJson
  }

  implicit class GenericObservable[C](val observable: Observable[C]) extends ImplicitObservable[C] {
    override val converter: (C) => String = (doc) => doc.toString
  }

  trait ImplicitObservable[C] {
    val observable: Observable[C]
    val converter: (C) => String

    def results(): Seq[C] = Await.result(observable.toFuture(), Duration(10, TimeUnit.SECONDS))
    def headResult() = Await.result(observable.head(), Duration(10, TimeUnit.SECONDS))

    def printResults(initial: String = ""): Unit = {
      if (initial.length > 0) print(initial)
      results().foreach(res => println(converter(res)))
    }

    def printHeadResult(initial: String = ""): Unit = println(s"${initial}${converter(headResult())}")
  }
}
package example

import org.apache.spark.sql.functions.{col, _}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.util.LongAccumulator
import example.Helpers._
import java.util.Calendar

object StructuredStreamingProgram {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession
      .builder()
      .appName("OSB_Streaming_Model")
      .getOrCreate()

    import spark.implicits._

    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "10.160.172.45:9092, 10.160.172.46:9092, 10.160.172.100:9092")
      .option("subscribe", "TOPIC_WITH_COMP_P2_R2, TOPIC_WITH_COMP_P2_R2.DIT, TOPIC_WITHOUT_COMP_P2_R2.DIT")
      .load()

    val dfs = df.selectExpr("CAST(value AS STRING)").toDF()

    val data = dfs.withColumn("splitted", split($"SERVICE_NAME8", "/"))
      .select($"splitted".getItem(4).alias("region"), $"splitted".getItem(5).alias("service"), col("_raw"))
      .withColumn("service_type", regexp_extract($"service", """.*(Inbound|Outbound|Outound).*""", 1))
      .withColumn("region_type", concat(
        when(col("region").isNotNull, col("region")).otherwise(lit("null")), lit(" "),
        when(col("service").isNotNull, col("service_type")).otherwise(lit("null"))))

    val extractedDF = data.filter(
        col("region").isNotNull &&
        col("service").isNotNull &&
        col("_raw").isNotNull &&
        col("service_type").isNotNull &&
        col("region_type").isNotNull)
      .filter("region != ''")
      .filter("service != ''")
      .filter("_raw != ''")
      .filter("service_type != ''")
      .filter("region_type != ''")

    // sends to MongoDB once every 20 seconds
    val mongodb_uri = "mongodb://dstk8sdev06.us.dell.com/?maxPoolSize=1"
    val mdb_name = "HANZO_MDB"
    val mdb_collection = "Testing_Spark"
    val CountAccum: LongAccumulator = spark.sparkContext.longAccumulator("mongostreamcount")

    val structuredStreamForeachWriter: MongoDBForeachWriter = new MongoDBForeachWriter(mongodb_uri, mdb_name, mdb_collection, CountAccum)

    val query = df.writeStream
      .foreach(structuredStreamForeachWriter)
      .trigger(Trigger.ProcessingTime("20 seconds"))
      .start()

    while (!spark.streams.awaitAnyTermination(60000)) {
      println(Calendar.getInstance().getTime() + " :: mongoEventsCount = " + CountAccum.value)
    }
  }
}
With the necessary corrections to the above, I would need to be able to save the structured streaming data into MongoDB.
You cannot instantiate an abstract class. To resolve this issue, implement the close function in the MongoDBForeachWriter class and make it a concrete class.
import scala.util.Try

class MongoDBForeachWriter(p_uri: String,
                           p_dbName: String,
                           p_collectionName: String,
                           p_messageCountAccum: LongAccumulator) extends ForeachWriter[Row] {

  val mongodbURI = p_uri
  val dbName = p_dbName
  val collectionName = p_collectionName
  val messageCountAccum = p_messageCountAccum

  var mongoClient: MongoClient = null
  var db: MongoDatabase = null
  var collection: MongoCollection[Document] = null

  def ensureMongoDBConnection(): Unit = {
    if (mongoClient == null) {
      mongoClient = MongoClient(mongodbURI)
      db = mongoClient.getDatabase(dbName)
      collection = db.getCollection(collectionName)
    }
  }

  override def open(partitionId: Long, version: Long): Boolean = {
    true
  }

  override def process(record: Row): Unit = {
    val valueStr = new String(record.getAs[Array[Byte]]("value"))

    val doc: Document = Document(valueStr)
    doc += ("log_time" -> Calendar.getInstance().getTime())

    // lazy opening of MongoDB connection
    ensureMongoDBConnection()
    // .results() (from Helpers) blocks until the insert completes; the driver's
    // Observable does nothing unless it is subscribed to or awaited.
    val result = collection.insertOne(doc).results()

    // tracks how many records I have processed
    if (messageCountAccum != null)
      messageCountAccum.add(1)
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (mongoClient != null) {
      Try {
        mongoClient.close()
      }
    }
  }
}
Hope this helps.
Ravi
What I did works fine (at least it looks like it does), but I am not convinced this is the best way to do it...
Basically, I wanted to have my i18n translations in a database instead of property files, so users can easily edit those translations and a cache can serve them to the other users within a short period of time. I use an Akka actor to read from the database and build a cache used by the MessagesApi (before, I always needed to redeploy when the property files changed).
Is what I did totally the wrong way to do it?
TranslationActor.scala :
class TranslationActor extends Actor {

  def receive = {
    case _ => {
      Logger.info("Starting to cache the translations")
      TranslationActor.tempCache = ListMap.empty
      var translations: ListMap[String, String] = ListMap.empty
      for (acceptedLanguage <- TranslationActor.acceptedLanguages) {
        val translationLanguageId: Long = TranslationLanguage.findByCode(acceptedLanguage).get.id
        val languageTranslations: Seq[Translation] = Translation.findAllByLanguageId(translationLanguageId)
        translations = new ListMap[String, String]
        for (languageTranslation <- languageTranslations) {
          val tag = EnglishTranslation.findById(languageTranslation.englishTranslationId).get.tag
          var returnedTranslation: String = languageTranslation.translation
          if (returnedTranslation.isEmpty) {
            returnedTranslation = tag
          }
          translations += tag -> new CacheValue(new Locale(acceptedLanguage), returnedTranslation).stringVar
        }
        TranslationActor.tempCache += acceptedLanguage -> translations
      }
      TranslationActor.cache = TranslationActor.tempCache
      Logger.info("Finished to cache the translations")
    }
  }
}

object TranslationActor {
  var acceptedLanguages: Seq[String] = Seq("fr", "en")
  var cache: ListMap[String, ListMap[String, String]] = ListMap.empty
  var tempCache: ListMap[String, ListMap[String, String]] = ListMap.empty
}

class CacheValue(locale: Locale, string: String) {

  val created: Long = System.currentTimeMillis

  var messageFormat: MessageFormat = null
  var localeVar: Locale = locale
  var stringVar: String = string

  def isOlderThan(period: Long): Boolean = {
    (System.currentTimeMillis - created) > (period * 1000)
  }

  def getMessageFormat: MessageFormat = {
    if (messageFormat == null) {
      if (stringVar != null) {
        messageFormat = new MessageFormat(stringVar, localeVar)
      } else {
        messageFormat = new MessageFormat("", localeVar)
      }
    }
    messageFormat
  }
}
ManageTranslationDaemon.scala :
@Singleton
class ManageTranslationDaemon @Inject() (actorSystem: ActorSystem, applicationLifecycle: ApplicationLifecycle) {

  Logger.info("Scheduling the translation daemon")

  val translationActor = actorSystem.actorOf(Props(new TranslationActor()))
  actorSystem.scheduler.schedule(1 seconds, 30 minutes, translationActor, "translationDaemon")

  applicationLifecycle.addStopHook { () =>
    Logger.info("Shutting down translation daemon")
    Future.successful(actorSystem.shutdown())
  }
}
TranslationGuiceConfiguration.scala (a com.google.inject.AbstractModule):
class TranslationGuiceConfiguration extends AbstractModule {
  def configure(): Unit = {
    bind(classOf[ManageTranslationDaemon]).asEagerSingleton()
  }
}
Then I extended parts of the DefaultMessagesApi (by looking at the code of MessagesApi) in
MessagesPersoApi.scala :
class MessagesPersoApi @Inject() (environment: Environment, configuration: Configuration, langs: Langs)
  extends DefaultMessagesApi(environment: Environment, configuration: Configuration, langs: Langs) {

  private def joinPaths(first: Option[String], second: String) = first match {
    case Some(parent) => new java.io.File(parent, second).getPath
    case None => second
  }

  override protected def loadMessages(langCode: String): Map[String, String] = {
    TranslationActor.cache.getOrElse(langCode, loadMessagesFromFile("messages." + langCode))
  }

  protected def loadMessagesFromFile(langCode: String): Map[String, String] = {
    import scala.collection.JavaConverters._
    environment.classLoader.getResources(joinPaths(messagesPrefix, langCode)).asScala.toList
      .filterNot(url => Resources.isDirectory(environment.classLoader, url)).reverse
      .map { messageFile =>
        Messages.parse(Messages.UrlMessageSource(messageFile), messageFile.toString).fold(e => throw e, identity)
      }.foldLeft(Map.empty[String, String]) { _ ++ _ }
  }

  override protected def loadAllMessages: Map[String, Map[String, String]] = {
    langs.availables.map(_.code).map { lang =>
      (lang, loadMessages(lang))
    }.toMap
      .+("default" -> loadMessagesFromFile("messages"))
      .+("default.play" -> loadMessagesFromFile("messages.default"))
  }
}
And finally created a module (play.api.inject.Module)
MessagesPersoModule.scala :
class MessagesPersoModule extends Module {
  def bindings(environment: Environment, configuration: Configuration) = {
    Seq(
      bind[Langs].to[DefaultLangs],
      bind[MessagesApi].to[MessagesPersoApi]
    )
  }
}
And at the end, used it in my application.conf :
play.modules.disabled += "play.api.i18n.I18nModule"
play.modules.enabled += "modules.MessagesPersoModule"
play.modules.enabled += "modules.TranslationGuiceConfiguration"
Does that actually make sense? It seemed to me that it was a bit "complicated" to write. Is there an easier way to do the same logic with less code/classes?
Thanks,
Yoann
I was trying to code some utilities to bulk load data through HFiles from Spark RDDs.
I was following the pattern of CSVBulkLoadTool from Phoenix. I managed to generate some HFiles and load them into HBase, but I can't see the rows using sqlline (while using the hbase shell it is possible). I would be more than grateful for any suggestions.
BulkPhoenixLoader.scala:
class BulkPhoenixLoader[A <: ImmutableBytesWritable : ClassTag, T <: KeyValue : ClassTag](rdd: RDD[(A, T)]) {

  def createConf(tableName: String, inConf: Option[Configuration] = None): Configuration = {
    val conf = inConf.map(HBaseConfiguration.create).getOrElse(HBaseConfiguration.create())
    val job: Job = Job.getInstance(conf, "Phoenix bulk load")

    job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass(classOf[KeyValue])

    // initialize credentials to possibly run in a secure env
    TableMapReduceUtil.initCredentials(job)

    val htable: HTable = new HTable(conf, tableName)

    // Auto configure partitioner and reducer according to the Main Data table
    HFileOutputFormat2.configureIncrementalLoad(job, htable)
    conf
  }

  def bulkSave(tableName: String, outputPath: String, conf: Option[Configuration]) = {
    val configuration: Configuration = createConf(tableName, conf)
    rdd.saveAsNewAPIHadoopFile(
      outputPath,
      classOf[ImmutableBytesWritable],
      classOf[Put],
      classOf[HFileOutputFormat2],
      configuration)
  }
}
ExtendedProductRDDFunctions.scala:
class ExtendedProductRDDFunctions[A <: scala.Product](data: org.apache.spark.rdd.RDD[A]) extends
ProductRDDFunctions[A](data) with Serializable {
def toHFile(tableName: String,
columns: Seq[String],
conf: Configuration = new Configuration,
zkUrl: Option[String] =
None): RDD[(ImmutableBytesWritable, KeyValue)] = {
val config = ConfigurationUtil.getOutputConfiguration(tableName, columns, zkUrl, Some(conf))
val tableBytes = Bytes.toBytes(tableName)
val encodedColumns = ConfigurationUtil.encodeColumns(config)
val jdbcUrl = zkUrl.map(getJdbcUrl).getOrElse(getJdbcUrl(config))
val conn = DriverManager.getConnection(jdbcUrl)
val query = QueryUtil.constructUpsertStatement(tableName,
columns.toList.asJava,
null)
data.flatMap(x => mapRow(x, jdbcUrl, encodedColumns, tableBytes, query))
}
def mapRow(product: Product,
jdbcUrl: String,
encodedColumns: String,
tableBytes: Array[Byte],
query: String): List[(ImmutableBytesWritable, KeyValue)] = {
val conn = DriverManager.getConnection(jdbcUrl)
val preparedStatement = conn.prepareStatement(query)
val columnsInfo = ConfigurationUtil.decodeColumns(encodedColumns)
columnsInfo.zip(product.productIterator.toList).zipWithIndex.foreach(setInStatement(preparedStatement))
preparedStatement.execute()
val uncommittedDataIterator = PhoenixRuntime.getUncommittedDataIterator(conn, true)
val hRows = uncommittedDataIterator.asScala.filter(kvPair =>
Bytes.compareTo(tableBytes, kvPair.getFirst) == 0
).flatMap(kvPair => kvPair.getSecond.asScala.map(
kv => {
val byteArray = kv.getRowArray.slice(kv.getRowOffset, kv.getRowOffset + kv.getRowLength - 1) :+ 1.toByte
(new ImmutableBytesWritable(byteArray, 0, kv.getRowLength), kv)
}))
conn.rollback()
conn.close()
hRows.toList
}
def setInStatement(statement: PreparedStatement): (((ColumnInfo, Any), Int)) => Unit = {
case ((c, v), i) =>
if (v != null) {
// Both Java and Joda dates used to work in 4.2.3, but now they must be java.sql.Date
val (finalObj, finalType) = v match {
case dt: DateTime => (new Date(dt.getMillis), PDate.INSTANCE.getSqlType)
case d: util.Date => (new Date(d.getTime), PDate.INSTANCE.getSqlType)
case _ => (v, c.getSqlType)
}
statement.setObject(i + 1, finalObj, finalType)
} else {
statement.setNull(i + 1, c.getSqlType)
}
}
private def getIndexTables(conn: Connection, qualifiedTableName: String) : List[(String, String)]
= {
val table: PTable = PhoenixRuntime.getTable(conn, qualifiedTableName)
val tables = table.getIndexes.asScala.map(x => x.getIndexType match {
case IndexType.LOCAL => (x.getTableName.getString, MetaDataUtil.getLocalIndexTableName(qualifiedTableName))
case _ => (x.getTableName.getString, x.getTableName.getString)
}).toList
tables
}
}
The generated HFiles I load with the utility tool from hbase as follows:
hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles path/to/hfile tableName
You could just convert your csv file to an RDD of Product and use the .saveToPhoenix method. This is generally how I load csv data into phoenix.
Please see: https://phoenix.apache.org/phoenix_spark.html
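For reference, a minimal sketch of that approach using the phoenix-spark integration; the table name, column list, CSV path, and ZooKeeper URL below are placeholders:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.phoenix.spark._ // adds saveToPhoenix to RDDs of Product

object CsvToPhoenix {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("csv-to-phoenix"))

    // Parse each CSV line into a tuple (a Product) matching the target columns.
    val rows = sc.textFile("/path/to/data.csv").map { line =>
      val cols = line.split(",").map(_.trim)
      (cols(0).toLong, cols(1), cols(2)) // e.g. (ID, COL1, COL2)
    }

    // Writing through Phoenix's UPSERT path keeps the row encoding consistent
    // with what sqlline expects, unlike hand-built HFiles.
    rows.saveToPhoenix(
      "OUTPUT_TABLE",                    // placeholder table name
      Seq("ID", "COL1", "COL2"),         // columns in tuple order
      zkUrl = Some("phoenix-server:2181"))

    sc.stop()
  }
}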