Insert Spark dataframe into hbase - scala

I have a dataframe and I want to insert it into hbase. I follow this documenation .
This is how my dataframe look like:
--------------------
|id | name | address |
|--------------------|
|23 |marry |france |
|--------------------|
|87 |zied |italie |
--------------------
I create a hbase table using this code:
val tableName = "two"
val conf = HBaseConfiguration.create()
if(!admin.isTableAvailable(tableName)) {
print("-----------------------------------------------------------------------------------------------------------")
val tableDesc = new HTableDescriptor(tableName)
tableDesc.addFamily(new HColumnDescriptor("z1".getBytes()))
admin.createTable(tableDesc)
}else{
print("Table already exists!!--------------------------------------------------------------------------------------")
}
And now how may I insert this dataframe into hbase ?
In another example I succeed to insert into hbase using this code:
val myTable = new HTable(conf, tableName)
for (i <- 0 to 1000) {
var p = new Put(Bytes.toBytes(""+i))
p.add("z1".getBytes(), "name".getBytes(), Bytes.toBytes(""+(i*5)))
p.add("z1".getBytes(), "age".getBytes(), Bytes.toBytes("2017-04-20"))
p.add("z2".getBytes(), "job".getBytes(), Bytes.toBytes(""+i))
p.add("z2".getBytes(), "salary".getBytes(), Bytes.toBytes(""+i))
myTable.put(p)
}
myTable.flushCommits()
But now I am stuck, how to insert each record of my dataframe into my hbase table.
Thank you for your time and attention

An alternate is to look at rdd.saveAsNewAPIHadoopDataset, to insert the data into the hbase table.
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("sparkToHive").enableHiveSupport().getOrCreate()
import spark.implicits._
val config = HBaseConfiguration.create()
config.set("hbase.zookeeper.quorum", "ip's")
config.set("hbase.zookeeper.property.clientPort","2181")
config.set(TableInputFormat.INPUT_TABLE, "tableName")
val newAPIJobConfiguration1 = Job.getInstance(config)
newAPIJobConfiguration1.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "tableName")
newAPIJobConfiguration1.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
val df: DataFrame = Seq(("foo", "1", "foo1"), ("bar", "2", "bar1")).toDF("key", "value1", "value2")
val hbasePuts= df.rdd.map((row: Row) => {
val put = new Put(Bytes.toBytes(row.getString(0)))
put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("value1"), Bytes.toBytes(row.getString(1)))
put.addColumn(Bytes.toBytes("cf2"), Bytes.toBytes("value2"), Bytes.toBytes(row.getString(2)))
(new ImmutableBytesWritable(), put)
})
hbasePuts.saveAsNewAPIHadoopDataset(newAPIJobConfiguration1.getConfiguration())
}
Ref : https://sparkkb.wordpress.com/2015/05/04/save-javardd-to-hbase-using-saveasnewapihadoopdataset-spark-api-java-coding/

Below is a full example using the spark hbase connector from Hortonworks available in Maven.
This example shows
how to check if HBase table is existing
create HBase table if not existing
Insert DataFrame into HBase table
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.hbase.HBaseTableCatalog
object Main extends App {
case class Employee(key: String, fName: String, lName: String, mName: String,
addressLine: String, city: String, state: String, zipCode: String)
// as pre-requisites the table 'employee' with column families 'person' and 'address' should exist
val tableNameString = "default:employee"
val colFamilyPString = "person"
val colFamilyAString = "address"
val tableName = TableName.valueOf(tableNameString)
val colFamilyP = colFamilyPString.getBytes
val colFamilyA = colFamilyAString.getBytes
val hBaseConf = HBaseConfiguration.create()
val connection = ConnectionFactory.createConnection(hBaseConf);
val admin = connection.getAdmin();
println("Check if table 'employee' exists:")
val tableExistsCheck: Boolean = admin.tableExists(tableName)
println(s"Table " + tableName.toString + " exists? " + tableExistsCheck)
if(tableExistsCheck == false) {
println("Create Table employee with column families 'person' and 'address'")
val colFamilyBuild1 = ColumnFamilyDescriptorBuilder.newBuilder(colFamilyP).build()
val colFamilyBuild2 = ColumnFamilyDescriptorBuilder.newBuilder(colFamilyA).build()
val tableDescriptorBuild = TableDescriptorBuilder.newBuilder(tableName)
.setColumnFamily(colFamilyBuild1)
.setColumnFamily(colFamilyBuild2)
.build()
admin.createTable(tableDescriptorBuild)
}
// define schema for the dataframe that should be loaded into HBase
def catalog =
s"""{
|"table":{"namespace":"default","name":"employee"},
|"rowkey":"key",
|"columns":{
|"key":{"cf":"rowkey","col":"key","type":"string"},
|"fName":{"cf":"person","col":"firstName","type":"string"},
|"lName":{"cf":"person","col":"lastName","type":"string"},
|"mName":{"cf":"person","col":"middleName","type":"string"},
|"addressLine":{"cf":"address","col":"addressLine","type":"string"},
|"city":{"cf":"address","col":"city","type":"string"},
|"state":{"cf":"address","col":"state","type":"string"},
|"zipCode":{"cf":"address","col":"zipCode","type":"string"}
|}
|}""".stripMargin
// define some test data
val data = Seq(
Employee("1","Horst","Hans","A","12main","NYC","NY","123"),
Employee("2","Joe","Bill","B","1337ave","LA","CA","456"),
Employee("3","Mohammed","Mohammed","C","1Apple","SanFran","CA","678")
)
// create SparkSession
val spark: SparkSession = SparkSession.builder()
.master("local[*]")
.appName("HBaseConnector")
.getOrCreate()
// serialize data
import spark.implicits._
val df = spark.sparkContext.parallelize(data).toDF
// write dataframe into HBase
df.write.options(
Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "3")) // create 3 regions
.format("org.apache.spark.sql.execution.datasources.hbase")
.save()
}
This worked for me while I had the relevant site-xmls ("core-site.xml", "hbase-site.xml", "hdfs-site.xml") available in my resources.

using answer for code formatting purposes
Doc tells:
sc.parallelize(data).toDF.write.options(
Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
.format("org.apache.hadoop.hbase.spark ")
.save()
where sc.parallelize(data).toDF is your DataFrame. Doc example turns scala collection to dataframe using sc.parallelize(data).toDF
You already have your DataFrame, just try to call
yourDataFrame.write.options(
Map(HBaseTableCatalog.tableCatalog -> catalog, HBaseTableCatalog.newTable -> "5"))
.format("org.apache.hadoop.hbase.spark ")
.save()
And it should work. Doc is pretty clear...
UPD
Given a DataFrame with specified schema, above will create an HBase
table with 5 regions and save the DataFrame inside. Note that if
HBaseTableCatalog.newTable is not specified, the table has to be
pre-created.
It's about data partitioning. Each HBase table can have 1...X regions. You should carefully pick number of regions. Low regions number is bad. High region numbers is also bad.

Related

spark write dataframe with hashMap to postgres as json

I am working with <spark.version>2.2.1</spark.version>
I would like to write a dataframe that has a map field into postgres as json field.
Example code:
import java.util.Properties
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import scala.collection.immutable.HashMap
case class ExampleJson(map: HashMap[String,Long])
object JdbcLoaderJson extends App{
val finalUrl = s"jdbc:postgresql://localhost:54321/development"
val user = "user"
val password = "123456"
val sparkConf = new SparkConf()
sparkConf.setMaster(s"local[2]")
val spark = SparkSession.builder().config(sparkConf).getOrCreate()
def writeWithJson(tableName: String) : Unit = {
def getProperties: Properties = {
val p = new Properties()
val prop = new java.util.Properties
prop.setProperty("user", user)
prop.setProperty("password", password)
prop
}
var schema = "public"
var table = tableName
val asList = List(ExampleJson(HashMap("x" -> 1L, "y" -> 2L)),
ExampleJson(HashMap("y" -> 3L, "z" -> 4L)))
val asDf = spark.createDataFrame(asList)
asDf.show(false)
asDf.write.mode(SaveMode.Overwrite).jdbc(finalUrl, tableName, getProperties)
}
writeWithJson("with_json")
}
Output:
+-------------------+
|map |
+-------------------+
|Map(x -> 1, y -> 2)|
|Map(y -> 3, z -> 4)|
+-------------------+
Exception in thread "main" java.lang.IllegalArgumentException: Can't get JDBC type for map<string,bigint>
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$getJdbcType$2.apply(JdbcUtils.scala:172)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$getJdbcType$2.apply(JdbcUtils.scala:172)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.org$apache$spark$sql$execution$datasources$jdbc$JdbcUtils$$getJdbcType(JdbcUtils.scala:171)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$schemaString$1$$anonfun$23.apply(JdbcUtils.scala:707)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$schemaString$1$$anonfun$23.apply(JdbcUtils.scala:707)
at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
at scala.collection.AbstractMap.getOrElse(Map.scala:59)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$schemaString$1.apply(JdbcUtils.scala:707)
Process finished with exit code 1
I am actually ok with string as well instead of the map, it is more about writing json column to postgres from spark
Convert HashMap data into json string something like below.
asDf
.select(
to_json(struct($"*"))
.as("map")
)
.write
.mode(SaveMode.Overwrite)
.jdbc(finalUrl, tableName, getProperties)

spark Scala RDD to DataFrame Date format

Would you be able to help in this spark prob statement
Data -
empno|ename|designation|manager|hire_date|sal|deptno
7369|SMITH|CLERK|9902|2010-12-17|800.00|20
7499|ALLEN|SALESMAN|9698|2011-02-20|1600.00|30
Code:
val rawrdd = spark.sparkContext.textFile("C:\\Users\\cmohamma\\data\\delta scenarios\\emp_20191010.txt")
val refinedRDD = rawrdd.map( lines => {
val fields = lines.split("\\|") (fields(0).toInt,fields(1),fields(2),fields(3).toInt,fields(4).toDate,fields(5).toFloat,fields(6).toInt)
})
Problem Statement - This is not working -fields(4).toDate , whats is the alternative or what is the usage ?
What i have tried ?
tried replacing it to - to_date(col(fields(4)) , "yyy-MM-dd") - Not working
2.
Step 1.
val refinedRDD = rawrdd.map( lines => {
val fields = lines.split("\\|")
(fields(0),fields(1),fields(2),fields(3),fields(4),fields(5),fields(6))
})
Now this tuples are all strings
Step 2.
mySchema = StructType(StructField(empno,IntegerType,true), StructField(ename,StringType,true), StructField(designation,StringType,true), StructField(manager,IntegerType,true), StructField(hire_date,DateType,true), StructField(sal,DoubleType,true), StructField(deptno,IntegerType,true))
Step 3. converting the string tuples to Rows
val rowRDD = refinedRDD.map(attributes => Row(attributes._1, attributes._2, attributes._3, attributes._4, attributes._5 , attributes._6, attributes._7))
Step 4.
val empDF = spark.createDataFrame(rowRDD, mySchema)
This is also not working and gives error related to types. to solve this i changed the step 1 as
(fields(0).toInt,fields(1),fields(2),fields(3).toInt,fields(4),fields(5).toFloat,fields(6).toInt)
Now this is giving error for the date type column and i am again at the main problem.
Use Case - use textFile Api, convert this to a dataframe using custom schema (StructType) on top of it.
This can be done using the case class but in case class also i would be stuck where i would need to do a fields(4).toDate (i know i can cast string to date later in code but if the above problem solutionis possible)
You can use the following code snippet
import org.apache.spark.sql.functions.to_timestamp
scala> val df = spark.read.format("csv").option("header", "true").option("delimiter", "|").load("gs://otif-etl-input/test.csv")
df: org.apache.spark.sql.DataFrame = [empno: string, ename: string ... 5 more fields]
scala> val ts = to_timestamp($"hire_date", "yyyy-MM-dd")
ts: org.apache.spark.sql.Column = to_timestamp(`hire_date`, 'yyyy-MM-dd')
scala> val enriched_df = df.withColumn("ts", ts).show(2, false)
+-----+-----+-----------+-------+----------+-------+----------+-------------------+
|empno|ename|designation|manager|hire_date |sal |deptno |ts |
+-----+-----+-----------+-------+----------+-------+----------+-------------------+
|7369 |SMITH|CLERK |9902 |2010-12-17|800.00 |20 |2010-12-17 00:00:00|
|7499 |ALLEN|SALESMAN |9698 |2011-02-20|1600.00|30 |2011-02-20 00:00:00|
+-----+-----+-----------+-------+----------+-------+----------+-------------------+
enriched_df: Unit = ()
There are multiple ways to cast your data to proper data types.
First : use InferSchema
val df = spark.read .option("delimiter", "\\|").option("header", true) .option("inferSchema", "true").csv(path)
df.printSchema
Some time it doesn't work as expected. see details here
Second : provide your own Datatype conversion template
val rawDF = Seq(("7369", "SMITH" , "2010-12-17", "800.00"), ("7499", "ALLEN","2011-02-20", "1600.00")).toDF("empno", "ename","hire_date", "sal")
//define schema in DF , hire_date as Date
val schemaDF = Seq(("empno", "INT"), ("ename", "STRING"), (**"hire_date", "date"**) , ("sal", "double")).toDF("columnName", "columnType")
rawDF.printSchema
//fetch schema details
val dataTypes = schemaDF.select("columnName", "columnType")
val listOfElements = dataTypes.collect.map(_.toSeq.toList)
//creating a map friendly template
val validationTemplate = (c: Any, t: Any) => {
val column = c.asInstanceOf[String]
val typ = t.asInstanceOf[String]
col(column).cast(typ)
}
//Apply datatype conversion template on rawDF
val convertedDF = rawDF.select(listOfElements.map(element => validationTemplate(element(0), element(1))): _*)
println("Conversion done!")
convertedDF.show()
convertedDF.printSchema
Third : Case Class
Create schema from caseclass with ScalaReflection and provide this customized schema while loading DF.
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.types._
case class MySchema(empno: int, ename: String, hire_date: Date, sal: Double)
val schema = ScalaReflection.schemaFor[MySchema].dataType.asInstanceOf[StructType]
val rawDF = spark.read.schema(schema).option("header", "true").load(path)
rawDF.printSchema
Hope this will help.

How to create table in mysql database using apache spark

I am trying to create a spark application which is useful to
create, read, write and update MySQL data. So, is there any way to create a MySQL table using Spark?
Below I have a Scala-JDBC code that creates a table in MySQL
database. How can I do this through Spark?
package SparkMysqlJdbcConnectivity
import org.apache.spark.sql.SparkSession
import java.util.Properties
import java.lang.Class
import java.sql.Connection
import java.sql.DriverManager
object MysqlSparkJdbcProgram {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName("MysqlJDBC Connections")
.master("local[*]")
.getOrCreate()
val driver = "com.mysql.jdbc.Driver"
val url = "jdbc:mysql://localhost:3306/world"
val operationtype = "create table"
val tablename = "country"
val tablename2 = "state"
val connectionProperties = new Properties()
connectionProperties.put("user", "root")
connectionProperties.put("password", "root")
val jdbcDf = spark.read.jdbc(url, s"${tablename}", connectionProperties)
operationtype.trim() match {
case "create table" => {
// Class.forName(driver)
try{
val con:Connection = DriverManager.getConnection(url,connectionProperties)
val result = con.prepareStatement(s"create table ${tablename2} (name varchar(255), country varchar(255))").execute()
println(result)
if(result) println("table creation is unsucessful") else println("table creation is unsucessful")
}
}
case "read table" => {
val jdbcDf = spark.read.jdbc("jdbc:mysql://localhost:3306/world", s"${tablename}", connectionProperties)
jdbcDf.show()
}
case "write table" => {}
case "drop table" => {}
}
}
}
The tables will be created automatically when you write the jdbcDf dataframe.
jdbcDf
.write
.jdbc("jdbc:mysql://localhost:3306/world", s"${tablename}", connectionProperties)
In case if you want to specify the table schema,
jdbcDf
.write
.option("createTableColumnTypes", "name VARCHAR(500), col1 VARCHAR(1024), col3 int")
.jdbc("jdbc:mysql://localhost:3306/world", s"${tablename}", connectionProperties)

Spark Structured Stream null in output dataset

i run a scala code which aggregates data and print output to the console. Unfortunately, i got a nulls after group operation. Current output:
|Id |Date | Count |
|null|null | 35471|
I realised, that the bottle neck is the point, when i group data - when i try to use column other than numeric, output returns nulls. Any advice will be welcome - i lost hours to find solution.
My code:
// create schema
val sensorsSchema = new StructType()
.add("SensorId", IntegerType)
.add("Timestamp", TimestampType)
.add("Value", DoubleType)
.add("State", StringType)
// read streaming data from csv...
// aggregate streaming data
val streamAgg = streamIn
.withColumn("Date", to_date(unix_timestamp($"Timestamp", "dd/MM/yyyy").cast(TimestampType)))
.groupBy("SensorId", "Date")
.count()
// write streaming data...
I change the code - now works perfect:
/****************************************
* STREAMING APP
* 1.0 beta
*****************************************
* read data from csv (local)
* and save as parquet (local)
****************************************/
package tk.streaming
import org.apache.spark.SparkConf
import org.apache.spark.sql._
// import org.apache.spark.sql.functions._
case class SensorsSchema(SensorId: Int, Timestamp: String, Value: Double, State: String, OperatorId: Int)
object Runner {
def main(args: Array[String]): Unit = {
// Configuration parameters (to create spark session and contexts)
val appName = "StreamingApp" // app name
val master = "local[*]" // master configuration
val dataDir = "/home/usr_spark/Projects/SparkStreaming/data"
val refreshInterval = 30 // seconds
// initialize context
val conf = new SparkConf().setMaster(master).setAppName(appName)
val spark = SparkSession.builder.config(conf).getOrCreate()
import spark.implicits._
// TODO change file source to Kafka (must)
// read streaming data
val sensorsSchema = Encoders.product[SensorsSchema].schema
val streamIn = spark.readStream
.format("csv")
.schema(sensorsSchema)
.load(dataDir + "/input")
.select("SensorId", "Timestamp", "State", "Value") // remove "OperatorId" column
// TODO save result in S3 (nice to have)
// write streaming data
import org.apache.spark.sql.streaming.Trigger
val streamOut = streamIn.writeStream
.queryName("streamingOutput")
.format("parquet")
.option("checkpointLocation", dataDir + "/output/checkpoint")
.option("path", dataDir + "/output")
.start()
streamOut.awaitTermination() // start streaming data
}
}

Spark: HBase Bulk Load using Scala

We have a text files of 100K records each and we need to read the file line by line and insert it's value into hbase.
The file is '|' delimited.
Sample textFile example:
SLNO|Name|City|Pincode
1|ABC|Pune|400104
2|BMN|Delhi|100065
Each column will have different column family.
We are trying to implement this in Spark-Scala using HBase Bulk load.
We came across this link suggesting bulk load :
http://www.openkb.info/2015/01/how-to-use-scala-on-spark-to-load-data.html
With the below syntax for inserting into single column family.
conf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
val job = Job.getInstance(conf)
job.setMapOutputKeyClass (classOf[ImmutableBytesWritable])
job.setMapOutputValueClass (classOf[KeyValue])
HFileOutputFormat.configureIncrementalLoad (job, table)
// Generate 10 sample data:
val num = sc.parallelize(1 to 10)
val rdd = num.map(x=>{
val kv: KeyValue = new KeyValue(Bytes.toBytes(x), "cf".getBytes(),
"c1".getBytes(), "value_xxx".getBytes() )
(new ImmutableBytesWritable(Bytes.toBytes(x)), kv)
})
// Directly bulk load to Hbase/MapRDB tables.
rdd.saveAsNewAPIHadoopFile("/tmp/xxxx19", classOf[ImmutableBytesWritable],
classOf[KeyValue], classOf[HFileOutputFormat], job.getConfiguration())
Can anyone advice on the bulk load insertion for multi-column family.
Do have a look at rdd.saveAsNewAPIHadoopDataset, to insert the data into the hbase table.
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("sparkToHive").enableHiveSupport().getOrCreate()
import spark.implicits._
val config = HBaseConfiguration.create()
config.set("hbase.zookeeper.quorum", "ip's")
config.set("hbase.zookeeper.property.clientPort","2181")
config.set(TableInputFormat.INPUT_TABLE, "tableName")
val newAPIJobConfiguration1 = Job.getInstance(config)
newAPIJobConfiguration1.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "tableName")
newAPIJobConfiguration1.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
val df: DataFrame = Seq(("foo", "1", "foo1"), ("bar", "2", "bar1")).toDF("key", "value1", "value2")
val hbasePuts= df.rdd.map((row: Row) => {
val put = new Put(Bytes.toBytes(row.getString(0)))
put.addColumn(Bytes.toBytes("cf1"), Bytes.toBytes("value1"), Bytes.toBytes(row.getString(1)))
put.addColumn(Bytes.toBytes("cf2"), Bytes.toBytes("value2"), Bytes.toBytes(row.getString(2)))
(new ImmutableBytesWritable(), put)
})
hbasePuts.saveAsNewAPIHadoopDataset(newAPIJobConfiguration1.getConfiguration())
}
Ref : https://sparkkb.wordpress.com/2015/05/04/save-javardd-to-hbase-using-saveasnewapihadoopdataset-spark-api-java-coding/