I needed a sink to a Postgres DB, so I started to build a custom Flink SinkFunction. Since FlinkKafkaProducer extends TwoPhaseCommitSinkFunction, I decided to do the same. As stated in O'Reilly's book Stream Processing with Apache Flink, you just need to implement the abstract methods, enable checkpointing, and you're good to go. But what really happens when I run my code is that the commit method is called only once, and it is called before invoke, which is totally unexpected, since you shouldn't be ready to commit if your set of ready-to-commit transactions is empty. And worse: after committing, invoke is called for all of the transaction lines present in my file, and then abort is called, which is even more unexpected.
When the sink is initialized, it is my understanding that the following should occur:
beginTransaction is called and returns a transaction identifier that is passed to invoke
invoke adds the incoming lines to the transaction matching the identifier received
preCommit makes all final modifications on the current transaction data
commit handles the finalized transaction of pre-committed data
So, I can't see why my program doesn't show this behaviour.
Here goes my sink code:
package PostgresConnector
import java.sql.{BatchUpdateException, DriverManager, PreparedStatement, SQLException, Timestamp}
import java.text.ParseException
import java.util.{Date, Properties, UUID}
import org.apache.flink.api.common.ExecutionConfig
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{SinkFunction, TwoPhaseCommitSinkFunction}
import org.apache.flink.streaming.api.scala._
import org.slf4j.{Logger, LoggerFactory}
class PostgreSink(props : Properties, config : ExecutionConfig)
  extends TwoPhaseCommitSinkFunction[(String,String,String,String),String,String](
    createTypeInformation[String].createSerializer(config),
    createTypeInformation[String].createSerializer(config)) {

  private var transactionMap : Map[String,Array[(String,String,String,String)]] = Map()

  private var parsedQuery : PreparedStatement = _

  private val insertionString : String = "INSERT INTO mydb (field1,field2,point) values (?,?,point(?,?))"

  override def invoke(transaction: String, value: (String,String,String,String), context: SinkFunction.Context[_]): Unit = {
    val LOG = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
    val res = this.transactionMap.get(transaction)
    if(res.isDefined){
      var array = res.get
      array = array ++ Array(value)
      this.transactionMap += (transaction -> array)
    }else{
      val array = Array(value)
      this.transactionMap += (transaction -> array)
    }
    LOG.info("\n\nPassing through invoke\n\n")
    ()
  }

  override def beginTransaction(): String = {
    val LOG: Logger = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
    val identifier = UUID.randomUUID.toString
    LOG.info("\n\nPassing through beginTransaction\n\n")
    identifier
  }

  override def preCommit(transaction: String): Unit = {
    val LOG = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
    try{
      val tuple : Option[Array[(String,String,String,String)]] = this.transactionMap.get(transaction)
      if(tuple.isDefined){
        tuple.get.foreach( (value : (String,String,String,String)) => {
          LOG.info("\n\n"+value.toString()+"\n\n")
          this.parsedQuery.setString(1,value._1)
          this.parsedQuery.setString(2,value._2)
          this.parsedQuery.setString(3,value._3)
          this.parsedQuery.setString(4,value._4)
          this.parsedQuery.addBatch()
        })
      }
    }catch{
      case e : SQLException =>
        LOG.info("\n\nError when adding transaction to batch: SQLException\n\n")
      case f : ParseException =>
        LOG.info("\n\nError when adding transaction to batch: ParseException\n\n")
      case g : NoSuchElementException =>
        LOG.info("\n\nError when adding transaction to batch: NoSuchElementException\n\n")
      case h : Exception =>
        LOG.info("\n\nError when adding transaction to batch: Exception\n\n")
    }
    this.transactionMap = this.transactionMap.empty
    LOG.info("\n\nPassing through preCommit...\n\n")
  }

  override def commit(transaction: String): Unit = {
    val LOG : Logger = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
    if(this.parsedQuery != null) {
      LOG.info("\n\n" + this.parsedQuery.toString + "\n\n")
    }
    try{
      this.parsedQuery.executeBatch
      val LOG : Logger = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
      LOG.info("\n\nExecuting batch\n\n")
    }catch{
      case e : SQLException =>
        val LOG : Logger = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
        LOG.info("\n\n"+"Error : SQLException"+"\n\n")
    }
    this.transactionMap = this.transactionMap.empty
    LOG.info("\n\nPassing through commit...\n\n")
  }

  override def abort(transaction: String): Unit = {
    val LOG : Logger = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
    this.transactionMap = this.transactionMap.empty
    LOG.info("\n\nPassing through abort...\n\n")
  }

  override def open(parameters: Configuration): Unit = {
    val LOG: Logger = LoggerFactory.getLogger(classOf[FlinkCEPClasses.FlinkCEPPipeline])
    val driver = props.getProperty("driver")
    val url = props.getProperty("url")
    val user = props.getProperty("user")
    val password = props.getProperty("password")
    Class.forName(driver)
    val connection = DriverManager.getConnection(url + "?user=" + user + "&password=" + password)
    this.parsedQuery = connection.prepareStatement(insertionString)
    LOG.info("\n\nConfiguring DB connection parameters\n\n")
  }
}
And this is my main program:
package FlinkCEPClasses
import PostgresConnector.PostgreSink
import org.apache.flink.api.java.io.TextInputFormat
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.cep.PatternSelectFunction
import org.apache.flink.cep.pattern.conditions.SimpleCondition
import org.apache.flink.cep.scala.pattern.Pattern
import org.apache.flink.core.fs.{FileSystem, Path}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.cep.scala.{CEP, PatternStream}
import org.apache.flink.streaming.api.functions.source.FileProcessingMode
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import java.util.Properties
import org.apache.flink.api.common.ExecutionConfig
import org.slf4j.{Logger, LoggerFactory}
class FlinkCEPPipeline {

  val LOG: Logger = LoggerFactory.getLogger(classOf[FlinkCEPPipeline])
  LOG.info("\n\nStarting the pipeline...\n\n")

  var env : StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
  env.enableCheckpointing(10)
  env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)
  env.setParallelism(1)

  //var input : DataStream[String] = env.readFile(new TextInputFormat(new Path("/home/luca/Desktop/lines")),"/home/luca/Desktop/lines",FileProcessingMode.PROCESS_CONTINUOUSLY,1)
  var input : DataStream[String] = env.readTextFile("/home/luca/Desktop/lines").name("Raw stream")

  var tupleStream : DataStream[(String,String,String,String)] = input.map(new S2PMapFunction()).name("Tuple Stream")

  var properties : Properties = new Properties()
  properties.setProperty("driver","org.postgresql.Driver")
  properties.setProperty("url","jdbc:postgresql://localhost:5432/mydb")
  properties.setProperty("user","luca")
  properties.setProperty("password","root")

  tupleStream.addSink(new PostgreSink(properties,env.getConfig)).name("Postgres Sink").setParallelism(1)
  tupleStream.writeAsText("/home/luca/Desktop/output",FileSystem.WriteMode.OVERWRITE).name("File Sink").setParallelism(1)

  env.execute()
}
My S2PMapFunction code:
package FlinkCEPClasses
import org.apache.flink.api.common.functions.MapFunction
case class S2PMapFunction() extends MapFunction[String,(String,String,String,String)] {

  override def map(value: String): (String, String, String, String) = {
    var tuple = value.replaceAllLiterally("(","").replaceAllLiterally(")","").split(',')
    (tuple(0),tuple(1),tuple(2),tuple(3))
  }
}
My pipeline works like this: I read lines from a file, map them to a tuple of strings, and use the data inside the tuples to save them into a Postgres DB.
If you want to simulate the data, just create a file with lines in a format like this:
(field1,field2,pointx,pointy)
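If it helps, a throwaway snippet to generate such a file could look like this; the path matches the one used in the pipeline, and the number of lines and field values are just examples, not data from my actual setup:

import java.io.PrintWriter

object GenerateSampleLines extends App {
  // writes 18 sample lines in the (field1,field2,pointx,pointy) format
  val out = new PrintWriter("/home/luca/Desktop/lines")
  (1 to 18).foreach(i => out.println(s"(field1_$i,field2_$i,${i * 1.0},${i * 2.0})"))
  out.close()
}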
Edit
The execution order of the TwoPhaseCommitSinkFunction's methods is the following:
Starting pipeline...
beginTransaction
preCommit
beginTransaction
commit
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
invoke
abort
I'm not an expert on this topic, but a couple of guesses:
preCommit is called whenever Flink begins a checkpoint, and commit is called when the checkpoint is complete. These methods are called simply because checkpointing is happening, regardless of whether the sink has received any data.
Checkpointing is happening periodically, regardless of whether any data is flowing through your pipeline. Given your very short checkpointing interval (10 msec), it does seem plausible that the first checkpoint barrier will reach the sink before the source has managed to send it any data.
It also looks like you are assuming that only one transaction will be open at a time. I'm not sure that's strictly guaranteed, but so long as maxConcurrentCheckpoints is 1 (which is the default), you should be okay.
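If you want to rule out the timing issue, here is a minimal sketch of a less aggressive checkpoint configuration (env is the StreamExecutionEnvironment from your pipeline; the 10-second interval is only an example, not a recommendation specific to your job):

// give the source time to emit records before the first checkpoint barrier reaches the sink
env.enableCheckpointing(10000)                           // every 10 s instead of every 10 ms
env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)   // at most one open transaction at a time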
So, here goes the "answer" for this question. Just to be clear: at this moment, the problem about the TwoPhaseCommitSinkFunction hasn't been solved yet. If what you're looking for is about the original problem, then you should look for another answer. If you don't care about what you'll use as a sink, then maybe I can help you with that.
As suggested by @DavidAnderson, I started to study the Table API to see if it could solve my problem, which was using Flink to insert lines into my database table.
It turned out to be really simple, as you'll see.
Note: beware of the version you are using. My Flink version is 1.9.0.
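For reference, a sketch of the sbt dependencies this setup roughly corresponds to (assuming Scala 2.11 and Flink 1.9.0; double-check the artifact names against your own build, since the JDBC and Table modules were renamed in later Flink versions):

// build.sbt (sketch only)
libraryDependencies ++= Seq(
  "org.apache.flink" %% "flink-scala"                  % "1.9.0",
  "org.apache.flink" %% "flink-streaming-scala"        % "1.9.0",
  "org.apache.flink" %% "flink-table-api-scala-bridge" % "1.9.0",
  "org.apache.flink" %% "flink-table-planner-blink"    % "1.9.0",
  "org.apache.flink" %% "flink-jdbc"                   % "1.9.0",  // provides JDBCAppendTableSink
  "org.postgresql"    % "postgresql"                   % "42.2.5"
)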
Source code
package FlinkCEPClasses
import java.sql.Timestamp
import java.util.Properties
import org.apache.flink.api.common.typeinfo.{TypeInformation, Types}
import org.apache.flink.api.java.io.jdbc.JDBCAppendTableSink
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.table.api.{EnvironmentSettings, Table}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.sinks.TableSink
import org.postgresql.Driver
class TableAPIPipeline {

  // --- normal pipeline initialization in this block ---
  var env : StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
  env.enableCheckpointing(10)
  env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)
  env.setParallelism(1)

  var input : DataStream[String] = env.readTextFile("/home/luca/Desktop/lines").name("Original stream")

  var tupleStream : DataStream[(String,Timestamp,Double,Double)] = input.map(new S2PlacaMapFunction()).name("Tuple Stream")

  var properties : Properties = new Properties()
  properties.setProperty("driver","org.postgresql.Driver")
  properties.setProperty("url","jdbc:postgresql://localhost:5432/mydb")
  properties.setProperty("user","myuser")
  properties.setProperty("password","mypassword")
  // --- normal pipeline initialization in this block END ---

  // These two lines create what Flink calls a StreamTableEnvironment.
  // It seems pretty similar to a normal stream initialization.
  val settings = EnvironmentSettings.newInstance().useBlinkPlanner().inStreamingMode().build()
  val tableEnv = StreamTableEnvironment.create(env,settings)

  // Since I wanted to sink data into a database, I used the JDBC TableSink,
  // because it is very intuitive and an exact match for my need. You may
  // look for other TableSink classes that fit better in your solution.
  var tableSink : JDBCAppendTableSink = JDBCAppendTableSink.builder()
    .setBatchSize(1)
    .setDBUrl("jdbc:postgresql://localhost:5432/mydb")
    .setDrivername("org.postgresql.Driver")
    .setPassword("mypassword")
    .setUsername("myuser")
    .setQuery("INSERT INTO mytable (data1,data2,data3) VALUES (?,?,point(?,?))")
    .setParameterTypes(Types.STRING,Types.SQL_TIMESTAMP,Types.DOUBLE,Types.DOUBLE)
    .build()

  val fieldNames = Array("data1","data2","data3","data4")
  val fieldTypes = Array[TypeInformation[_]](Types.STRING,Types.SQL_TIMESTAMP,Types.DOUBLE, Types.DOUBLE)

  // This is the crucial part of the code: first, you need to register
  // your table sink, passing the name, the field names, the field types and
  // the TableSink object.
  tableEnv.registerTableSink("postgres-table-sink",
    fieldNames,
    fieldTypes,
    tableSink
  )

  // Then, you transform your DataStream into a Table object.
  var table = tableEnv.fromDataStream(tupleStream)

  // Finally, you insert your stream data into the registered sink.
  table.insertInto("postgres-table-sink")

  env.execute()
}
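One thing to double-check: the Table you insert must be compatible with the fieldNames and fieldTypes registered for the sink. A hedged variant of the conversion step that names the columns explicitly (this assumes the Flink 1.9 Scala expression DSL imported via org.apache.flink.table.api.scala._):

import org.apache.flink.table.api.scala._  // brings the 'symbol-to-expression conversions

// name the columns so they line up with the fieldNames registered above
var table = tableEnv.fromDataStream(tupleStream, 'data1, 'data2, 'data3, 'data4)
table.insertInto("postgres-table-sink")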
I am trying to insert data into MongoDB using Play-scala and ReactiveMongo.
Here is my DbimpService.scala:
package services
import models.Post
import reactivemongo.bson.BSONDocument
import reactivemongo.api.MongoDriver
import reactivemongo.api.collections.bson.BSONCollection
import scala.concurrent.ExecutionContext
import javax.inject.Inject
import play.api.libs.json.Json
import reactivemongo.play.json.collection.JSONCollection
import reactivemongo.api.commands.WriteResult
import scala.concurrent.Future
import org.apache.xerces.util.DatatypeMessageFormatter
class Dbimpservice @Inject() (implicit ec:ExecutionContext) extends Dbservice {

  def create(p:Post):String = {
    var status = "Not Saved"
    val driver = new MongoDriver
    val connection = driver.connection(List("localhost"))
    val db = connection("application")
    val collection = db[BSONCollection]("post")
    val futureList = collection.insert[Post](p)
    futureList.onComplete { case sucess => println(sucess) }
    return status
  }
}
Here is my HomeController.scala:
package controllers
import javax.inject._
import play.api._
import play.api.mvc._
import models._
import scala.util.{ Failure, Success }
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
import reactivemongo.api.{ MongoDriver, MongoConnection }
import reactivemongo.play.json.collection.JSONCollection
import reactivemongo.bson.BSONDocument
import reactivemongo.api.commands.WriteResult
import reactivemongo.api.collections.bson.BSONCollection
import play.api.libs.json.Json
import services.Dbservice
import services.Dbimpservice
import services.Dbservice
import scala.concurrent.ExecutionContext
import scala.concurrent.Await
import scala.concurrent.duration.Duration
/**
 * This controller creates an `Action` to handle HTTP requests to the
 * application's home page.
 */
@Singleton
class HomeController @Inject() (implicit ec:ExecutionContext, val Dbservice : Dbimpservice) extends Controller {

  /**
   * Create an Action to render an HTML page with a welcome message.
   * The configuration in the `routes` file means that this method
   * will be called when the application receives a `GET` request with
   * a path of `/`.
   */
  def index = Action {
    Ok("Hai")
  }

  def read = Action.async {
    val query = BSONDocument()
    val driver = new MongoDriver
    val connection = driver.connection(List("localhost:27017"))
    val db = connection("application")
    val collection = db[BSONCollection]("post")
    val futureList = collection.find(query).cursor[Post]().collect[List]()
    futureList.map { list =>
      Ok(list.toString())
    }
  }

  def create = Action(BodyParsers.parse.json) { request =>
    val personResult = request.body.validate[Post]
    personResult.fold(
      errors => {
        BadRequest(Json.obj("status " -> "ERROR"))
      },
      valid = fun
    )
  }

  def fun:Post => Result = { post =>
    var ans = Dbservice.create(post)
    Ok(ans)
  }
}
I am trying to insert the data, but nothing gets inserted, and the error I am getting is:
Failure(reactivemongo.core.errors.ConnectionNotInitialized: MongoError['Connection is missing metadata (like protocol version, etc.) The connection pool is probably being initialized.'])
Can someone please help me? I even referred to this link:
http://stackoverflow.com/questions/31456517/embedmongo-with-reactivemongo-process-does-not-exit
but it did not help.
Guessing that you are using a recent version of ReactiveMongo (0.11.7+), you are using deprecated DB resolution code (connection(dbName), aka connection.apply(dbName)).
See also
You need to use the asynchronous DB resolution, which benefits from the failover handling (to cope with possible network latency/incidents). The following code must therefore be refactored.
val db = connection("application")
val collection = db[BSONCollection]("post")
val futureList = collection.insert[Post](p)
Using the new DB resolution:
for {
  db <- connection.database("application")
  collection = db("post")
  res <- collection.insert(p)
} yield res
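Applied to the create method from the question, a sketch could look like the following (assuming an implicit BSONDocumentWriter[Post] is in scope for the Post model from the question, and creating the driver/connection once instead of on every call; the method now returns a Future instead of a plain String):

import models.Post
import reactivemongo.api.MongoDriver
import reactivemongo.api.collections.bson.BSONCollection
import reactivemongo.api.commands.WriteResult
import scala.concurrent.{ExecutionContext, Future}

class Dbimpservice(implicit ec: ExecutionContext) {

  // created once and reused, instead of on every call to create
  private val driver = new MongoDriver
  private val connection = driver.connection(List("localhost"))

  // assumes an implicit BSONDocumentWriter[Post] is available
  def create(p: Post): Future[WriteResult] =
    for {
      db  <- connection.database("application")
      res <- db.collection[BSONCollection]("post").insert(p)
    } yield res
}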
I have the following code and I'm trying to connect to the MySQL database without success.
cat Database.scala
package com.github.odnanref.EmailFilter
import slick.driver.MySQLDriver._
import slick.driver.MySQLDriver.backend.Database
/**
 * Created by andref on 12/05/16.
 */
class Database {

  val url = "jdbc:mysql://localhost/playdb"
  val db = Database.forURL(url, driver = "com.mysql.jdbc.Driver")

  override def finalize() {
    db.close()
    super.finalize()
  }
}
cat EmailMessageTable.scala
package com.github.odnanref.EmailFilter
import java.sql.Timestamp
import slick.driver.JdbcProfile
import slick.driver.MySQLDriver.api._
import scala.concurrent.Future
class EmailMessageTable(tag: Tag) extends Table[EmailMessage](tag, "email_message") {

  def id = column[Option[Long]]("id", O.AutoInc, O.PrimaryKey)
  def email = column[String]("email")
  def subject = column[String]("subject")
  def body = column[String]("body")
  def datain = column[Timestamp]("datain")
  def email_id = column[Long]("email_id")

  def * = (id, email, subject, body, datain, email_id) <> ((EmailMessage.apply _).tupled, EmailMessage.unapply)

  def ? = (id.get.?, email.?, subject.?, body.?, datain.?).shaped.<>({ r => import r._; _1.map(_ =>
    EmailMessage.tupled((_1, _2.get, _3.get, _4.get, _5.get))) }, (_: Any) =>
    throw new Exception("Inserting into ? projection not supported."))
}
I can't initialize the database and execute search queries or insert statements. Based on this code, I try to do:
val db = new Database()
db.db.run(TableQuery[EmailMessageTable] += EmailMessage(...) )
And it says it doesn't know the method +=.
Also I get this error:
Database.scala:4: imported `Database' is permanently hidden by definition of class Database in package EmailFilter
[warn] import slick.driver.MySQLDriver.backend.Database
What am I doing wrong?
Edit: the EmailMessage case class:
package com.github.odnanref.EmailFilter
import java.sql.Timestamp
case class EmailMessage(
  id: Option[Long],
  email: String,
  subject: String,
  body: String,
  datain: Timestamp,
  email_id: Long
)
You are importing a class named Database inside a file that defines another class with the same name. You can:
rename your Database class:
class MyDatabase {
  val url = ...
  val db = ...
  ...
}
rename the imported class:
import slick.driver.MySQLDriver.backend.{Database => SlickDB}
...
val db = SlickDB.forURL(url, driver = "com.mysql.jdbc.Driver")
avoid importing Database explicitly:
import slick.driver.MySQLDriver.backend
...
val db = backend.Database.forURL(url, driver = "com.mysql.jdbc.Driver")
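For completeness, a sketch of Database.scala after applying the second option (same behaviour as the code in the question, only the import is renamed):

package com.github.odnanref.EmailFilter

import slick.driver.MySQLDriver.backend.{Database => SlickDB}

class Database {

  val url = "jdbc:mysql://localhost/playdb"
  val db = SlickDB.forURL(url, driver = "com.mysql.jdbc.Driver")

  override def finalize() {
    db.close()
    super.finalize()
  }
}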
I followed the documentation of Slick 3.0.0-RC1, using Typesafe Config as database connection configuration. Here is my conf:
database = {
  driver = "org.postgresql.Driver"
  url = "jdbc:postgresql://localhost:5432/postgre"
  user = "postgre"
}
I created a file Locale.scala as follows:
package models

import slick.driver.PostgresDriver.api._
import scala.concurrent.Future

case class Locale(id: String, name: String)

class Locales(tag: Tag) extends Table[Locale](tag, "LOCALES") {
  def id = column[String]("ID", O.PrimaryKey)
  def name = column[String]("NAME")
  def * = (id, name) <> (Locale.tupled, Locale.unapply)
}

object Locales {
  private val locales = TableQuery[Locales]

  val db = Database.forConfig("database")

  def count: Future[Int] =
    try db.run(locales.length.result)
    finally db.close
}
Then I got confused about when and where the proper place is to create the Database object using
val db = Database.forConfig("database")
If I create db like this, there will be as many Database objects as there are models. So what is the best practice to make this work?
You can create an object DBLocator and load it using the lazy keyword so that it is loaded only on demand.
You can always invoke the method defined in the DBLocator object to get an instance of a Session.
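A minimal sketch of that idea, assuming Slick 3.0 and the "database" configuration key from the question (DBLocator is just the suggested name, not a Slick class):

import slick.driver.PostgresDriver.api._

object DBLocator {
  // created lazily on first access and shared by all models
  lazy val db: Database = Database.forConfig("database")
}

// usage inside a model, e.g. Locales:
// def count: Future[Int] = DBLocator.db.run(locales.length.result)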
using scala, slick 2.0 & eclipse I have an error I can't explain : "value ddl is not a member of scala.slick.lifted.TableQuery[SqliteSpec.this.Personnes]"
here is the code:
I declare a trait like this :
trait sqlite {

  val db = Database.forURL("jdbc:sqlite:rdvs.txt", driver = "org.sqlite.JDBC")

  class Personnes(tag: Tag) extends Table[Rdv](tag, "RDV") {
    def id = column[Int]("ID", O.PrimaryKey, O.AutoInc)
    def nom = column[String]("NOM", O.NotNull)
    def prénom = column[String]("PRENOM")
    def sexe = column[Int]("SEXE")
    def télPortable = column[String]("TELPOR")
    def télBureau = column[String]("TELBUR")
    def télPrivé = column[String]("TELPRI")
    def siteRDV = column[String]("SITE")
    def typeRDV = column[String]("TYPE")
    def libelléRDV = column[String]("LIBELLE")
    def numRDV = column[String]("NUMRDV")
    def étape = column[String]("ETAPE")
    def dateRDV = column[Date]("DATE")
    def heureRDVString = column[String]("HEURE")
    def statut = column[String]("STATUT")
    def orderId = column[String]("ORDERID")

    def * = (id.?, nom, prénom, sexe, télPortable, télBureau, télPrivé,
      siteRDV, typeRDV, libelléRDV, numRDV, étape, dateRDV, heureRDVString,
      statut, orderId) <> (Rdv.tupled, Rdv.unapply _)
  }
}
And here is the code that fails:
db.withDynSession {
  val personnes = TableQuery[Personnes]
  personnes.ddl.create
}
although I followed this official tutorial: http://slick.typesafe.com/doc/2.0.0/schemas.html (section DDL).
Do you know what's wrong?
Thanks.
Maybe this is useful for somebody: I had the same problem, but my mistake was importing different drivers' simple implicits. My main model class had Postgres's, but my tests had H2's (in order to do in-memory integration testing). Switching to the same driver solved the issue.
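To illustrate: ddl only shows up on a TableQuery when one driver's simple._ implicits are in scope, and it has to be the same driver everywhere the table is used. A hedged sketch, reusing db and Personnes from the sqlite trait in the question:

// Wrong: mixing implicits from different drivers (or importing only
// scala.slick.lifted.TableQuery) leaves ddl unresolved.
// import scala.slick.driver.PostgresDriver.simple._   // in the model
// import scala.slick.driver.H2Driver.simple._         // in the test

// Right: one driver's simple._ import on both sides.
import scala.slick.driver.SQLiteDriver.simple._

db.withSession { implicit session =>
  val personnes = TableQuery[Personnes]
  personnes.ddl.create
}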
I would like to add the entire code, but the "edit" button is no longer present under my question; here is the code:
package tests {

  @RunWith(classOf[JUnitRunner])
  class SqliteSpec extends Specification with sqlite {

    "la base sqlite" should {
      "create a new database file" in new Sqlite_avant_chaque_test {
        todo
      }
    }

    class Sqlite_avant_chaque_test extends Scope {
      println("avant test")
      def abc = {
        db.withDynSession {
          val personnes = TableQuery[Personnes]
          personnes.ddl.create
        }
      }
    }
  }
}
OK, I only needed to log in; here is the working code:
import org.specs2.execute.AsResult
import org.specs2.runner.JUnitRunner
import scala.collection.GenTraversableOnce
import org.specs2.time.TimeConversions$longAsTime
import org.specs2.collection.BiMap
import org.specs2.control.Debug$Debuggable
import scala.collection.mutable.ListBuffer
import org.specs2.internal.scalaz.TreeLoc
import org.specs2.mutable.Specification
import scala.xml.NodeSeq
import scala.collection.immutable.List
import org.specs2.text.LinesContent
import org.specs2.specification.Fragments
import scala.math.Numeric
import java.net.URL
import org.specs2.matcher.MatchSuccess
import scala.runtime.Nothing$
import scala.reflect.ClassTag
import org.specs2.main.Arguments
import java.io.InputStream
import org.specs2.data.Sized
import org.junit.runner.RunWith
import org.specs2.specification.Scope
import java.sql.SQLInvalidAuthorizationSpecException
import models.sqlite
import scala.slick.lifted.TableQuery
//import scala.slick.driver.JdbcDriver.simple._
import scala.slick.driver.SQLiteDriver.simple._
import play.api.test.Helpers
import play.test.Helpers
package tests {

  @RunWith(classOf[JUnitRunner])
  class SqliteSpec extends Specification with sqlite {

    sequential

    "la base sqlite" should {
      "create a new database file" in new Sqlite_avant_chaque_test {
        todo
      }
    }

    class Sqlite_avant_chaque_test extends Scope {
      println("avant test")
      //db.withDynSession {
      db.withSession { implicit session: Session =>
        val personnes = TableQuery[Personnes]
        personnes.ddl.create
        println("avant tests, après création base")
      }
    }
  }
}
}