Spline Spark agent jar has errors during post processing - apache-atlas

I have been trying to run the following code with the new Spline jar za.co.absa.spline.agent.spark:spark-3.0-spline-agent-bundle_2.12:0.6.0, but I keep getting errors specific to UserExtraMetadataProvider, which has been deprecated in the newer versions. I have also tried replacing UserExtraMetadataProvider with UserExtraAppendingPostProcessingFilter, using the code shown below the first code block, but I still get the errors. Can you please validate this and share how to properly write the post-processing filter code using the new Spline bundle?
%scala
import za.co.absa.spline.harvester.conf.StandardSplineConfigurationStack
import za.co.absa.spline.harvester.extra.UserExtraMetadataProvider
import za.co.absa.spline.harvester.HarvestingContext
import org.apache.commons.configuration.Configuration
import za.co.absa.spline.harvester.SparkLineageInitializer._
import za.co.absa.spline.harvester.conf.DefaultSplineConfigurer
import za.co.absa.spline.producer.model._
import scala.util.parsing.json.JSON
val splineConf: Configuration = StandardSplineConfigurationStack(spark)
spark.enableLineageTracking(new DefaultSplineConfigurer(splineConf) {
  //override protected def userExtraMetadataProvider = new UserExtraMetadataProvider {
  //val test = dbutils.notebook.getContext.notebookPath
  val notebookInformationJson = dbutils.notebook.getContext.toJson
  val outerMap = JSON.parseFull(notebookInformationJson).getOrElse(0).asInstanceOf[Map[String, String]]
  val tagMap = outerMap("tags").asInstanceOf[Map[String, String]]
  val extraContextMap = outerMap("extraContext").asInstanceOf[Map[String, String]]
  val notebookPath = extraContextMap("notebook_path").split("/")
  val notebookURL = tagMap("browserHostName") + "/?o=" + tagMap("orgId") + tagMap("browserHash")
  val user = tagMap("user")
  val name = notebookPath(notebookPath.size - 1)

  val notebookInfo = Map(
    "notebookURL" -> notebookURL,
    "user" -> user,
    "name" -> name,
    "mounts" -> dbutils.fs.ls("/mnt").map(_.path),
    "timestamp" -> System.currentTimeMillis)
  val notebookInfoJson = scala.util.parsing.json.JSONObject(notebookInfo)

  override protected def userExtraMetadataProvider: UserExtraMetadataProvider = new UserExtraMetadataProvider {
    override def forExecEvent(event: ExecutionEvent, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar1")
    override def forExecPlan(plan: ExecutionPlan, ctx: HarvestingContext): Map[String, Any] = Map("notebookInfo" -> notebookInfoJson) // add mount info to searchAndReplace; this function contains the info
    override def forOperation(op: ReadOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar3")
    override def forOperation(op: WriteOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar4")
    override def forOperation(op: DataOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar5")
  }
})
Here is the updated code, which is still producing errors:
%scala
import za.co.absa.spline.harvester.conf.StandardSplineConfigurationStack
import za.co.absa.spline.harvester.extra.UserExtraMetadataProvider
import za.co.absa.spline.harvester.HarvestingContext
import org.apache.commons.configuration.Configuration
import za.co.absa.spline.harvester.SparkLineageInitializer._
import za.co.absa.spline.harvester.conf.DefaultSplineConfigurer
import za.co.absa.spline.producer.model._
import play.api.libs.json._
val splineConf: Configuration = StandardSplineConfigurationStack(spark)
spark.enableLineageTracking(new DefaultSplineConfigurer(splineConf) {
  val notebookInformationJson = Json.toJson(dbutils.notebook.getContext)
  val outerMap = Json.toJson(notebookInformationJson).getOrElse(0).asInstanceOf[Map[String, String]]
  val tagMap = outerMap("tags").asInstanceOf[Map[String, String]]
  val extraContextMap = outerMap("extraContext").asInstanceOf[Map[String, String]]
  val notebookPath = extraContextMap("notebook_path").split("/")
  val notebookURL = tagMap("browserHostName") + "/?o=" + tagMap("orgId") + tagMap("browserHash")
  val user = tagMap("user")
  val name = notebookPath(notebookPath.size - 1)

  val notebookInfo = Map(
    "notebookURL" -> Json.toJson(notebookURL),
    "user" -> Json.toJson(user),
    "name" -> Json.toJson(name),
    "mounts" -> Json.toJson(dbutils.fs.ls("/mnt").map(_.path)),
    "timestamp" -> Json.toJson(System.currentTimeMillis))
  val notebookInfoJson = Json.toJson(notebookInfo)

  def userExtraMetadataProvider: UserExtraAppendingPostProcessingFilter =
    new UserExtraAppendingPostProcessingFilter
    {
      def processExecutionEvent(event: ExecutionEvent, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar1")
      def processExecutionPlan(plan: ExecutionPlan, ctx: HarvestingContext): Map[String, Any] = Map("notebookInfo" -> notebookInfoJson)
      def processReadOperation(op: ReadOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar3")
      def processWriteOperation(op: WriteOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar4")
      def processDataOperation(op: DataOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar5")
    }
})
Here is the error:
command-2044409137370707:12: error: not enough arguments for constructor DefaultSplineConfigurer: (sparkSession: org.apache.spark.sql.SparkSession, userConfiguration: org.apache.commons.configuration.Configuration)za.co.absa.spline.harvester.conf.DefaultSplineConfigurer.
Unspecified value parameter userConfiguration.
spark.enableLineageTracking(new DefaultSplineConfigurer(splineConf) {
^
command-2044409137370707:32: error: not found: type UserExtraAppendingPostProcessingFilter
def userExtraMetadataProvider: UserExtraAppendingPostProcessingFilter
^
command-2044409137370707:33: error: not found: type UserExtraAppendingPostProcessingFilter
= new UserExtraAppendingPostProcessingFilter
^
command-2044409137370707:37: error: not found: type ExecutionEvent
def processExecutionEvent(event: ExecutionEvent, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar1")
^
command-2044409137370707:38: error: not found: type ExecutionPlan
def processExecutionPlan (plan: ExecutionPlan, ctx: HarvestingContext): Map[String, Any] = Map("notebookInfo" -> notebookInfoJson)
^
command-2044409137370707:39: error: not found: type ReadOperation
def processReadOperation(op: ReadOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar3")
^
command-2044409137370707:40: error: not found: type WriteOperation
def processWriteOperation(op: WriteOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar4")
^
command-2044409137370707:41: error: not found: type DataOperation
def processDataOperation(op: DataOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar5")
^
command-2044409137370707:36: warning: a pure expression does nothing in statement position; multiline expressions may require enclosing parentheses
{
^

Your code doesn't compile for a few reasons.
You're missing some imports (the error log makes this clear):
import za.co.absa.spline.producer.model.v1_1._
import za.co.absa.spline.harvester.extra.UserExtraAppendingPostProcessingFilter
The correct signature for the extra metadata provider is the following:
protected def maybeUserExtraMetadataProvider: Option[UserExtraMetadataProvider]
UserExtraAppendingPostProcessingFilter is just an adapter for the deprecated UserExtraMetadataProvider. So you still need to create an instance:
new UserExtraAppendingPostProcessingFilter(new UserExtraMetadataProvider() {
// ???
})
Please note that we are working on a declarative solution for capturing extra metadata, so that most of the rules and values can be defined in the configuration and little to no coding is required. See https://github.com/AbsaOSS/spline-spark-agent/issues/169
For now, just use UserExtraMetadataProvider.
For more details see https://github.com/AbsaOSS/spline-spark-agent/discussions/228#discussioncomment-819620
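Putting those pieces together, a minimal, untested sketch of the corrected setup might look like this. The two-argument constructor is taken from the error message above, the maybeUserExtraMetadataProvider signature from the answer, and the notebook-metadata code from the question is elided for brevity:
%scala
import org.apache.commons.configuration.Configuration
import za.co.absa.spline.harvester.SparkLineageInitializer._
import za.co.absa.spline.harvester.HarvestingContext
import za.co.absa.spline.harvester.conf.{DefaultSplineConfigurer, StandardSplineConfigurationStack}
import za.co.absa.spline.harvester.extra.UserExtraMetadataProvider
import za.co.absa.spline.producer.model.v1_1._

val splineConf: Configuration = StandardSplineConfigurationStack(spark)

// Per the error message, DefaultSplineConfigurer takes the SparkSession as well
spark.enableLineageTracking(new DefaultSplineConfigurer(spark, splineConf) {
  override protected def maybeUserExtraMetadataProvider: Option[UserExtraMetadataProvider] =
    Some(new UserExtraMetadataProvider {
      override def forExecEvent(event: ExecutionEvent, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar1")
      override def forExecPlan(plan: ExecutionPlan, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar2")
      override def forOperation(op: ReadOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar3")
      override def forOperation(op: WriteOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar4")
      override def forOperation(op: DataOperation, ctx: HarvestingContext): Map[String, Any] = Map("foo" -> "bar5")
    })
})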

Related

Scala circe decode Map[String, String] type

I have a Map[String, String] object which I want to serialise as JSON. I have written an encoder for this type:
import io.circe.{Encoder, Json}
import io.circe.syntax._

implicit val encodeMap: Encoder[Map[String, String]] = new Encoder[Map[String, String]] {
  override def apply(values: Map[String, String]): Json = {
    values.toList
      .map(pair => Json.obj(
        (pair._1, pair._2.asJson)
      )).asJson
  }
}
In addition to the encoder, I need a decoder, but I don't know how to write one. My best try so far:
implicit val decodeMap: Decoder[Map[String, String]] = new Decoder[Map[String, String]] {
  final def apply(c: HCursor): Decoder.Result[Map[String, String]] = ???
}
Pretty basic, but I don't really know how to approach this problem.
Thanks!
Something like this should work but as Andy said above, you should be able to use automatic or semi-automatic derivation in this case.
import cats.syntax.either._
import io.circe.{Decoder, DecodingFailure, HCursor}

implicit val decodeMap: Decoder[Map[String, String]] = new Decoder[Map[String, String]] {
  override def apply(c: HCursor): Decoder.Result[Map[String, String]] = {
    c.keys.fold[Decoder.Result[Map[String, String]]](Right(Map.empty))(
      _.foldLeft(Map[String, String]().asRight[DecodingFailure])((res, k) => {
        res.flatMap((m: Map[String, String]) => {
          c.downField(k).as[String].fold(
            _.asLeft[Map[String, String]],
            v => (m + (k -> v)).asRight[DecodingFailure]
          )
        })
      })
    )
  }
}
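As the answer hints, you usually don't need to hand-roll this at all: circe ships with built-in Map instances derived from KeyDecoder and Decoder. A quick sketch (assuming a reasonably recent circe version):
import io.circe.parser.decode

// circe's built-in instances handle Map[String, String] out of the box
val result = decode[Map[String, String]]("""{"a": "1", "b": "2"}""")
// result: Either[io.circe.Error, Map[String, String]] = Right(Map(a -> 1, b -> 2))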

Avro4s, how to serialise a map with custom key type?

I am using Avro4s. It's easy to serialise a
Map[String, T]
but I have a situation like
sealed trait Base
case object First extends Base
case object Second extends Base
and I need to serialise something like
Map[Base, T]
Any advice on the best way to achieve this? Thanks.
The thing is that according to the Avro spec
Map keys are assumed to be strings.
So the only map type supported by Avro is Map[String, T]. This means you need to write some custom code that maps your Map[Base, T] onto Map[String, T] and back. Something like this will probably work for you:
import scala.collection.breakOut
import scala.collection.immutable.Map
import scala.collection.JavaConverters._
import com.sksamuel.avro4s._
import org.apache.avro.Schema
import org.apache.avro.Schema.Field
object BaseMapAvroHelpers {

  private val nameMap: Map[Base, String] = Map(First -> "first", Second -> "second")
  private val revNameMap: Map[String, Base] = nameMap.toList.map(kv => (kv._2, kv._1)).toMap

  implicit def toSchema[T: SchemaFor]: ToSchema[Map[Base, T]] = new ToSchema[Map[Base, T]] {
    override val schema: Schema = Schema.createMap(implicitly[SchemaFor[T]].apply())
  }

  implicit def toValue[T: SchemaFor : ToValue]: ToValue[Map[Base, T]] = new ToValue[Map[Base, T]] {
    override def apply(value: Map[Base, T]): java.util.Map[String, T] =
      value.map(kv => (nameMap(kv._1), kv._2)).asJava
  }

  implicit def fromValue[T: SchemaFor : FromValue]: FromValue[Map[Base, T]] = new FromValue[Map[Base, T]] {
    override def apply(value: Any, field: Field): Map[Base, T] = {
      val fromValueS = implicitly[FromValue[String]]
      val fromValueT = implicitly[FromValue[T]]
      value.asInstanceOf[java.util.Map[Any, Any]].asScala.map(kv => (revNameMap(fromValueS(kv._1)), fromValueT(kv._2)))(breakOut)
    }
  }
}
Usage example:
import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

case class Wrapper[T](value: T)

def test(): Unit = {
  import BaseMapAvroHelpers._

  val map: Map[Base, String] = Map(First -> "abc", Second -> "xyz")
  val wrapper = Wrapper(map)

  val schema = AvroSchema[Wrapper[Map[Base, String]]]
  println(s"Schema: $schema")

  val bufOut = new ByteArrayOutputStream()
  val out = AvroJsonOutputStream[Wrapper[Map[Base, String]]](bufOut)
  out.write(wrapper)
  out.flush()
  println(s"Avro Out: ${bufOut.size}")
  println(bufOut.toString("UTF-8"))

  val in = AvroJsonInputStream[Wrapper[Map[Base, String]]](new ByteArrayInputStream(bufOut.toByteArray))
  val read = in.singleEntity
  println(s"read: $read")
}
and the output is something like:
Schema: {"type":"record","name":"Wrapper","namespace":"so","fields":[{"name":"value","type":{"type":"map","values":"string"}}]}
Avro Out: 40
{"value":{"first":"abc","second":"xyz"}}
read: Success(Wrapper(Map(First -> abc, Second -> xyz)))

How to bind an enum to a playframework form?

I have an enum of the form:
object MatchFilterType extends Enumeration {
  type MatchFilterType = Value
  val gt = Value("gt")
  val lt = Value("lt")
  val eq = Value("eq")
}
Trying to create a form val in my controller:
case class SearchRequest(mft: MatchFilterType, queryText: String, locations: List[String])

val searchForm: Form[SearchRequest] = Form(
  mapping(
    "mft" -> ????????,
    "queryText" -> nonEmptyText,
    "locations" -> list(text)
  )(SearchRequest.apply)(SearchRequest.unapply)
)
I am using play 2.6.x for this project.
How do I map my enumeration in my Form val?
First, create an implicit Formatter that uses the enum's withName method, which takes a string and turns it into an enum value:
import play.api.data.FormError
import play.api.data.format.Formatter

implicit def matchFilterFormat: Formatter[MatchFilterType] = new Formatter[MatchFilterType] {
  override def bind(key: String, data: Map[String, String]) =
    data.get(key)
      .map(MatchFilterType.withName(_))
      .toRight(Seq(FormError(key, "error.required", Nil)))

  override def unbind(key: String, value: MatchFilterType) =
    Map(key -> value.toString)
}
Then use Forms.of to create a FieldMapping:
Form(...,
  "mft" -> Forms.of[MatchFilterType],
  ...)
Bear in mind that MatchFilterType.withName(_) will throw an exception if the string is not an enum member, so update the bind method to handle this as you need; a sketch follows.
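For example, one way to make bind total is to wrap withName in a Try and report a form error instead of throwing. A sketch; the "error.invalid" message key is an assumption, and missing and invalid values are conflated here for brevity:
import scala.util.Try
import play.api.data.FormError
import play.api.data.format.Formatter

implicit def matchFilterFormat: Formatter[MatchFilterType] = new Formatter[MatchFilterType] {
  override def bind(key: String, data: Map[String, String]) =
    data.get(key)
      .flatMap(s => Try(MatchFilterType.withName(s)).toOption) // None if missing or not a member
      .toRight(Seq(FormError(key, "error.invalid", Nil)))

  override def unbind(key: String, value: MatchFilterType) =
    Map(key -> value.toString)
}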
A more generic approach:
def enumerationFormatter[E <: Enumeration](enum: E): Formatter[E#Value] = new Formatter[E#Value] {
  override def bind(key: String, data: Map[String, String]): Either[Seq[FormError], E#Value] =
    data.get(key).map(s => enum.withName(s)).toRight(Seq(FormError(key, "error.required", Nil)))

  override def unbind(key: String, value: E#Value): Map[String, String] = Map(key -> value.toString)
}
Which can then be used like:
object TestValues extends Enumeration {
  type TestValue = Value
  val Test: TestValue = Value
}

case class MyForm(testValue: TestValue)

object MyForm {
  implicit val testValueFormatter: Formatter[TestValue] = enumerationFormatter(TestValues)
  val form = Form(mapping("testValue" -> of[TestValue])(MyForm.apply)(MyForm.unapply))
}

BSONObjectIDFormat in trait BSONFormats is deprecated

I am using ReactiveMongo version 0.11.11 and I want to implement a method in my DAO that counts all documents by _id.
Here is my DAO:
import com.google.inject.Inject
import models.auth.{Team, Player}
import play.api.libs.concurrent.Execution.Implicits.defaultContext
import play.api.libs.json._
import play.modules.reactivemongo.ReactiveMongoApi
import play.modules.reactivemongo.json._
import reactivemongo.bson._
import reactivemongo.play.json.collection.JSONCollection

import scala.concurrent.Future

trait TeamDao {
  def find(_id: BSONObjectID): Future[Option[Team]]
  def find(name: String): Future[Option[Team]]
  def save(team: Team): Future[Team]
  def link(player: Player, team: Team): Future[Team]
  def update(team: Team): Future[Team]
  def count(team: Option[Team] = None): Future[Int]
  def count(_id: BSONObjectID): Future[Int]
  def countAllPlayersWithTeam(team: Team): Future[Int]
}

class MongoTeamDao @Inject()(reactiveMongoApi: ReactiveMongoApi) extends TeamDao {

  val players = reactiveMongoApi.db.collection[JSONCollection]("players")
  val teams = reactiveMongoApi.db.collection[JSONCollection]("teams")

  def find(_id: BSONObjectID): Future[Option[Team]] = teams.find(BSONDocument("_id" -> _id)).one[Team]

  def find(name: String): Future[Option[Team]] = teams.find(Json.obj("name" -> name)).one[Team]

  def save(team: Team): Future[Team] = teams.insert(team).map(_ => team)

  def link(player: Player, team: Team) = for {
    _ <- players.update(Json.obj("_id" -> player.id), Json.obj("$push" -> BSONDocument("teams" -> team._id)))
    team <- find(team._id.get)
  } yield team.get

  def update(team: Team) = for {
    _ <- teams.update(BSONDocument("_id" -> team._id), BSONDocument("$set" -> BSONDocument("name" -> team.name)))
    team <- find(team._id.get)
  } yield team.get

  def count(team: Option[Team] = None): Future[Int] = {
    val tmpTeam: Team = team.getOrElse {
      return teams.count()
    }
    teams.count(Some(Json.obj("name" -> tmpTeam.name)))
  }

  def count(_id: BSONObjectID): Future[Int] = {
    teams.count(Some(Json.obj("_id" -> _id)))
  }

  def countAllPlayersWithTeam(team: Team): Future[Int] = {
    players.count(Some(Json.obj("teams" -> team._id)))
  }
}
The problem is that I get the following error:
value BSONObjectIDFormat in trait BSONFormats is deprecated: Use [[reactivemongo.play.json.BSONFormats.BSONObjectIDFormat]]
[error] teams.count(Some(Json.obj("_id" -> _id)))
I tried to replace the count method with:
def count(_id: BSONObjectID): Future[Int] = {
  teams.count(Some(BSONDocument("_id" -> _id)))
}
But then I get the following compile error:
[error] found : reactivemongo.bson.BSONDocument
[error] required: MongoTeamDao.this.teams.pack.Document
[error] (which expands to) play.api.libs.json.JsObject
[error] Error occurred in an application involving default arguments.
[error] teams.count(Some(BSONDocument("_id" -> _id)))
You are mixing JSONCollection and BSON values.
It's recommended that you either use JSON serialization with JSONCollection, or the default BSON serialization with BSONCollection.
The deprecation message is a warning indicating that you should use the separate JSON library instead of the former types that were previously included in the Play plugin.
A BSONCollection can be resolved from the Play plugin as follows.
reactiveMongoApi.database.map(_.collection[BSONCollection]("players"))
The functions MongoConnection.(db|apply) and/or ReactiveMongoApi.db are deprecated, and the equivalent .database must be used (which returns Future[DefaultDB] instead of DefaultDB).
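Putting that together, a minimal, untested sketch of the count method using a BSONCollection might look like this (assuming the 0.11.x API; the collection name is taken from the question):
import play.api.libs.concurrent.Execution.Implicits.defaultContext
import reactivemongo.api.collections.bson.BSONCollection
import reactivemongo.bson.{BSONDocument, BSONObjectID}

import scala.concurrent.Future

// Resolve the collection via the non-deprecated .database (returns Future[DefaultDB])
def teams: Future[BSONCollection] =
  reactiveMongoApi.database.map(_.collection[BSONCollection]("teams"))

def count(_id: BSONObjectID): Future[Int] =
  teams.flatMap(_.count(Some(BSONDocument("_id" -> _id))))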

Scala macros for nested case classes to Map and other way around

I want to convert any case class to a Map[String, Any], for example:
case class Person(name: String, address: Address)
case class Address(street: String, zip: Int)

val p = Person("Tom", Address("Jefferson st", 10000))
val mp = p.asMap
//Map("name" -> "Tom", "address" -> Map("street" -> "Jefferson st", "zip" -> 10000))
val p1 = mp.asCC[Person]
assert(p1 === p)
Possible duplicates:
Here is a question with a reflection-based answer.
Here is a question about converting a case class to a map (without nesting).
I also found how to do it for a case class without any nested case classes inside it; here is the code, from here:
package macros

import scala.language.experimental.macros
import scala.reflect.macros.blackbox.Context

trait Mappable[T] {
  def toMap(t: T): Map[String, Any]
  def fromMap(map: Map[String, Any]): T
}

object Mappable {
  implicit def materializeMappable[T]: Mappable[T] = macro materializeMappableImpl[T]

  def materializeMappableImpl[T: c.WeakTypeTag](c: Context): c.Expr[Mappable[T]] = {
    import c.universe._
    val tpe = weakTypeOf[T]
    val companion = tpe.typeSymbol.companion

    val fields = tpe.decls.collectFirst {
      case m: MethodSymbol if m.isPrimaryConstructor => m
    }.get.paramLists.head

    val (toMapParams, fromMapParams) = fields.map { field =>
      val name = field.asTerm.name
      val key = name.decodedName.toString
      val returnType = tpe.decl(name).typeSignature
      (q"$key -> t.$name", q"map($key).asInstanceOf[$returnType]")
    }.unzip

    c.Expr[Mappable[T]] { q"""
      new Mappable[$tpe] {
        def toMap(t: $tpe): Map[String, Any] = Map(..$toMapParams)
        def fromMap(map: Map[String, Any]): $tpe = $companion(..$fromMapParams)
      }
    """ }
  }
}
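For reference, here is a usage sketch of the Mappable type class above for the flat (non-nested) case; the asMap/fromMap helpers are hypothetical wrappers I've added for illustration, not part of the macro:
import macros.Mappable

case class Address(street: String, zip: Int)

// Hypothetical convenience helpers around the Mappable type class
def asMap[T](t: T)(implicit m: Mappable[T]): Map[String, Any] = m.toMap(t)
def fromMap[T](map: Map[String, Any])(implicit m: Mappable[T]): T = m.fromMap(map)

val addr = Address("Jefferson st", 10000)
val mp = asMap(addr)             // Map("street" -> "Jefferson st", "zip" -> 10000)
val addr2 = fromMap[Address](mp) // Address("Jefferson st", 10000)
// Note: macro implementations must be compiled in a separate compilation unit
// from their use sites, and a nested field (e.g. Person's address) would stay
// as a raw Address value in the map; handling nesting is what the question asks for.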
It's also worth mentioning that the Play JSON library and the ReactiveMongo BSON library do the same kind of thing, but those projects were too big for me to work out how they do it.