So I've tried seemingly countless things to get this to work. When I call queueWrite, the println statements give me this:
{ "uuid" : "49f2-0b64-4bf3-49f2a35b-bbe8-4954f742d88b" }
and this:
{ "uuid" : "49f2-0b64-4bf3-49f2a35b-bbe8-4954f742d88b", "name" : "personName", "key" : "3E6A" }
Both of which (I'm pretty sure) are just fine. However, after they print, I get this:
java.lang.IllegalArgumentException: Invalid BSON field name uuid
As far as I know, the field name uuid is fine; the only restriction on field names I could find is that they must not contain '.' characters (and mine don't).
def queueWrite(collection: String, filter: Map[String, () => String], data: Map[String, () => String]) {
  val col = collections.get(collection).get
  val filterDoc = new BsonDocument
  filter.foreach(f => { filterDoc append (f._1, new BsonString(f._2.apply)) })
  val filterBson = Document(filterDoc)
  println("filter: \n" + filterBson.toJson)
  val dataDoc = new BsonDocument
  data.foreach(f => { dataDoc append (f._1, new BsonString(f._2.apply)) })
  val dataBson = Document(dataDoc)
  println("data: \n" + dataBson.toJson)
  val options = new FindOneAndUpdateOptions
  options.returnDocument(ReturnDocument.AFTER)
  options.upsert(true)
  val observer = new Observer[Document] {
    override def onSubscribe(s: Subscription) = s.request(1)
    override def onNext(doc: Document) = println(doc.toJson)
    override def onError(e: Throwable) = e.printStackTrace
    override def onComplete = println("onComplete")
  }
  val observable: Observable[Document] = col.findOneAndUpdate(filterBson, dataBson, options)
  observable.subscribe(observer)
}
Any ideas / suggestions are greatly appreciated as always :)
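One likely cause, for anyone hitting the same exception: the field name itself is fine. findOneAndUpdate expects its second argument to be an update document whose top-level keys are update operators such as $set, and the driver rejects any top-level key that does not start with $, which produces exactly this "Invalid BSON field name" error. A minimal sketch of the operator-based form, using the driver's Updates helpers and reusing the names from the snippet above:

import org.mongodb.scala.model.Updates

// Wrap each field in $set so the server receives an update document
// rather than a plain document with bare field names.
val update = Updates.combine(
  data.map { case (key, value) => Updates.set(key, value.apply) }.toSeq: _*
)
val observable: Observable[Document] = col.findOneAndUpdate(filterBson, update, options)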
Related
I am stuck on an issue. I have created a Scala Maven project in which I am trying to communicate with MongoDB. The project builds successfully and I get no errors when I run it, but still no data gets inserted into my MongoDB.
If I insert data manually through the mongo CLI, my code can read it, but it does not insert even a single record.
Could you please help me find where I am making a mistake?
My MongoDB version is 3.2.10.
App.scala
package com.assignment.scala
import org.mongodb.scala._
import org.mongodb.scala.model.Aggregates._
import org.mongodb.scala.model.Filters._
import org.mongodb.scala.model.Projections._
import org.mongodb.scala.model.Sorts._
import org.mongodb.scala.model.Updates._
import org.mongodb.scala.model._
/**
 * @author ${user.name}
 */
object App {
  def main(args: Array[String]) {
    println("Calling insertDoc==")
    insertDoc()
  }

  def insertDoc() = {
    val mongoClient = MongoClient()
    val database = mongoClient.getDatabase("assignment")
    val collection = database.getCollection("links")
    println("collection find : " + collection.find())
    collection.find().subscribe(
      (user: Document) => println("document------------" + user.toJson()),
      (error: Throwable) => println(s"Query failed: ${error.getMessage}"),
      () => println("Done"))
    collection.drop()
    val bufferedSource = io.Source.fromFile("/home/impadmin/ServiceSource/ml-latest-small/links.csv")
    var firstLine = List[String]()
    var docList = List[Document]()
    for ((line, count) <- bufferedSource.getLines.zipWithIndex) {
      val cols = line.split(",").map(_.trim)
      if (firstLine.size <= 0 && cols.length > 0) {
        firstLine = cols.toList
      } else {
        var doc: Document = Document("_id" -> count)
        for ((a, b) <- firstLine zip cols) {
          doc ++= Document(a -> b)
        }
        docList = docList :+ doc
        collection.insertOne(doc)
      }
    }
    val doc3 = Document("name" -> "MongoDB", "type" -> "database",
      "count" -> 1, "info" -> Document("x" -> 203, "y" -> 102))
    collection.insertOne(doc3)
    collection.find().subscribe((doc1: Document) => println(doc1.toJson()),
      (e: Throwable) => println(s"There was an error: $e"))
  }
}
It worked using subscribe: the driver's observables are cold, so nothing runs until something subscribes to them.
val insertObservable1: Observable[Completed] = collection.insertMany(docList)
val latch1 = new CountDownLatch(1)
insertObservable1.subscribe(new Observer[Completed] {
  override def onError(e: Throwable): Unit = {
    println("Error1")
    e.printStackTrace()
  }
  override def onSubscribe(subscription: Subscription): Unit = {
    subscription.request(java.lang.Long.MAX_VALUE)
    println("inserting1-------------")
  }
  override def onComplete(): Unit = {
    println("Completed1")
    latch1.countDown()
  }
  override def onNext(result: Completed): Unit = {
    println("Next1-----------")
  }
})
latch1.await()
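For what it's worth, the same wait can be expressed without a hand-rolled latch, since the Scala driver can convert an observable to a Future. A minimal sketch (the exact result type of toFuture() depends on the driver version):

import scala.concurrent.Await
import scala.concurrent.duration.Duration

// toFuture() subscribes to the observable, so the insert actually runs.
val insertResult = Await.result(collection.insertMany(docList).toFuture(), Duration.Inf)
println(s"insertMany finished: $insertResult")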
I am trying to define a member function in a class that would be used as a UDF while parsing data from a JSON file. I am using a trait to define a set of methods and a class to override those methods.
trait geouastr {
  def getGeoLocation(ipAddress: String): Map[String, String]
  def uaParser(ua: String): Map[String, String]
}

class GeoUAData(appName: String, sc: SparkContext, conf: SparkConf, combinedCSV: String) extends geouastr with Serializable {
  val spark = SparkSession.builder.config(conf).getOrCreate()
  val GEOIP_FILE_COMBINED = combinedCSV
  val logger = LogFactory.getLog(this.getClass)
  val allDF = spark.
    read.
    option("header", "true").
    option("inferSchema", "true").
    csv(GEOIP_FILE_COMBINED).cache
  val emptyMap = Map(
    "country" -> "",
    "state" -> "",
    "city" -> "",
    "zipCode" -> "",
    "latitude" -> 0.0.toString(),
    "longitude" -> 0.0.toString())

  override def getGeoLocation(ipAddress: String): Map[String, String] = {
    val ipLong = ipToLong(ipAddress)
    try {
      logger.error("Entering UDF " + ipAddress + " allDF " + allDF.count())
      val resultDF = allDF.
        filter(allDF("network").cast("long") <= ipLong.get).
        filter(allDF("broadcast") >= ipLong.get).
        select(allDF("country_name"), allDF("subdivision_1_name"), allDF("city_name"),
          allDF("postal_code"), allDF("latitude"), allDF("longitude"))
      val matchingDF = resultDF.take(1)
      val matchRow = matchingDF(0)
      logger.error("Lookup for " + ipAddress + " Map " + matchRow.toString())
      // The Map must be the last expression so the method actually returns it.
      Map(
        "country" -> nullCheck(matchRow.getAs[String](0)),
        "state" -> nullCheck(matchRow.getAs[String](1)),
        "city" -> nullCheck(matchRow.getAs[String](2)),
        "zipCode" -> nullCheck(matchRow.getAs[String](3)),
        "latitude" -> matchRow.getAs[Double](4).toString(),
        "longitude" -> matchRow.getAs[Double](5).toString())
    } catch {
      case nse: NoSuchElementException =>
        logger.error("No such element", nse)
        emptyMap
      case npe: NullPointerException =>
        logger.error("NPE for " + ipAddress + " allDF " + allDF.count(), npe)
        emptyMap
      case ex: Exception =>
        logger.error("Generic exception " + ipAddress, ex)
        emptyMap
    }
  }

  def nullCheck(input: String): String = {
    if (input != null) input
    else ""
  }

  override def uaParser(ua: String): Map[String, String] = {
    val client = Parser.get.parse(ua)
    Map(
      "os" -> client.os.family,
      "device" -> client.device.family,
      "browser" -> client.userAgent.family)
  }

  def ipToLong(ip: String): Option[Long] = {
    Try(ip.split('.').ensuring(_.length == 4)
      .map(_.toLong).ensuring(_.forall(x => x >= 0 && x < 256))
      .zip(Array(256L * 256L * 256L, 256L * 256L, 256L, 1L))
      .map { case (x, y) => x * y }
      .sum).toOption
  }
}
I notice that uaParser works fine, while getGeoLocation returns emptyMap (it runs into an NPE). Here is a snippet that shows how I am using this in the main method.
val appName = "SampleApp"
val conf: SparkConf = new SparkConf().setAppName(appName)
val sc: SparkContext = new SparkContext(conf)
val spark = SparkSession.builder.config(conf).enableHiveSupport().getOrCreate()

val geouad = new GeoUAData(appName, sc, conf, args(1))
val uaParser = Sparkudf(geouad.uaParser(_: String))
val geolocation = Sparkudf(geouad.getGeoLocation(_: String))

val sampleRdd = sc.textFile(args(0))
val json = sampleRdd.filter(_.nonEmpty)
import spark.implicits._
val sampleDF = spark.read.json(json)
val columns = sampleDF.select($"user-agent", $"source_ip")
  .withColumn("sourceIp", $"source_ip")
  .withColumn("geolocation", geolocation($"source_ip"))
  .withColumn("uaParsed", uaParser($"user-agent"))
  .withColumn("device", ($"uaParsed")("device"))
  .withColumn("os", ($"uaParsed")("os"))
  .withColumn("browser", ($"uaParsed")("browser"))
  .withColumn("country", ($"geolocation")("country"))
  .withColumn("state", ($"geolocation")("state"))
  .withColumn("city", ($"geolocation")("city"))
  .withColumn("zipCode", ($"geolocation")("zipCode"))
  .withColumn("latitude", ($"geolocation")("latitude"))
  .withColumn("longitude", ($"geolocation")("longitude"))
  .drop("geolocation")
  .drop("uaParsed")
Questions:
1. Should we switch from a class to an object for defining UDFs? (I can keep it as a singleton.)
2. Can a class member function be used as a UDF?
3. When such a UDF is invoked, will a class member like allDF remain initialized?
4. Will a val declared as a member variable get initialized at the time of construction of geouad?
I am new to Scala; thanks in advance for guidance/suggestions.
1. No, switching from a class to an object is not necessary for defining UDFs; it only changes how the UDF is called.
2. Yes, you can use a class member function as a UDF, but first you need to register the function as a UDF:
spark.sqlContext.udf.register("registeredName", classInstance.method _)
3. No; the other members are initialized when any one of the UDFs is called.
4. Yes, a val declared as a member variable will be initialized at the time geouad is instantiated and actions are performed.
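To illustrate point 2 concretely, here is a minimal sketch of both registration styles (udf comes from org.apache.spark.sql.functions; the names reuse the question's snippet, while the SQL table name logs is hypothetical):

import org.apache.spark.sql.functions.udf

val geouad = new GeoUAData(appName, sc, conf, args(1))

// Style 1: register by name, usable from SQL strings.
spark.sqlContext.udf.register("uaParserUdf", geouad.uaParser _)
spark.sql("SELECT uaParserUdf(`user-agent`) FROM logs")

// Style 2: wrap with functions.udf, usable from the DataFrame API.
val uaParserCol = udf(geouad.uaParser _)
val withUa = sampleDF.withColumn("uaParsed", uaParserCol($"user-agent"))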
Error message:
org.bson.codecs.configuration.CodecConfigurationException: Can't find a codec for class org.mongodb.scala.bson.collection.immutable.Document
Code:
def queueWrite(collection: String, filter: Map[String, () => String], data: Map[String, () => String]) {
  val col = collections.get(collection).get
  val filterBson = Document()
  filter.foreach(f => { filterBson.append(f._1, f._2.apply) })
  val dataBson = Document()
  data.foreach(f => { dataBson.append(f._1, f._2.apply) })
  val options = new FindOneAndUpdateOptions
  options.returnDocument(ReturnDocument.AFTER)
  options.upsert(true)
  val observer = new Observer[Document] {
    override def onNext(doc: Document) = println(doc.toJson)
    override def onError(e: Throwable) = e.printStackTrace
    override def onComplete = println("onComplete")
  }
  val observable: Observable[Document] = col.findOneAndUpdate(filterBson, dataBson, options)
  observable.subscribe(observer)
}
Called with:
val filter = Map[String, () => String]("uuid" -> (() => p.getUniqueId.toString))
var dataMap = Map[String, () => String]()
dataMap = dataMap + ("uuid" -> (() => p.getUniqueId.toString))
dataMap = dataMap + ("nickname" -> (() => p.getDisplayName))
queueWrite("players", filter, dataMap)
I've tried using mutable documents, but then realized that findOneAndUpdate returns an immutable Document. I also tried using a BsonDocument with equal for the filter, but that of course had no effect. I'm not really sure where to go from here; any help would be greatly appreciated :)
My MongoClientSettings originally looked like this:
private val settings = MongoClientSettings.builder
  .clusterSettings(clusterSettings)
  .build
I needed to change it to this:
private val settings = MongoClientSettings.builder
  .clusterSettings(clusterSettings)
  .codecRegistry(MongoClient.DEFAULT_CODEC_REGISTRY)
  .build
It seems the driver does not assume the default codec registry when the settings are built manually.
Thanks to @Ross for the help!
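If custom codecs are ever needed on top of the defaults, the registries can be combined rather than replaced. A sketch (MyCodec is a hypothetical org.bson.codecs.Codec implementation):

import org.bson.codecs.configuration.CodecRegistries.{fromCodecs, fromRegistries}

// Combine the driver's default registry with a custom codec.
private val settings = MongoClientSettings.builder
  .clusterSettings(clusterSettings)
  .codecRegistry(fromRegistries(MongoClient.DEFAULT_CODEC_REGISTRY, fromCodecs(new MyCodec())))
  .build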
What I did works fine (at least it looks like it does), but I am not convinced this is the best way to do it...
Basically, I wanted to have my i18n translations in a database instead of property files, so users can easily edit those translations and a cache can serve them to the other users within a short period of time. I use an Akka actor to read from the database and build a cache used by the MessagesApi (before, I always needed to redeploy when the property files changed).
Basically, is what I did totally the wrong way to do it?
TranslationActor.scala :
class TranslationActor extends Actor {
  def receive = {
    case _ => {
      Logger.info("Starting to cache the translations")
      TranslationActor.tempCache = ListMap.empty
      var translations: ListMap[String, String] = ListMap.empty
      for (acceptedLanguage <- TranslationActor.acceptedLanguages) {
        val translationLanguageId: Long = TranslationLanguage.findByCode(acceptedLanguage).get.id
        val languageTranslations: Seq[Translation] = Translation.findAllByLanguageId(translationLanguageId)
        translations = new ListMap[String, String]
        for (languageTranslation <- languageTranslations) {
          val tag = EnglishTranslation.findById(languageTranslation.englishTranslationId).get.tag
          var returnedTranslation: String = languageTranslation.translation
          if (returnedTranslation.isEmpty) {
            returnedTranslation = tag
          }
          translations += tag -> new CacheValue(new Locale(acceptedLanguage), returnedTranslation).stringVar
        }
        TranslationActor.tempCache += acceptedLanguage -> translations
      }
      TranslationActor.cache = TranslationActor.tempCache
      Logger.info("Finished to cache the translations")
    }
  }
}

object TranslationActor {
  var acceptedLanguages: Seq[String] = Seq("fr", "en")
  var cache: ListMap[String, ListMap[String, String]] = ListMap.empty
  var tempCache: ListMap[String, ListMap[String, String]] = ListMap.empty
}

class CacheValue(locale: Locale, string: String) {
  val created: Long = System.currentTimeMillis
  var messageFormat: MessageFormat = null
  var localeVar: Locale = locale
  var stringVar: String = string

  def isOlderThan(period: Long): Boolean = {
    (System.currentTimeMillis - created) > (period * 1000)
  }

  def getMessageFormat: MessageFormat = {
    if (messageFormat == null) {
      if (stringVar != null) {
        messageFormat = new MessageFormat(stringVar, localeVar)
      } else {
        messageFormat = new MessageFormat("", localeVar)
      }
    }
    messageFormat
  }
}
ManageTranslationDaemon.scala :
@Singleton
class ManageTranslationDaemon @Inject() (actorSystem: ActorSystem, applicationLifecycle: ApplicationLifecycle) {
  Logger.info("Scheduling the translation daemon")
  val translationActor = actorSystem.actorOf(Props(new TranslationActor()))
  actorSystem.scheduler.schedule(1 seconds, 30 minutes, translationActor, "translationDaemon")

  applicationLifecycle.addStopHook { () =>
    Logger.info("Shutting down translation daemon")
    Future.successful(actorSystem.shutdown())
  }
}
TranslationGuiceConfiguration.scala : (extends com.google.inject.AbstractModule)
class TranslationGuiceConfiguration extends AbstractModule {
  def configure(): Unit = {
    bind(classOf[ManageTranslationDaemon]).asEagerSingleton()
  }
}
Then I extended parts of the DefaultMessagesApi (by looking at the code of MessagesApi) in
MessagesPersoApi.scala :
class MessagesPersoApi @Inject() (environment: Environment, configuration: Configuration, langs: Langs)
  extends DefaultMessagesApi(environment, configuration, langs) {

  private def joinPaths(first: Option[String], second: String) = first match {
    case Some(parent) => new java.io.File(parent, second).getPath
    case None => second
  }

  override protected def loadMessages(langCode: String): Map[String, String] = {
    TranslationActor.cache.getOrElse(langCode, loadMessagesFromFile("messages." + langCode))
  }

  protected def loadMessagesFromFile(langCode: String): Map[String, String] = {
    import scala.collection.JavaConverters._
    environment.classLoader.getResources(joinPaths(messagesPrefix, langCode)).asScala.toList
      .filterNot(url => Resources.isDirectory(environment.classLoader, url)).reverse
      .map { messageFile =>
        Messages.parse(Messages.UrlMessageSource(messageFile), messageFile.toString).fold(e => throw e, identity)
      }.foldLeft(Map.empty[String, String]) { _ ++ _ }
  }

  override protected def loadAllMessages: Map[String, Map[String, String]] = {
    langs.availables.map(_.code).map { lang =>
      (lang, loadMessages(lang))
    }.toMap
      .+("default" -> loadMessagesFromFile("messages"))
      .+("default.play" -> loadMessagesFromFile("messages.default"))
  }
}
And finally created a module (play.api.inject.Module)
MessagesPersoModule.scala :
class MessagesPersoModule extends Module {
  def bindings(environment: Environment, configuration: Configuration) = {
    Seq(
      bind[Langs].to[DefaultLangs],
      bind[MessagesApi].to[MessagesPersoApi]
    )
  }
}
And at the end, I used it in my application.conf :
play.modules.disabled += "play.api.i18n.I18nModule"
play.modules.enabled += "modules.MessagesPersoModule"
play.modules.enabled += "modules.TranslationGuiceConfiguration"
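For completeness, nothing changes at the call sites: controllers keep injecting MessagesApi and resolve keys as usual, now served from the database-backed cache. A minimal sketch (the controller and the message key are hypothetical):

import javax.inject.Inject
import play.api.i18n.MessagesApi
import play.api.mvc._

class HomeController @Inject() (messagesApi: MessagesApi) extends Controller {
  def index = Action { request =>
    // Resolves through the bound MessagesPersoApi, i.e. the TranslationActor cache.
    val messages = messagesApi.preferred(request)
    Ok(messages("home.title"))
  }
}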
Does that actually make sense? It seemed to me that it was a bit "complicated" to write. Is there an easier way to achieve the same logic with less code/classes?
Thanks,
Yoann
Is there a way to have Slick's code generation generate code for only a single schema, say, public? I have extensions that create a whole ton of tables (e.g. postgis, pg_jobman) that make the code Slick generates gigantic.
Use this code with the appropriate values and schema names:
object CodeGenerator {
  def outputDir: String = ""
  def pkg: String = ""
  def schemaList: String = "schema1, schema2"
  def url: String = "dburl"
  def fileName: String = ""

  val user = "dbUsername"
  val password = "dbPassword"
  val slickDriver = "scala.slick.driver.PostgresDriver"
  val JdbcDriver = "org.postgresql.Driver"
  val container = "Tables"

  def generate() = {
    val driver: JdbcProfile = buildJdbcProfile
    val schemas = createSchemaList
    val model = createModel(driver, schemas)

    val codegen = new SourceCodeGenerator(model) {
      // customize Scala table name (table class, table values, ...)
      override def tableName = dbTableName => dbTableName match {
        case _ => dbTableName + "Table"
      }

      override def code = {
        // imports is copied right out of
        // scala.slick.model.codegen.AbstractSourceCodeGenerator
        val imports = {
          "import scala.slick.model.ForeignKeyAction\n" +
            (if (tables.exists(_.hlistEnabled)) {
              "import scala.slick.collection.heterogenous._\n" +
                "import scala.slick.collection.heterogenous.syntax._\n"
            } else "") +
            (if (tables.exists(_.PlainSqlMapper.enabled)) {
              "import scala.slick.jdbc.{GetResult => GR}\n" +
                "// NOTE: GetResult mappers for plain SQL are only generated for tables where Slick knows how to map the types of all columns.\n"
            } else "") + "\n\n"
        }
        val bySchema = tables.groupBy(t => t.model.name.schema)
        val schemaFor = (schema: Option[String]) => {
          bySchema(schema).sortBy(_.model.name.table).map(
            _.code.mkString("\n")
          ).mkString("\n\n")
        }
        // Assemble the final source (the original answer elided this final
        // expression; reconstructed here so code returns a String as required).
        imports + bySchema.keys.toList.map(schemaFor).mkString("\n\n")
      }

      val joins = tables.flatMap(_.foreignKeys.map { foreignKey =>
        import foreignKey._
        val fkt = referencingTable.TableClass.name
        val pkt = referencedTable.TableClass.name
        val columns = referencingColumns.map(_.name) zip referencedColumns.map(_.name)
        s"implicit def autojoin${fkt + name.toString} = (left: ${fkt}, right: ${pkt}) => " +
          columns.map {
            case (lcol, rcol) => "left." + lcol + " === " + "right." + rcol
          }.mkString(" && ")
      })

      override def entityName = dbTableName => dbTableName match {
        case _ => dbTableName
      }

      override def Table = new Table(_) {
        table =>
        // customize table value (TableQuery) name (uses tableName as a basis)
        override def TableValue = new TableValue {
          override def rawName = super.rawName.uncapitalize
        }
        // override generator responsible for columns
        override def Column = new Column(_) {
          // customize Scala column names
          override def rawName = (table.model.name.table, this.model.name) match {
            case _ => super.rawName
          }
        }
      }
    }

    println(outputDir + File.separator + fileName)
    new File(outputDir).mkdirs()
    val fw = new FileWriter(outputDir + File.separator + fileName)
    fw.write(codegen.packageCode(slickDriver, pkg, container))
    fw.close()
  }

  def createModel(driver: JdbcProfile, schemas: Set[Option[String]]): Model = {
    driver.simple.Database
      .forURL(url, user = user, password = password, driver = JdbcDriver)
      .withSession { implicit session =>
        val filteredTables = driver.defaultTables.filter(
          (t: MTable) => schemas.contains(t.name.schema)
        )
        PostgresDriver.createModel(Some(filteredTables))
      }
  }

  def createSchemaList: Set[Option[String]] = {
    // trim so that a list like "schema1, schema2" (with spaces) still matches
    schemaList.split(",").map(_.trim).map {
      case "" => None
      case name => Some(name)
    }.toSet
  }

  def buildJdbcProfile: JdbcProfile = {
    val module = currentMirror.staticModule(slickDriver)
    val reflectedModule = currentMirror.reflectModule(module)
    val driver = reflectedModule.instance.asInstanceOf[JdbcProfile]
    driver
  }
}
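With the placeholder vals at the top filled in, generation is then a single call:

CodeGenerator.generate()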
I encountered the same problem and I found this question. The answer by S.Karthik sent me in the right direction. However, the code in the answer is slightly outdated and, I think, a bit over-complicated. So I crafted my own solution:
import slick.codegen.SourceCodeGenerator
import slick.driver.JdbcProfile
import slick.model.Model

import scala.concurrent.duration.Duration
import scala.concurrent.{Await, ExecutionContext}

val slickDriver = "slick.driver.PostgresDriver"
val jdbcDriver = "org.postgresql.Driver"
val url = "jdbc:postgresql://localhost:5432/mydb"
val outputFolder = "/path/to/src/test/scala"
val pkg = "com.mycompany"
val user = "user"
val password = "password"

object MySourceCodeGenerator {
  def run(slickDriver: String, jdbcDriver: String, url: String, outputDir: String,
          pkg: String, user: Option[String], password: Option[String]): Unit = {
    val driver: JdbcProfile =
      Class.forName(slickDriver + "$").getField("MODULE$").get(null).asInstanceOf[JdbcProfile]
    val dbFactory = driver.api.Database
    val db = dbFactory.forURL(url, driver = jdbcDriver, user = user.orNull,
      password = password.orNull, keepAliveConnection = true)
    try {
      // **1**
      val allSchemas = Await.result(db.run(
        driver.createModel(None, ignoreInvalidDefaults = false)(ExecutionContext.global).withPinnedSession),
        Duration.Inf)
      // **2**
      val publicSchema = new Model(allSchemas.tables.filter(_.name.schema.isEmpty), allSchemas.options)
      // **3**
      new SourceCodeGenerator(publicSchema).writeToFile(slickDriver, outputDir, pkg)
    } finally db.close
  }
}

MySourceCodeGenerator.run(slickDriver, jdbcDriver, url, outputFolder, pkg, Some(user), Some(password))
I'll explain what's going on here:
I copied the run function from the SourceCodeGenerator class in the slick-codegen library. (I used version slick-codegen_2.10-3.1.1.)
// **1**: In the original code, the generated Model was referenced in a val called m. I renamed that to allSchemas.
// **2**: I created a new Model (publicSchema), using the options from the original model and a filtered version of its tables set. It turns out tables from the public schema don't get a schema name in the model, hence the isEmpty. Should you need tables from one or more other schemas, you can easily create a different filter expression (see the sketch below).
// **3**: I created a SourceCodeGenerator with the publicSchema model.
Of course, it would be even better if the Slick code generator incorporated an option to select one or more schemas.
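For instance, to keep a single named schema instead of public, the filter in step 2 can match the schema qualifier. A sketch under the same setup ("myschema" is a placeholder):

// Keep only tables whose schema qualifier matches "myschema".
val mySchema = new Model(
  allSchemas.tables.filter(_.name.schema.exists(_ == "myschema")),
  allSchemas.options)
new SourceCodeGenerator(mySchema).writeToFile(slickDriver, outputDir, pkg)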