Bazaarvoice Jolt Generic Spec for multiple Kinds of Input JSON - jolt

I am new to JOLT. I have two different sets of input JSON with the same structure, except that one inner object differs depending on the decider value, as shown below.
Eg: Input json 1
{
"input": {
"decider": 1,
"object1": {
"object1Info": 1,
"obj1SpecificObj2" : {
"obj2info" : "data"
}
},
"doc": {
"docId": "DOC100"
}
}
}
Eg: Input json 2
{
"input": {
"decider": 2,
"object2": {
"object2Info": 2,
"obj2SpecificObj3" : {
"obj3info1" : "data1",
"obj3info2" : "data2",
"other" : {
"otherData" : "data3"
}
}
},
"doc": {
"docId": "DOC100"
}
}
}
output expectation if decider : 1
{
"out" : {
"object" : {
"info" : 1,
"subObject" : {
"subInfo" : "data"
}
},
"doc": {
"docId": "DOC100"
}
}
}
output expectation if decider : 2
{
"out" : {
"object" : {
"info" : 1,
"subObject" : {
"subInfo1" : "data1",
"subInfo2" : "data2",
"other" : {
"otherData" : "data3"
}
}
},
"doc": {
"docId": "DOC100"
}
}
}
I want to write a single generic, conditional Jolt spec that generates the output based on the "decider" value. Is there a way to write a conditional statement inside a spec file?

See https://github.com/bazaarvoice/jolt/issues/231
The same question was asked there.

Related

How to update/add new property to array elements with values from the same element without cursor

I have a huge dataset that needs to be updated quickly to reduce downtime for an application.
I would like documents that have the following example structure
{
"_id" : "5ed4a65efdb46e0001dc37ab",
"arr" : [
{
"oldProp" : "x"
},
{
"oldProp" : "y"
}
]
}
to be updated to
{
"_id" : "5ed4a65efdb46e0001dc37ab",
"arr" : [
{
"oldProp" : "x",
"newProp" : "x"
},
{
"oldProp" : "y",
"newProp" : "y"
}
]
}
Without using cursor!
// Pipeline-style update: rebuilds every element of `arr`, adding `newProp` copied
// from that same element's `oldProp`. The field name matches the example documents,
// which store the array under `arr`.
// NOTE(review): passing an aggregation pipeline as the update argument needs a
// server version that supports pipeline updates — confirm for your deployment.
db.getCollection('someCollection').update({ arr: { $type: "array" } }, [
    {
        $set: {
            arr: {
                $map: {
                    input: "$arr",
                    as: "entry",
                    in: {
                        // Keep the element's existing fields and add newProp alongside them.
                        $mergeObjects: [
                            "$$entry",
                            {
                                newProp: "$$entry.oldProp"
                            }
                        ]
                    }
                }
            }
        }
    }
], false, true) // positional flags: upsert = false, multi = true

Using $match and $unwind to get a count

I am learning the MongoDB aggregation pipeline. I tried to get a matched count from the output of $unwind and $group. I am able to see the results for $unwind and $group, but I am not sure why I didn't get the matched count. Please help me match the entries whose percentage field is greater than 25.
Here's an example document:
{
"_id":ObjectId("599e9dbd8fbad926e712f902"),
"sample":"1",
"attribute":[
{
"functionName":"1",
"percentage":31.6
}
]
}
I tried this:
db.docs3.aggregate({
$unwind:'$attribute'
},
{
$group:{
_id:{
func:"$attribute.functionName",
percen:"$attribute.percentage"
}
}
})
And got this output:
{ "_id" : { "func" : "7", "percen" : 30 } }
{ "_id" : { "func" : "5", "percen" : 23.1 } }
{ "_id" : { "func" : "8", "percen" : 27.8 } }
{ "_id" : { "func" : "6", "percen" : 32.1 } }
{ "_id" : { "func" : "1", "percen" : 31.6 } }
{ "_id" : { "func" : "2", "percen" : 35 } }
{ "_id" : { "func" : "3", "percen" : 7.1 } }
{ "_id" : { "func" : "4", "percen" : 31.6 } }
I tried this:
db.docs3.aggregate({
$unwind:'$attribute'
},
{
$group:{
_id:{
func:"$attribute.functionName",
percen:"$attribute.percentage"
}
}
},
{
$match:{
"attribute.percentage":{
$gt:25
}
}
})
And I got an error.
I'm not sure whether you want a count of:
1. Documents having at least one attribute with percentage > 25
or
2. Attributes having percentage > 25
If you are interested in No. 1 then you do not need to unwind the attribute array in order to apply your match. The following will return a count of documents containing at least one attribute with percentage > 25:
db.getCollection('other').aggregate([
{ $match: { 'attribute.percentage': { $gt: 25 } } },
{ $group: { _id: null, count: { $sum: 1 } } }
])
If you are interested in No. 2 then the following will return a count of attributes with percentage > 25:
db.getCollection('other').aggregate([
{ $unwind: '$attribute' },
{ $match: { 'attribute.percentage': { $gt: 25 } } },
{ $group: { _id: null, count: { $sum: 1 } } }
])
For the following documents:
{
"_id" : ObjectId("599e9dbd8fbad926e712f902"),
"sample" : "1",
"attribute" : [
{
"functionName" : "1",
"percentage" : 31.6
}
]
}
{
"_id" : ObjectId("59a54104e7e9cc2109863beb"),
"sample" : "1",
"attribute" : [
{
"functionName" : "2",
"percentage" : 21.2
}
]
}
{
"_id" : ObjectId("59a542c4e7e9cc2109863c45"),
"sample" : "1",
"attribute" : [
{
"functionName" : "2",
"percentage" : 20.2
},
{
"functionName" : "2",
"percentage" : 28.2
},
{
"functionName" : "2",
"percentage" : 35.2
}
]
}
The first command returns count=2, the second command returns count=3.
Use the money sign to get the current value, and use the same field name you used in your $group stage
db.docs3.aggregate({
    // Produce one document per element of the attribute array.
    $unwind: '$attribute'
},
{
    // After this stage the only field left is _id, holding { func, percen }.
    $group: {
        _id: {
            func: "$attribute.functionName",
            percen: "$attribute.percentage"
        }
    }
},
{
    // $match takes query syntax: the key is a plain field path with no leading "$".
    // The "$" prefix is only for field references inside expressions, as in the
    // $group stage above, so the grouped percentage is addressed as "_id.percen".
    $match: {
        "_id.percen": {
            $gt: 25
        }
    }
})

Update a nested array objects in a different collection and position in MongoDB

I have documents like the following.
How do I update a skillcluster name? Note that in the other document the entry with name: "c" is in the 4th position.
{
Job: {
post: { name:"x" }
skill: {
skillcluster: [
{name:"c++",id:"23"},
{name:"c",id:"898"}
]
}
}
}
{
Job: {
post: { name:"x" }
skill: {
skillcluster: [
{name:"c++",id:"23"},
{name:"java"},
{name:"python"},
{name:"c",id:"898"}
]
}
}
}
You need to query to match the "name" field at the embedded level of the document using "dot notation", and then pass that match with the positional $ operator within the update:
db.collection.update(
{ "Job.skill.skillcluster.name": "c" },
{ "$set": { "Job.skill.skillcluster.$.name": "Simple C"}},
{ "multi": true }
)
Also use the "multi" flag to match and update more than one document.
The result will be:
{
"_id" : ObjectId("55dbfd0ed96d655eb0ed2b4f"),
"Job" : {
"post" : {
"name" : "x"
},
"skill" : {
"skillcluster" : [
{
"name" : "c++",
"id" : "23"
},
{
"name" : "Simple C",
"id" : "898"
}
]
}
}
}
{
"_id" : ObjectId("55dbfd0ed96d655eb0ed2b50"),
"Job" : {
"post" : {
"name" : "x"
},
"skill" : {
"skillcluster" : [
{
"name" : "c++",
"id" : "23"
},
{
"name" : "java"
},
{
"name" : "python"
},
{
"name" : "Simple C",
"id" : "898"
}
]
}
}
}

MongoDB query for a specific field for each document?

Let's say I have 2 users in my user collection:
id: "1",
outer: {
field1: {
inner1: {
a: cats
b: car
}
}
}
id: "2",
outer: {
field1: {
inner1: {
a: dogs
b: bus
}
}
}
Let's say I have 10 users. I want to get the value of field "a" for every user in my database. How do I make this query (in mongo shell)?
// The query and the projection are two separate arguments, and the dotted path
// must name every nesting level of the document: outer -> field1 -> inner1 -> a.
db.user.find({ 'outer.field1.inner1.a': { $exists: true } }, { 'outer.field1.inner1.a': 1 });
Edit: As mentioned in the comments, you can also just do a project using the aggregation framework.
// Project the nested value into a top-level field `a`; the field reference must
// start at the document root (outer), not at field1.
db.user.aggregate({ $project: { a: '$outer.field1.inner1.a' } });
It doesn't matter how nested the objects get, just use dot notation. I wrote out an example below. Your documents were incomplete and ill-formed in a bunch of different ways, so I took the liberty of completing them while trying to match what I think you meant for them to look like.
> db.nested.insert([
{
"_id" : "1",
"outer" : {
"field1" : {
"inner1" : {
"a" : "cats",
"b" : "car"
}
}
}
},
{
"_id" : "2",
"outer" : {
"field1" : {
"inner1" : {
"a" : "dogs",
"b" : "bus"
}
}
}
}])
> db.nested.find({}, { "_id" : 0, "outer.field1.inner1.a" : 1 })
{ "outer" : { "field1" : { "inner1" : { "a" : "cats" } } } }
{ "outer" : { "field1" : { "inner1" : { "a" : "dogs" } } } }
Selecting a subset of the fields of a document is called projection.

How to mapReduce Twitter entities with id list?

TL;DR
My MapReduce isn't listing the _ids properly as values but creates multiple arrays. Any help?
Full story
I have a collection filled with tweets, including the entities. The portion of data that I'm interested in looks something like this:
{
"_id": ObjectId("h98342jdhs99191"),
"text": "tweet text",
"screen_name":"twittername",
"entities":{
media:[
{
"type":"photo",
"media_url":"http://wwww.twitpic.com/HzKd99.jpg"
},
{
"type":"photo",
"media_url":"http://wwww.twitpic.com/HDK43.jpg"
}
]
}
}
The key of the output should be the media_url. Because a url can be tweeted by more than one person I want the value to be an array containing the ids of tweeps. Something like this:
{
"_id": "http://www.foto.com/kdh34a.jpg",
"value":{
{ id:ObjectId("854737272343f8928") },
{ id:ObjectId("23137272378uie8928") },
{ id:ObjectId("85473727fdsd4x77665") },
{ id:ObjectId("8547372723dsd411zzc") }
}
}
I've created the following MapReduce functions:
// Map step passed to db.tweets.mapReduce: emits one (media_url, {ids: [_id]}) pair
// per entry of the tweet's entities.media.
// NOTE(review): `this` is presumably the tweet document being mapped and `emit` is
// presumably provided by the mapReduce runtime — neither is defined in this snippet.
map = function(){
// Tweets with no entities.media value emit nothing.
if(!this.entities.media){
return;
}
// for...in iterates the keys of entities.media; `index` is never declared, so it
// becomes an implicit global.
for(index in this.entities.media){
// Key: the media URL. Value: the tweet's _id wrapped in a one-element `ids` array.
emit(this.entities.media[index].media_url, {ids: [this._id]});
}
}
// Reduce step passed to db.tweets.mapReduce: receives a key and the array of values
// collected for that key, and returns the combined value.
reduce = function(key, values){
// NOTE(review): `result` is a plain object, not an array, and it does not have the
// {ids: [...]} shape that map emits.
var result = {};
// for...in iterates the keys of `values`; `id` is never declared, so it becomes an
// implicit global.
for(id in values){
// indexOf returns the position of values[id] inside `values`; negating it is true
// only when that position is 0.
if(!values.indexOf(values[id])){
// Array push applied to a plain object: every element of `values` (the whole value
// objects, not their ids) is stored under numeric keys "0", "1", ... and a `length`
// property is set on `result`.
Array.prototype.push.apply(result, values);
}
}
// NOTE(review): if the runtime feeds an earlier reduce result back in as one of the
// `values`, that object would be pushed as-is, which would nest results — confirm
// against the mapReduce documentation.
return result;
}
db.tweets.mapReduce(map, reduce, {out: "media"});
When the media_url is unique the result is as follows:
{
"_id" : "http://wwww.twitpic.com/HzKd99.jpg",
"value" : {
"ids" : [
ObjectId("528748b423421150010021fd")
]
}
}
When it's not unique the results get weird:
{
"_id" : "http://wwww.twitpic.com/HzKd99.jpg",
"value" : {
"0" : {
"0" : {
"ids" : [
ObjectId("528733ac234211500100004f")
]
},
"1" : {
"ids" : [
ObjectId("52873c772342115001000d8d")
]
},
"2" : {
"ids" : [
ObjectId("52873e142342115001001017")
]
},
"3" : {
"ids" : [
ObjectId("5287545a2342115001004fd3")
]
},
"length" : 4
},
"1" : {
"ids" : [
ObjectId("5287c43b2342115001010e53")
]
},
"length" : 2
}
}
What is causing this and how do I get one nice list of values?