search document values within a text in mongodb - mongodb

I need to find all document values which are within a text/string:
Example:
Imagine I have the tag collection with the following documents:
db.tag.find()
{ "_id" : ObjectId("536f7107c55b2acc61000bc8"), "name" : "star" }
{ "_id" : ObjectId("536f710fc55b2acc61000bc9"), "name" : "star wars" }
{ "_id" : ObjectId("536f7117c55b2acc61000bca"), "name" : "spider" }
{ "_id" : ObjectId("537087d16ac5b5f6f58f0b1b"), "name" : "starting" }
I need something like this (example in mongodb shell):
db.tag.find({"name": { $subStrOF: "star wars episode VII" }})
returning this:
{ "_id" : ObjectId("536f7107c55b2acc61000bc8"), "name" : "star" }
{ "_id" : ObjectId("536f710fc55b2acc61000bc9"), "name" : "star wars" }
Any idea?
Thank you very much

First, start the mongod process with the following command-line option:
--setParameter textSearchEnabled=true
Example:
mongod --setParameter textSearchEnabled=true
Then in your mongo shell, create an index for name in tag collection
db.tag.ensureIndex({name : "text"});
Now you can query using text:
db.tag.runCommand("text", {search : "star wars episode"});
This will return you something like this
{
"queryDebugString" : "star||||||",
"language" : "english",
"results" : [
{
"score" : 1.1,
"obj" : {
"_id" : ObjectId("536f7107c55b2acc61000bc8"),
"name" : "star"
}
},
{
"score" : 0.75,
"obj" : {
"_id" : ObjectId("536f710fc55b2acc61000bc9"),
"name" : "star wars"
}
}
],
"stats" : {
"nscanned" : 2,
"nscannedObjects" : 0,
"n" : 2,
"nfound" : 2,
"timeMicros" : 152
},
"ok" : 1
}
To get only the results:
db.tag.runCommand("text", {search : "star wars episode"}).results
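I have tested this against my local DB and it works fine; please check it on yours.
For more info about the text command, see the MongoDB docs.
BTW, from MongoDB 2.6 onward you can use the $text query operator instead (see the docs):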
db.tag.find({$text : {$search : "star wars episode VII"}});

I got it working using $where, example below:
Command:
db.tag.find({$where:"'star wars episode VII'.search(this.name) >= 0"})
The output:
{ "_id" : ObjectId("536f7107c55b2acc61000bc8"), "name" : "star" }
{ "_id" : ObjectId("536f710fc55b2acc61000bc9"), "name" : "star wars" }
Hope it helps

Related

MongoDB find fields which are substring of a query text

I have been looking for a way to do that but couldn't find any.
I'd like to know if it is possible, for a given query text, to return all documents whose field value is contained in that text.
For example my dataset is as follows:
{ "_id" : ObjectId("5d5c2b4cc1f74ace3a48a070"), "id" : 0, "term" : "shorts" }
{ "_id" : ObjectId("5d5c2b4cc1f74ace3a48a071"), "id" : 0, "term" : "jacket" }
{ "_id" : ObjectId("5d5c2b4cc1f74ace3a48a072"), "id" : 1, "term" : "yellow jacket" }
{ "_id" : ObjectId("5d5c2b56c1f74ace3a48a073"), "id" : 2, "term" : "blue jacket" }
{ "_id" : ObjectId("5d5c2b65c1f74ace3a48a074"), "id" : 3, "term" : "blue shorts" }
{ "_id" : ObjectId("5d5c2b71c1f74ace3a48a075"), "id" : 4, "term" : "red shorts" }
And now, given a text like: "I really love blue shorts", the return should be only:
{ "_id" : ObjectId("5d5c2b65c1f74ace3a48a074"), "id" : 3, "term" : "blue shorts" }
{ "_id" : ObjectId("5d5c2b4cc1f74ace3a48a070"), "id" : 0, "term" : "shorts" }
It's something like query.contains(field)
Using $where is generally discouraged in MongoDB because it runs
JavaScript inside the query system, which can be slow.
You can try this out if the dataset is not very large. It's like doing a reverse regex match: checking whether the field value is contained in the query text.
db.collection.find({$where: "\"I really love blue shorts\".match(this.term)"});
Which outputs:
{ "_id" : ObjectId("5d5c32c1236f19364a8aad4d"), "id" : 0, "term" : "shorts"}
{ "_id" : ObjectId("5d5c32c1236f19364a8aad51"), "id" : 3, "term" : "blue shorts"}
NOTE: This assumes that term is defined in every document; otherwise you can use a JavaScript function for the $where value to deal with edge cases such as undefined fields, etc.
{ $where: function() { return /* after edge cases dealt with */ } }
The following query can get us the expected output:
db.collection.aggregate([
{
$addFields:{
"searchString":"I really love blue shorts"
}
},
{
$match:{
$expr:{
$gt:[
{
$indexOfBytes:["$searchString","$term"]
},
-1
]
}
}
},
{
$project:{
"searchString":0
}
}
]).pretty()
Data set:
{
"_id" : ObjectId("5d5c2b4cc1f74ace3a48a070"),
"id" : 0,
"term" : "shorts"
}
{
"_id" : ObjectId("5d5c2b4cc1f74ace3a48a071"),
"id" : 0,
"term" : "jacket"
}
{
"_id" : ObjectId("5d5c2b4cc1f74ace3a48a072"),
"id" : 1,
"term" : "yellow jacket"
}
{
"_id" : ObjectId("5d5c2b56c1f74ace3a48a073"),
"id" : 2,
"term" : "blue jacket"
}
{
"_id" : ObjectId("5d5c2b65c1f74ace3a48a074"),
"id" : 3,
"term" : "blue shorts"
}
{
"_id" : ObjectId("5d5c2b71c1f74ace3a48a075"),
"id" : 4,
"term" : "red shorts"
}
Output:
{
"_id" : ObjectId("5d5c2b4cc1f74ace3a48a070"),
"id" : 0,
"term" : "shorts"
}
{
"_id" : ObjectId("5d5c2b65c1f74ace3a48a074"),
"id" : 3,
"term" : "blue shorts"
}

mongo query doesn't work with $ in field name

Command to get raw_data:
db.raw_data.find({'cat':'like'},
{'properties':1}).limit(1).pretty()
data:
{
"_id" : ObjectId("5656b9a0c2492dec3442da52"),
"properties" : {
"subcategory" : "49",
"$carrier" : "Vodafone India",
"$radio" : "HSDPA",
"$region" : "Gujarat",
"$screen_width" : 375,
"$wifi" : false,
"mp_lib" : "iphone",
"product_unlike_flag" : false,
"mp_device_model" : "iPhone7,2",
"user_id" : "4",
"$city" : "Ahmedabad",
"$manufacturer" : "Apple",
"$os" : "iPhone OS",
"brand" : "AO",
"gender" : "Men",
"mp_country_code" : "IN",
"time" : 1445376786,
"$app_release" : "0.8.0",
"$lib_version" : "2.8.2",
"$model" : "iPhone7,2",
"$screen_height" : 667,
"category" : "48",
"$app_version" : "0.8.0",
"$os_version" : "9.0.2",
"itemcode" : "174",
"source" : "Product"
}
}
I want to extract user_id and city from this data.
I tried these Command :
Command1 :
db.raw_data.aggregate([{$group : {_id :{'user_id': "$properties.user_id","cat":"$cat","brand":"$properties.brand" } ,"num_tutorial" :{$sum:1}}} ,{ $project : {properties.$city : 1 } } ])
Command2:
db.raw_data.find({'cat':'like'},{'properties.$city':1})
which gave me an error:
Error: error: {
"$err" : "Can't canonicalize query: BadValue Positional projection 'properties.$city' does not match the query document.",
"code" : 17287
}
Is there any way/query so I can extract user-city relations from this data.
In such cases MongoDB recommends using the Unicode full-width equivalent of '$' (U+FF04). This method is called $ sign escaping.
So your find query will look like following:
db.raw_data.find({'cat':'like'},{'properties.\uff04city':1})
Hope this helps :)
Result:
> db.collection.find({},{'properties.\uff04city':1})
{ "_id" : ObjectId("5656e09ccb0a925b3d5d16f2"), "properties" : { "$city" : "value" } }
{ "_id" : ObjectId("5656e502cb0a925b3d5d16f3"), "properties" : { "$city" : "value" } }

Get document based on multiple criteria of embedded collection

I have the following document, and I need to search for multiple items in the embedded collection "items".
Here's an example of a single SKU
db.sku.findOne()
{
"_id" : NumberLong(1192),
"description" : "Uploaded via CSV",
"items" : [
{
"_id" : NumberLong(2),
"category" : DBRef("category", NumberLong(1)),
"description" : "840 tag visual",
"name" : "840 Visual Mini Round",
"version" : NumberLong(0)
},
{
"_id" : NumberLong(7),
"category" : DBRef("category", NumberLong(2)),
"description" : "Maxi",
"name" : "Maxi",
"version" : NumberLong(0)
},
{
"_id" : NumberLong(11),
"category" : DBRef("category", NumberLong(3)),
"description" : "Button",
"name" : "Button",
"version" : NumberLong(0)
},
{
"_id" : NumberLong(16),
"category" : DBRef("category", NumberLong(4)),
"customizationFields" : [
{
"_class" : "CustomizationField",
"_id" : NumberLong(1),
"displayText" : "Custom Print 1",
"fieldName" : "customPrint1",
"listOrder" : 1,
"maxInputLength" : 12,
"required" : false,
"version" : NumberLong(0)
},
{
"_class" : "CustomizationField",
"_id" : NumberLong(2),
"displayText" : "Custom Print 2",
"fieldName" : "customPrint2",
"listOrder" : 2,
"maxInputLength" : 17,
"required" : false,
"version" : NumberLong(0)
}
],
"description" : "2 custom lines of farm print",
"name" : "Custom 2",
"version" : NumberLong(2)
},
{
"_id" : NumberLong(20),
"category" : DBRef("category", NumberLong(5)),
"description" : "Color Red",
"name" : "Red",
"version" : NumberLong(0)
}
],
"skuCode" : "NF-USDA-XC2/SM-BC-R",
"version" : 0,
"webCowOptions" : "840miniwithcust2"
}
There are repeat items.id throughout the embedded collection. Each Sku is made up of multiple items, all combinations are unique, but one item will be part of many Skus.
I'm struggling with the query structure to get what I'm looking for.
Here are a few things I have tried:
db.sku.find({'items._id':2},{'items._id':7})
That one only returns items with the id of 7
db.sku.find({items:{$all:[{_id:5}]}})
That one doesn't return anything, but it came up when I was looking for solutions. I found out about it in the MongoDB manual.
Here's an example of a expected result:
sku:{ "_id" : NumberLong(1013),
"items" : [ { "_id" : NumberLong(5) },
{ "_id" : NumberLong(7) },
{ "_id" : NumberLong(12) },
{ "_id" : NumberLong(16) },
{ "_id" :NumberLong(2) } ] },
sku:
{ "_id" : NumberLong(1014),
"items" : [ { "_id" : NumberLong(5) },
{ "_id" : NumberLong(7) },
{ "_id" : NumberLong(2) },
{ "_id" : NumberLong(16) },
{ "_id" :NumberLong(24) } ] },
sku:
{ "_id" : NumberLong(1015),
"items" : [ { "_id" : NumberLong(5) },
{ "_id" : NumberLong(7) },
{ "_id" : NumberLong(12) },
{ "_id" : NumberLong(2) },
{ "_id" :NumberLong(5) } ] }
Each Sku that comes back has both an item with id:7 and an item with id:2, along with any other items it has.
To further clarify, my purpose is to determine how many remaining combinations exist after entering the first couple of items.
Basically a customer will start specifying items, and we'll weed it down to the remaining valid combinations. So Sku.items[0].id=5 can only be combined with items[1].id=7 or items[1].id=10 …. Then items[1].id=7 can only be combined with items[2].id=20 … and so forth
The goal was to simplify my rules for purchase, and drive it all from the Sku codes. I don't know if I dug a deeper hole instead.
Thank you,
As for extracting the skus that contain item IDs 2 and 7: if I recall correctly, you have to use $elemMatch:
db.sku.find({'items' :{ '$all' :[{ '$elemMatch':{ '_id' : 2 }},{'$elemMatch': { '_id' : 7 }}]}} )
which selects every sku that contains both an item with _id 2 and an item with _id 7.
You can use aggregation pipelines
db.sku.aggregate([
{"$unwind": "$items"},
{"$group": {"_id": "$_id", "items": {"$addToSet":{"_id": "$items._id"}}}},
{"$match": {"items._id": {$all:[2,7]}}}
])

MongoDB - $maxscan option

C:\>mongo
C:\>C:\Programs\MongoDB\bin\mongo.exe
MongoDB shell version: 2.4.8
connecting to: test
Welcome to the MongoDB shell!
[test] 2014-02-26 17:09:35.933 >>> db.people.count();
9
[test] 2014-02-26 17:09:39.10 >>> db.people.find({})._addSpecial("$maxscan", 5);
{
"_id" : ObjectId("530e61be188483458f1edca7"),
"name" : "joe",
"random" : 0.7170755963306874
}
{
"_id" : ObjectId("530e61c2188483458f1edca8"),
"name" : "mark",
"random" : 0.6132313262205571
}
{
"_id" : ObjectId("530e61c7188483458f1edca9"),
"name" : "john",
"random" : 0.07292630313895643
}
{
"_id" : ObjectId("530e621c188483458f1edcaa"),
"name" : "allen",
"random" : 0.09901093109510839
}
{
"_id" : ObjectId("530e636f188483458f1edcab"),
"name" : "kevin",
"random" : 0.9719919066410512
}
{
"_id" : ObjectId("530e6375188483458f1edcac"),
"name" : "nicola",
"random" : 0.4626409418415278
}
{
"_id" : ObjectId("530e6428188483458f1edcad"),
"name" : "peter",
"random" : 0.8568310006521642
}
{
"_id" : ObjectId("530e642d188483458f1edcae"),
"name" : "tim",
"random" : 0.5209994465112686
}
{
"_id" : ObjectId("530e6437188483458f1edcaf"),
"name" : "joseph",
"random" : 0.6217151982709765
}
[test] 2014-02-26 17:09:51.76 >>>
I have 9 documents in this collection.
I am calling the find query above with
the option _addSpecial("$maxscan", 5).
And still, 9 documents are returned.
I was expecting 5 documents returned.
Why is it behaving this way?
I looked at the documentation here
http://docs.mongodb.org/manual/reference/operator/meta/maxScan/
but I don't think it provides any clues.
MongoDB is type-sensitive and case-sensitive, which means that you have to write the operator exactly as it is documented. You wrote $maxscan, but according to the documentation the operator is actually $maxScan.

MongoDB query to break ties and remove duplicates

I have documents which have a Version, URL, and DateAdded field (among others but these are the relevant ones).
I'd like to find all documents where the Version is "5.5" and the DateAdded is less than or equal to January 1, 2013. That's pretty straightforward, but I also want the following behavior:
If two or more documents have the same URL, only return the one with the most recent DateAdded (provided, again, that it is less than or equal to January 1, 2013). It would be great if all of this could be expressed in a single query (but my main concern is performance).
I've been doing this last bit of filtering in my client code (outside of MongoDB) but this ends up being inefficient, not to mention inelegant.
I've also tried using Mongo's MapReduce functionality to accomplish the same thing but this is extremely slow, as it appears to copy much of my collection to another collection.
Is there a performant solution?
This should do the trick.
Example data:
db.foo.insert({ "_id" : ObjectId("528bd5bded29286a62959513"), "Version" : "5.3", "URL" : "foo.bar.com/asdfwoaef", "DateAdded" : ISODate("2012-10-05T00:00:00Z") })
db.foo.insert({ "_id" : ObjectId("528bd5e8ed29286a62959514"), "Version" : "5.6", "URL" : "foo.bar.com/asdfwoaef", "DateAdded" : ISODate("2012-12-05T00:00:00Z") })
db.foo.insert({ "_id" : ObjectId("528bd621ed29286a62959515"), "Version" : "5.5", "URL" : "foo.bar.com/aafoobbb", "DateAdded" : ISODate("2012-11-04T00:00:00Z") })
db.foo.insert({ "_id" : ObjectId("528bd629ed29286a62959516"), "Version" : "5.5", "URL" : "foo.bar.com/aafoobbb", "DateAdded" : ISODate("2012-11-05T00:00:00Z") })
db.foo.insert({ "_id" : ObjectId("528bd642ed29286a62959517"), "Version" : "5.5", "URL" : "foo.bar.com/aafoobbb", "DateAdded" : ISODate("2013-01-02T00:00:00Z") })
db.foo.insert({ "_id" : ObjectId("528bd744ed29286a62959518"), "Version" : "5.5", "URL" : "foo.bar.com/ccbarcc", "DateAdded" : ISODate("2013-01-02T00:00:00Z") })
db.foo.insert({ "_id" : ObjectId("528bd780ed29286a62959519"), "Version" : "5.5", "URL" : "foo.bar.com/ccbarcc", "DateAdded" : ISODate("2012-04-05T00:00:00Z") })
Pipeline:
pipeline = [
{
"$match" : {
"Version" : "5.5",
"DateAdded" : {
"$lte" : ISODate("2013-01-01T00:00:00Z")
}
}
},
{
"$sort" : {
"URL" : 1,
"DateAdded" : -1
}
},
{
"$group" : {
"_id" : "$URL",
"doc" : {
"$first" : {
"id" : "$_id",
"DateAdded" : "$DateAdded"
}
}
}
}
]
db.foo.aggregate(pipeline)
And here is the result:
{
"result" : [
{
"_id" : "foo.bar.com/ccbarcc",
"doc" : {
"id" : ObjectId("528bd780ed29286a62959519"),
"DateAdded" : ISODate("2012-04-05T00:00:00Z")
}
},
{
"_id" : "foo.bar.com/aafoobbb",
"doc" : {
"id" : ObjectId("528bd629ed29286a62959516"),
"DateAdded" : ISODate("2012-11-05T00:00:00Z")
}
}
],
"ok" : 1
}