I'm new to MongoDB. In this question I have two collections: one is selected_date, the other is global_mobility_report. I'm trying to find entries in global_mobility_report whose date is in selected_date, so I use $lookup to join the two collections.
date_selected:
{
"_id" : ObjectId("5f60d81ba43174cf172ebfdc"),
"date" : ISODate("2020-05-22T00:00:00.000+08:00")
},
{
"_id" : ObjectId("5f60d81ba43174cf172ebfdd"),
"date" : ISODate("2020-05-23T00:00:00.000+08:00")
},
{
"_id" : ObjectId("5f60d81ba43174cf172ebfde"),
"date" : ISODate("2020-05-24T00:00:00.000+08:00")
},
{
"_id" : ObjectId("5f60d81ba43174cf172ebfdf"),
"date" : ISODate("2020-05-25T00:00:00.000+08:00")
},
{
"_id" : ObjectId("5f60d81ba43174cf172ebfe0"),
"date" : ISODate("2020-05-26T00:00:00.000+08:00")
},
{
"_id" : ObjectId("5f60d81ba43174cf172ebfe1"),
"date" : ISODate("2020-05-27T00:00:00.000+08:00")
}
global_mobility_report:
{
"_id" : ObjectId("5f49fb013acddb5eec37f99e"),
"country_region_code" : "AE",
"country_region" : "United Arab Emirates",
"sub_region_1" : "",
"sub_region_2" : "",
"metro_area" : "",
"iso_3166_2_code" : "",
"census_fips_code" : "",
"date" : "2020-02-15",
"retail_and_recreation_percent_change_from_baseline" : "0",
"grocery_and_pharmacy_percent_change_from_baseline" : "4",
"parks_percent_change_from_baseline" : "5",
"transit_stations_percent_change_from_baseline" : "0",
"workplaces_percent_change_from_baseline" : "2",
"residential_percent_change_from_baseline" : "1"
},
{
"_id" : ObjectId("5f49fb013acddb5eec37f99f"),
"country_region_code" : "AE",
"country_region" : "United Arab Emirates",
"sub_region_1" : "",
"sub_region_2" : "",
"metro_area" : "",
"iso_3166_2_code" : "",
"census_fips_code" : "",
"date" : "2020-02-16",
"retail_and_recreation_percent_change_from_baseline" : "1",
"grocery_and_pharmacy_percent_change_from_baseline" : "4",
"parks_percent_change_from_baseline" : "4",
"transit_stations_percent_change_from_baseline" : "1",
"workplaces_percent_change_from_baseline" : "2",
"residential_percent_change_from_baseline" : "1"
},
{
"_id" : ObjectId("5f49fb013acddb5eec37f9a0"),
"country_region_code" : "AE",
"country_region" : "United Arab Emirates",
"sub_region_1" : "",
"sub_region_2" : "",
"metro_area" : "",
"iso_3166_2_code" : "",
"census_fips_code" : "",
"date" : "2020-02-17",
"retail_and_recreation_percent_change_from_baseline" : "-1",
"grocery_and_pharmacy_percent_change_from_baseline" : "1",
"parks_percent_change_from_baseline" : "5",
"transit_stations_percent_change_from_baseline" : "1",
"workplaces_percent_change_from_baseline" : "2",
"residential_percent_change_from_baseline" : "1"
},
{
"_id" : ObjectId("5f49fb013acddb5eec37f9a1"),
"country_region_code" : "AE",
"country_region" : "United Arab Emirates",
"sub_region_1" : "",
"sub_region_2" : "",
"metro_area" : "",
"iso_3166_2_code" : "",
"census_fips_code" : "",
"date" : "2020-02-18",
"retail_and_recreation_percent_change_from_baseline" : "-2",
"grocery_and_pharmacy_percent_change_from_baseline" : "1",
"parks_percent_change_from_baseline" : "5",
"transit_stations_percent_change_from_baseline" : "0",
"workplaces_percent_change_from_baseline" : "2",
"residential_percent_change_from_baseline" : "1"
}
When I try to find all entries in global_mobility_report whose 'date' matches one in selected_date (I have converted the string to a date in global_mobility_report), it returns an empty array.
db.global_mobility_report.aggregate([
    {$match: {country_region: "Indonesia"}},
    {$addFields: {"dateconverted": {$convert: {input: "$date", to: "date", onError: "onErrorExpr", onNull: "onNullExpr"}}}},
    {
        $lookup:
        {
            from: "selected_date",
            localField: "dateconverted",
            foreignField: "date",
            as: "selected_dates" // empty
        }
    }
])
The output is:
{
"_id" : ObjectId("5f49fd6a3acddb5eec4427bb"),
"country_region_code" : "ID",
"country_region" : "Indonesia",
"sub_region_1" : "",
"sub_region_2" : "",
"metro_area" : "",
"iso_3166_2_code" : "",
"census_fips_code" : "",
"date" : "2020-02-15",
"retail_and_recreation_percent_change_from_baseline" : "-2",
"grocery_and_pharmacy_percent_change_from_baseline" : "-2",
"parks_percent_change_from_baseline" : "-8",
"transit_stations_percent_change_from_baseline" : "1",
"workplaces_percent_change_from_baseline" : "5",
"residential_percent_change_from_baseline" : "1",
"dateconverted" : ISODate("2020-02-15T08:00:00.000+08:00"),
"selected_dates" : [ ]
},
{
"_id" : ObjectId("5f49fd6a3acddb5eec4427bc"),
"country_region_code" : "ID",
"country_region" : "Indonesia",
"sub_region_1" : "",
"sub_region_2" : "",
"metro_area" : "",
"iso_3166_2_code" : "",
"census_fips_code" : "",
"date" : "2020-02-16",
"retail_and_recreation_percent_change_from_baseline" : "-3",
"grocery_and_pharmacy_percent_change_from_baseline" : "-3",
"parks_percent_change_from_baseline" : "-7",
"transit_stations_percent_change_from_baseline" : "-4",
"workplaces_percent_change_from_baseline" : "2",
"residential_percent_change_from_baseline" : "2",
"dateconverted" : ISODate("2020-02-16T08:00:00.000+08:00"),
"selected_dates" : [ ]
}
The reason you are getting an empty array is that dateconverted does not match the date field.
The $lookup operator does an equality match between the localField and the foreignField, so let's walk through a basic example:
db.users.insertMany([
   { email: "test@example.com", userId: 0 },
   { email: "test2@example.com", userId: 1 },
   { email: "test3@example.com", userId: 2 },
   { email: "test3@example.com", userId: 3 }
]);
db.posts.insertMany([
{ by: 0, post: "hello world" },
{ by: 0 , post: "hello earthlings" },
{ by: 3, post: "test test test"}
]);
db.posts.aggregate([
{
$lookup: {
from: "users",
localField: "by",
foreignField: "userId",
as: "list_of_post"
}
}
]).toArray();
The output will be what it is supposed to be, because the localField matched the foreignField:
[
{
"_id" : ObjectId("5f60f6859a6df3133b325eb0"),
"by" : 0,
"post" : "hello world",
"list_of_post" : [
{
"_id" : ObjectId("5f60f6849a6df3133b325eac"),
"email" : "test#example.com",
"userId" : 0
}
]
},
{
"_id" : ObjectId("5f60f6859a6df3133b325eb1"),
"by" : 0,
"post" : "hello earthlings",
"list_of_post" : [
{
"_id" : ObjectId("5f60f6849a6df3133b325eac"),
"email" : "test#example.com",
"userId" : 0
}
]
},
{
"_id" : ObjectId("5f60f6859a6df3133b325eb2"),
"by" : 3,
"post" : "test test test",
"list_of_post" : [
{
"_id" : ObjectId("5f60f6849a6df3133b325eaf"),
"email" : "test3#example.com",
"userId" : 3
}
]
}
]
Let's mimic a situation where the fields do not match:
db.posts.drop();
db.posts.insertMany([
{ by: 20, post: "hello world" },
{ by: 23 , post: "hello earthlings" },
{ by: 50, post: "test test test"}
]);
Running the same aggregation again, we get empty arrays:
[
{
"_id" : ObjectId("5f60f83344304796ae700b4d"),
"by" : 20,
"post" : "hello world",
"list_of_post" : [ ]
},
{
"_id" : ObjectId("5f60f83344304796ae700b4e"),
"by" : 23,
"post" : "hello earthlings",
"list_of_post" : [ ]
},
{
"_id" : ObjectId("5f60f83344304796ae700b4f"),
"by" : 50,
"post" : "test test test",
"list_of_post" : [ ]
}
]
So, back to your question: the reason for the empty array is that the dateconverted field does not match the date field. Let's take a look at an example.
In the first document, dateconverted is ISODate("2020-02-16T08:00:00.000+08:00"), and checking the date_selected documents, there is no date field corresponding to this value. But let's manually insert it, so you can see exactly what I mean:
db.date_selected.insert({
"_id" : ObjectId(),
"date": ISODate("2020-02-16T08:00:00.000+08:00")
});
Running the aggregation pipeline will still make selected_dates an empty array. The other thing to note is that the mm/dd/yyyy part of the ISODate object does not match any document in your question either. Secondly, you have to devise another means of running the comparison, because the conversion in the $addFields stage is affected by timezones and other issues as well.
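One way around this (a minimal sketch, assuming MongoDB 3.6+ for pipeline-style $lookup, and that the selected_date entries are midnights in the +08:00 timezone) is to compare the calendar day on both sides instead of the exact instant:
db.global_mobility_report.aggregate([
    { $match: { country_region: "Indonesia" } },
    // convert the "YYYY-MM-DD" string to a real date; null if it cannot be parsed
    { $addFields: { dateconverted: { $convert: { input: "$date", to: "date", onError: null, onNull: null } } } },
    {
        $lookup: {
            from: "selected_date",
            // reduce both sides to a "YYYY-MM-DD" string so time-of-day and
            // timezone offsets cannot break the equality
            let: { day: { $dateToString: { format: "%Y-%m-%d", date: "$dateconverted" } } },
            pipeline: [
                { $match: { $expr: { $eq: [
                    "$$day",
                    { $dateToString: { format: "%Y-%m-%d", date: "$date", timezone: "+08:00" } }
                ] } } }
            ],
            as: "selected_dates"
        }
    }
])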
I can't quite work out how to use the pipeline to filter the resulting array of my $lookup. Here's the document:
{
"_id" : ObjectId("5d73d591b35c943a201837e2"),
"itemName" : "Vape",
"itemSellingPrice" : "350",
"itemPurchasePrice" : "300",
"itemAveragePurchasePrice" : "133.33333333333334",
"itemBaseUnit" : "Unit 2",
"itemReorderPoint" : "100",
"itemTotalQuantity" : "300",
"itemSumQuantity" : "500",
"itemLocation" : "1",
"itemSubLocation" : "sub loc 1",
"itemDateCreated" : ISODate("2019-09-07T16:06:41.521Z"),
"itemID" : 88,
"__v" : 0,
"salesData" : [
{
"_id" : ObjectId("5d73e23ed8422d2ba42049b4"),
"salesOrderCustomerName" : "Manong Puring",
"salesOrderInvoiceNumber" : "1123",
"salesOrderAddress" : "Jan lang",
"salesOrderPaymentStatus" : "Open",
"salesOrderTotalPaid" : "0",
"salesOrderTotalAmount" : "2800",
"salesOrderDiscrepancyAmount" : "2800",
"salesOrderItemList" : [
{
"_id" : ObjectId("5d73e23ed8422d2ba42049b5"),
"salesOrderSelectedItem" : "Vape",
"salesOrderAverage" : "133.33333333333334",
"salesOrderNewPrice" : "350",
"salesOrderPurchasePrice" : "300",
"salesOrderQuantity" : "8",
"salesOrderSubTotal" : "2800"
}
],
"salesOrderDateCreated" : ISODate("2019-09-07T16:00:00.000Z"),
"salesOrderSubLocation" : "sub loc 1",
"salesOrderLocation" : "1",
"salesOrderID" : 62,
"__v" : 0
},
{
"_id" : ObjectId("5d73e37164ade31b40775038"),
"salesOrderCustomerName" : "Manong Puring",
"salesOrderInvoiceNumber" : "123",
"salesOrderAddress" : "Jan lang",
"salesOrderPaymentStatus" : "Open",
"salesOrderTotalPaid" : "0",
"salesOrderTotalAmount" : "350",
"salesOrderDiscrepancyAmount" : "350",
"salesOrderItemList" : [
{
"_id" : ObjectId("5d73e37164ade31b40775039"),
"salesOrderSelectedItem" : "Vape",
"salesOrderAverage" : "133.33333333333334",
"salesOrderNewPrice" : "350",
"salesOrderPurchasePrice" : "300",
"salesOrderQuantity" : "1",
"salesOrderSubTotal" : "350"
}
],
"salesOrderDateCreated" : ISODate("2019-09-07T16:00:00.000Z"),
"salesOrderSubLocation" : "sub loc 2",
"salesOrderLocation" : "1",
"salesOrderID" : 63,
"__v" : 0
}
]
}
I only want to get salesData entries with salesOrderSubLocation: "sub loc 1", but it is showing data with "sub loc 2" as well. I searched for a while but can't find a problem that exactly matches mine.
Here's my query:
db.getCollection('itemmodels').aggregate([
    { '$match': { itemName: 'Vape' } },
    { '$lookup':
        {
            from: 'itemmodels',
            let: { "itemName": "$itemName" },
            pipeline: [
                { $match: {
                    "salesOrderSubLocation": "sub loc 1",
                    "salesOrderItemList.salesOrderSelectedItem": "$$itemName"
                  }
                }
            ],
            as: 'salesData'
        }
    }
])
Any ideas? I don't want to filter the result on the front end because it may cause problems once there are tons of data in the future.
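One thing stands out (a hedged sketch, not a confirmed fix): inside a $lookup pipeline, a "$$variable" is only evaluated inside $expr; a plain match like "salesOrderItemList.salesOrderSelectedItem": "$$itemName" compares against the literal string "$$itemName". Something along these lines should keep only the "sub loc 1" sales data:
db.getCollection('itemmodels').aggregate([
    { $match: { itemName: 'Vape' } },
    {
        $lookup: {
            from: 'itemmodels',   // same collection, as in the question
            let: { itemName: '$itemName' },
            pipeline: [
                { $match: {
                    // plain equality on the sub-location
                    salesOrderSubLocation: 'sub loc 1',
                    // $expr is required for $$itemName to be evaluated;
                    // $in checks the name against the array of selected items
                    $expr: { $in: [ '$$itemName', '$salesOrderItemList.salesOrderSelectedItem' ] }
                } }
            ],
            as: 'salesData'
        }
    }
])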
I am a newbie with Druid, trying to load very simple JSON data into it. The data contains just one dimension, one metric, and a timestamp. I have successfully loaded data into Druid for a different dataset, but somehow I am getting errors for this one.
This is my index file:
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "datatemplate",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"Loc"
]
},
"timestampSpec" : {
"format" : "auto",
"column" : "Timestamp"
}
}
},
"metricsSpec" : [{"name" : "Qty","type" : "doubleSum","fieldName" : "Qty"}],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
"queryGranularity" : "none",
"intervals" : ["2016-01-01T00:00:00Z/2030-06-30T00:00:00Z"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "datatemplate/",
"filter" : "datatemplate.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 10000000,
"maxRowsInMemory" : 40000,
"forceExtendableShardSpecs" : true
}
}
}
Also here is my dataset in JSON format:
{"Loc": "A", "Qty": "1", "Timestamp": "2017-12-01T00:00:00Z"}
{"Loc": "A", "Qty": "1", "Timestamp": "2017-12-01T00:00:00Z"}
{"Loc": "B", "Qty": "2", "Timestamp": "2017-12-01T00:00:00Z"}
{"Loc": "B", "Qty": "1", "Timestamp": "2017-12-01T00:00:00Z"}
By following the tutorial at http://druid.io/docs/latest/tutorials/tutorial-loading-streaming-data.html, I was able to insert data into Druid via the Kafka console.
The spec file, examples/indexing/wikipedia.spec, looks as follows:
[
{
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
}
}
},
"metricsSpec" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE"
}
},
"ioConfig" : {
"type" : "realtime",
"firehose": {
"type": "kafka-0.8",
"consumerProps": {
"zookeeper.connect": "localhost:2181",
"zookeeper.connection.timeout.ms" : "15000",
"zookeeper.session.timeout.ms" : "15000",
"zookeeper.sync.time.ms" : "5000",
"group.id": "druid-example",
"fetch.message.max.bytes" : "1048586",
"auto.offset.reset": "largest",
"auto.commit.enable": "false"
},
"feed": "wikipedia"
},
"plumber": {
"type": "realtime"
}
},
"tuningConfig": {
"type" : "realtime",
"maxRowsInMemory": 500000,
"intermediatePersistPeriod": "PT10m",
"windowPeriod": "PT10m",
"basePersistDirectory": "\/tmp\/realtime\/basePersist",
"rejectionPolicy": {
"type": "messageTime"
}
}
}
]
I start the realtime node via
java -Xmx512m -Duser.timezone=UTC -Dfile.encoding=UTF-8 -Ddruid.realtime.specFile=examples/indexing/wikipedia.spec -classpath config/_common:config/realtime:lib/* io.druid.cli.Main server realtime
In the Kafka console, I paste and enter the following:
{"timestamp": "2013-08-10T01:02:33Z", "page": "Good Bye", "language" : "en", "user" : "catty", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
Then I perform a query by creating select.json and running curl -X POST 'http://localhost:8084/druid/v2/?pretty' -H 'content-type: application/json' -d @select.json
select.json:
{
"queryType": "select",
"dataSource": "wikipedia",
"dimensions":[],
"metrics":[],
"granularity": "all",
"intervals": [
"2000-01-01/2020-01-02"
],
"filter" : {"type":"and",
"fields" : [
{ "type": "selector", "dimension": "user", "value": "catty" }
]
},
"pagingSpec":{"pagingIdentifiers": {}, "threshold":500}
}
I was able to get the following result.
[ {
"timestamp" : "2013-08-10T01:02:33.000Z",
"result" : {
"pagingIdentifiers" : {
"wikipedia_2013-08-10T00:00:00.000Z_2013-08-11T00:00:00.000Z_2013-08-10T00:00:00.000Z" : 0
},
"events" : [ {
"segmentId" : "wikipedia_2013-08-10T00:00:00.000Z_2013-08-11T00:00:00.000Z_2013-08-10T00:00:00.000Z",
"offset" : 0,
"event" : {
"timestamp" : "2013-08-10T01:02:33.000Z",
"continent" : "North America",
"robot" : "false",
"country" : "United States",
"city" : "San Francisco",
"newPage" : "true",
"unpatrolled" : "true",
"namespace" : "article",
"anonymous" : "false",
"language" : "en",
"page" : "Good Bye",
"region" : "Bay Area",
"user" : "catty",
"deleted" : 200.0,
"added" : 57.0,
"count" : 1,
"delta" : -143.0
}
} ]
}
} ]
It seems that I had set up Druid correctly.
Now, I would like to insert data via an HTTP endpoint. According to How realtime data input to Druid?, it seems the recommended way is to use Tranquility.
I start the indexing service via
java -Xmx2g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/overlord:lib/*: io.druid.cli.Main server overlord
conf/server.json looks like
{
"dataSources" : [
{
"spec" : {
"dataSchema" : {
"dataSource" : "wikipedia",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"timestampSpec" : {
"column" : "timestamp",
"format" : "auto"
},
"dimensionsSpec" : {
"dimensions": ["page","language","user","unpatrolled","newPage","robot","anonymous","namespace","continent","country","region","city"],
"dimensionExclusions" : [],
"spatialDimensions" : []
}
}
},
"metricsSpec" : [{
"type" : "count",
"name" : "count"
}, {
"type" : "doubleSum",
"name" : "added",
"fieldName" : "added"
}, {
"type" : "doubleSum",
"name" : "deleted",
"fieldName" : "deleted"
}, {
"type" : "doubleSum",
"name" : "delta",
"fieldName" : "delta"
}],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "DAY",
"queryGranularity" : "NONE"
}
},
"tuningConfig" : {
"windowPeriod" : "PT10M",
"type" : "realtime",
"intermediatePersistPeriod" : "PT10M",
"maxRowsInMemory" : "100000"
}
},
"properties" : {
"task.partitions" : "1",
"task.replicants" : "1"
}
}
],
"properties" : {
"zookeeper.connect" : "localhost",
"http.port" : "8200",
"http.threads" : "8"
}
}
Then, I start the server using
bin/tranquility server -configFile conf/server.json
I POST to http://xx.xxx.xxx.xxx:8200/v1/post/wikipedia with Content-Type: application/json and the following body:
{"timestamp": "2013-08-10T01:02:33Z", "page": "Selamat Pagi", "language" : "en", "user" : "catty", "unpatrolled" : "true", "newPage" : "true", "robot": "false", "anonymous": "false", "namespace":"article", "continent":"North America", "country":"United States", "region":"Bay Area", "city":"San Francisco", "added": 57, "deleted": 200, "delta": -143}
I get the following response:
{"result":{"received":1,"sent":0}}
It seems that Tranquility has received our data but failed to send it to Druid!
I try running curl -X POST 'http://localhost:8084/druid/v2/?pretty' -H 'content-type: application/json' -d @select.json again, but I don't get back the data I inserted via Tranquility.
Any idea why? Thanks.
This generally happens when the data you send is outside the window period. If you are inserting data manually, give the exact current timestamp in UTC, down to the millisecond. If you are using a script to generate the data, this is easy to do; just make sure it is the current UTC time.
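For example (a sketch; the event below is trimmed to a few of the fields from the question), regenerating the timestamp on each send keeps it inside the window:
TS=$(date -u +"%Y-%m-%dT%H:%M:%SZ")   # current UTC time, ISO-8601
curl -X POST -H 'Content-Type: application/json' \
     -d "{\"timestamp\": \"$TS\", \"page\": \"Selamat Pagi\", \"language\": \"en\", \"user\": \"catty\", \"added\": 57, \"deleted\": 200, \"delta\": -143}" \
     http://localhost:8200/v1/post/wikipedia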
It is extremely difficult to set up Druid to work properly with real-time data insertion.
The best bet I found is to use https://github.com/implydata . Imply is a set of wrappers around Druid that make it easier to use.
However, real-time insertion in Imply is not perfect either. I experienced an OutOfMemoryException after inserting 30 million items via real-time, which caused data loss on the previously inserted 30 million rows.
The details regarding the data loss can be found here: https://groups.google.com/forum/#!topic/imply-user-group/95xpYojxiOg
An issue ticket has been filed : https://github.com/implydata/distribution/issues/8
Druid's streaming windowPeriod is very short (10 minutes); outside this period, your event will be ignored.
Since you got {"result":{"received":1,"sent":0}}, your worker threads are working fine. Tranquility decides what data is sent to Druid based on the timestamp associated with the data.
This period is set by the "windowPeriod" configuration. So if your type is realtime ("type":"realtime") and the window period is PT10M ("windowPeriod" : "PT10M"), Tranquility will send any data timestamped between t-10 and t+10 minutes and will not send anything outside that period.
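If your test events must carry older timestamps, one option (a sketch; PT1H is an arbitrary choice) is to widen the window in the tuningConfig of conf/server.json:
"tuningConfig" : {
    "windowPeriod" : "PT1H",
    "type" : "realtime",
    "intermediatePersistPeriod" : "PT10M",
    "maxRowsInMemory" : "100000"
}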
I disagree about the insertion efficiency problems: we have been sending 3 million rows every 15 minutes since June 2016, and it has been running beautifully. Of course, we have a stronger infrastructure suited to that scale.
Another reason for events not being inserted is the coordinator/overlord running out of memory.
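In that case, bumping the heap on the overlord command from the question is a quick check (a sketch; 4g is an arbitrary increase over the original 2g):
java -Xmx4g -Duser.timezone=UTC -Dfile.encoding=UTF-8 -classpath config/_common:config/overlord:lib/*: io.druid.cli.Main server overlord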
I'm trying to do a query in MongoDB but I can't get it to work.
My document looks something like this:
{
"_id" : ObjectId("5305e54133e65b7341d63af3"),
"clients" : [
{
"aggregations" : {
"department" : [
"department1",
"department3"
],
"customer" : "customer2"
},
"lastLogin" : ISODate("2014-02-26T09:41:56.445Z"),
"locale" : "en"
"name" : "Test",
"validFrom" : null,
"validTo" : null,
"visiting" : {
"phone" : "031-303030",
"company" : "MyCompany",
"office" : [
"jag är ett test",
"lite mer data"
],
"country" : "Norge"
}
},
{
"approvedEmailSent" : true,
"lastLogin" : ISODate("2014-03-01T15:27:12.252Z"),
"locale" : "en",
"name" : "Test2",
"visiting" : {
"phone" : "031-307450",
"company" : "Other Company",
"branch" : "Advertising agency"
}
}
],
"firstname" : "Greger",
"lastname" : "Aronsson",
"username" : "TheUsername"
}
As you can see, a user can have many clients, matched by name. The clients have visiting.company, but sometimes this will not be the case.
I want to query where clients.name is Test, use a regexp for visiting.company, and also match firstname and lastname. If I'm logged in as Test2, I don't want hits on visiting.company "MyCompany". Hope this makes sense!
You can write a query like:
db.visitCompany2.find({ $or : [
    {'clients.name': 'Test2'},        // client name
    {'clients.visiting.company': {
        $regex: /Other/g              // your visiting-company regex
    }},
    {firstname: "Greger"},
    {lastname: "Aronsson"}
]}, {
    'clients.$': 1,                   // projection for clients
    firstname: 1,
    lastname: 1
});
Output:
{
"_id": ObjectId("5305e54133e65b7341d63af3"),
"clients": [{
"approvedEmailSent": true,
"lastLogin": ISODate("2014-03-01T15:27:12.252Z"),
"locale": "en",
"name": "Test2",
"visiting": {
"phone": "031-307450",
"company": "Other Company",
"branch": "Advertising agency"
}
}],
"firstname": "Greger",
"lastname": "Aronsson"
}
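Note that the 'clients.$' projection returns only the first matching array element. An alternative sketch (same result on this document, but with the array condition stated explicitly) uses an $elemMatch projection:
db.visitCompany2.find(
    { firstname: "Greger", lastname: "Aronsson" },
    {
        firstname: 1,
        lastname: 1,
        // keep only the first client whose name and company both match
        clients: { $elemMatch: { name: "Test2", "visiting.company": { $regex: /Other/ } } }
    }
);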