Find Rank Using Druid via druid-datasketches extension

I need to find the rank of a company, let's say McDonalds, in various scenarios in Druid. For this purpose I came across an extension called "druid-datasketches" which I think can be helpful.
After going through the documentation I was able to form this query, but it's incorrect.
{
"queryType": "topN",
"dataSource": "company",
"granularity": "all",
"intervals": [
"2021-01-01T00:00:00/2021-02-28T23:59:59"
],
"dimension": "corporation",
"threshold": 50,
"metric": "rank",
"aggregations": [
{
"type": "doubleSum",
"name": "cr_sales",
"fieldName": "cr_sales"
},
{
"type": "quantilesDoublesSketch",
"name": "skechers",
"fieldName": "cr_sales",
"k": 4096
}
],
"postAggregations": [
{
"type": "quantilesDoublesSketchToRank",
"name": "rank",
"field": {
"type": "fieldAccess",
"fieldName": "skechers"
},
"value": "McDonalds"
}
],
"context": {
"skipEmptyBuckets": true,
"minTopNThreshold": 100000,
"populateResultLevelCache": false
}
}
I need some guidance to form the correct query.
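For reference, the quantilesDoublesSketchToRank post-aggregator (per the druid-datasketches documentation) expects a numeric value from the sketched column's domain, not a dimension value such as "McDonalds"; it returns the approximate fraction of sketched cr_sales values that fall below that number. A minimal sketch of the post-aggregator on its own, with 1000000 as a made-up illustration value:
{
  "type": "quantilesDoublesSketchToRank",
  "name": "rank",
  "field": { "type": "fieldAccess", "fieldName": "skechers" },
  "value": 1000000
}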


How to group by single field and return more values together

I'm starting to use Apache Druid but am having some difficulty running native queries (and some SQL too).
1 - Is it possible to group by a single column while also returning more channels?
2 - How could I group by a single column, while returning different grouped items in the same query/row?
Query I'm trying to use:
{
"queryType": "groupBy",
"dataSource": "my-data-source",
"granularity": "all",
"intervals": ["2022-06-27T03:00:00.000Z/2022-06-28T03:00:00.000Z"],
"context:": { "timeout: 30000 },
"dimensions": ["userId"],
"filter": {
"type": "and",
"fields": [
{
"type": "or",
"fields": [{...}]
}
]
},
"aggregations": [
{
"type": "count",
"name": "count"
}
]
}
I tried to add a filtered type inside aggregations: [] but nothing changed.
"aggregations": [
{
"type: "count",
"name": "count"
},
{
"type": "filtered",
"filter": {
"type": "selector",
"dimension": "block_id",
"value": "block1"
},
"aggregator": {
"type": "count",
"name": "block1",
"fieldName": "block_id"
}
}
]
The grouping aggregator also didn't work.
"aggregations": [
{
"type": "count",
"name": "count"
},
{
"type": "grouping",
"name": "groupedData",
"groupings": ["block_id"]
}
],
Below is the image illustrating the results I'm trying to achieve.
I'm not sure yet how to get the results in the format you want, but as a start, something like this might be a step:
{
"queryType": "groupBy",
"dataSource": {
"type": "table",
"name": "dataTest"
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"filter": null,
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "d2_ts2",
"outputType": "STRING"
},
{
"type": "default",
"dimension": "d3_email",
"outputType": "STRING"
}
],
"aggregations": [
{
"type": "count",
"name": "myCount",
}
],
"descending": false
}
I'm curious, what is the use case?
Using a SQL query you can do it this way:
SELECT UserID,
sum(1) FILTER (WHERE BlockId = 'block1') as Block1,
sum(1) FILTER (WHERE BlockId = 'block2') as Block2,
sum(1) FILTER (WHERE BlockId = 'block3') as Block3
FROM inline_data
GROUP BY 1
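As an aside, the native translation shown next can be reproduced by prefixing the SQL with EXPLAIN PLAN FOR. A minimal sketch of the request body posted to Druid's /druid/v2/sql endpoint (the endpoint and resultFormat field come from the standard SQL API, not from the original answer, and the statement is abbreviated to a single FILTER clause):
{
  "query": "EXPLAIN PLAN FOR SELECT UserID, sum(1) FILTER (WHERE BlockId = 'block1') AS Block1 FROM inline_data GROUP BY 1",
  "resultFormat": "object"
}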
The Native Query for this (from the explain) is:
{
"queryType": "topN",
"dataSource": {
"type": "table",
"name": "inline_data"
},
"virtualColumns": [
{
"type": "expression",
"name": "v0",
"expression": "1",
"outputType": "LONG"
}
],
"dimension": {
"type": "default",
"dimension": "UserID",
"outputName": "d0",
"outputType": "STRING"
},
"metric": {
"type": "dimension",
"previousStop": null,
"ordering": {
"type": "lexicographic"
}
},
"threshold": 101,
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"filter": null,
"granularity": {
"type": "all"
},
"aggregations": [
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a0",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block1",
"extractionFn": null
},
"name": "a0"
},
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a1",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block2",
"extractionFn": null
},
"name": "a1"
},
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a2",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block3",
"extractionFn": null
},
"name": "a2"
}
],
"postAggregations": [],
"context": {
"populateCache": false,
"sqlOuterLimit": 101,
"sqlQueryId": "bb92e899-c127-49b0-be1b-d4b38909d166",
"useApproximateCountDistinct": false,
"useApproximateTopN": false,
"useCache": false,
"useNativeQueryExplain": true
},
"descending": false
}
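Since the question asked for a groupBy, the same filtered aggregators should drop into a groupBy query unchanged. A rough, unverified sketch that reuses the datasource, dimension, and interval from the question:
{
  "queryType": "groupBy",
  "dataSource": "my-data-source",
  "granularity": "all",
  "intervals": ["2022-06-27T03:00:00.000Z/2022-06-28T03:00:00.000Z"],
  "dimensions": ["userId"],
  "aggregations": [
    {
      "type": "filtered",
      "filter": { "type": "selector", "dimension": "block_id", "value": "block1" },
      "aggregator": { "type": "count", "name": "block1" }
    },
    {
      "type": "filtered",
      "filter": { "type": "selector", "dimension": "block_id", "value": "block2" },
      "aggregator": { "type": "count", "name": "block2" }
    }
  ]
}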

How to add "and" and "or" filters at the same time in a druid query?

I am trying to write a Druid query (JSON file) which uses "and" and "or" filters at the same time. I tried this:
{
"queryType": "select",
"dataSource": "timeseries_db",
"intervals": ["2020-09-08T17:00:00\/2020-09-08T17:30:00"],
"pagingSpec":{ "threshold":100},
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "orgid",
"value": "864e1875-ea80-455b-a835-f75bed0df2a7"
},
"filter": {
"type": "or",
"fields": [
{
"type": "selector",
"dimension": "host_name",
"value": "testname.82fae43ca"
},
{
"type": "selector",
"dimension": "host_name",
"value": "testname.09db12d4a"
}
]
}
]
},
"granularity": "all"
}
I want to get the records whose orgid is 864e1875-ea80-455b-a835-f75bed0df2a7 and whose host_name is testname.82fae43ca or testname.09db12d4a.
When I run this query, it throws an error saying that "type" is missing. But I have included "type". Any help is appreciated.
You don't need the extra "filter" inside the and filter. Try this:
{
"queryType": "select",
"dataSource": "timeseries_db",
"intervals": ["2020-09-08T17:00:00\/2020-09-08T17:30:00"],
"pagingSpec":{ "threshold":100},
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "orgid",
"value": "864e1875-ea80-455b-a835-f75bed0df2a7"
},
{
"type": "or",
"fields": [
{
"type": "selector",
"dimension": "host_name",
"value": "testname.82fae43ca"
},
{
"type": "selector",
"dimension": "host_name",
"value": "testname.09db12d4a"
}
]
}
]
},
"granularity": "all"
}
Another way to do this is to use an in filter instead of two selector filters inside an or:
{
"queryType": "select",
"dataSource": "timeseries_db",
"intervals": ["2020-09-08T17:00:00\/2020-09-08T17:30:00"],
"pagingSpec":{ "threshold":100},
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "orgid",
"value": "864e1875-ea80-455b-a835-f75bed0df2a7"
},
{
"type": "in",
"dimension": "host_name",
"values": ["testname.82fae43ca", "testname.09db12d4a"]
}
]
},
"granularity": "all"
}

Apache Druid GroupBy Virtual columns

I am trying to group by a virtual column in a Druid native query, which looks like this...
{
"queryType": "groupBy",
"dataSource": "trace_info",
"granularity": "none",
"virtualColumns": [
{
"type": "expression",
"name": "tenant",
"expression": "replace(array_offset(tags, array_offset_of(tagNames, 'tenant')), 'tenant:', '')"
},
{
"type": "expression",
"name": "rc",
"expression": "replace(array_offset(tags, array_offset_of(tagNames, 'row_count')), 'row_count:', '')"
}
],
"dimensions": [
"tenant"
],
"aggregations": [
{
"type": "longSum",
"name": "trc",
"fieldName": "rc"
}
],
...
...
...
"intervals": [
"..."
]
}
This gives out a single row with the longSum of all row_counts, as if the groupBy column were null.
Is my usage correct, or is this a known issue in Druid? The documentation says virtual columns can be used like normal dimensions, but it is not very clear on how, and a working example is missing.
Thanks!
Phani
Latest edit...
Some more digging revealed that the issue was the missing "outputType" attributes on the virtual columns. Strange, because the aggregator is able to auto-detect the type and calculate the long sum properly even though the groupBy results were wrong.
"virtualColumns": [
{
"type": "expression",
"name": "tenant",
"expression": "replace(array_offset(tags, array_offset_of(tagNames, 'tenant')), 'tenant:', '')",
"outputType": "STRING"
},
{
"type": "expression",
"name": "rc",
"expression": "replace(array_offset(tags, array_offset_of(tagNames, 'row_count')), 'row_count:', '')"
"outputType": "LONG"
}
],
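For illustration, a minimal sketch of the corrected virtual columns used directly as a groupBy dimension; the datasource and expressions are taken from the question, while the granularity and interval are placeholders:
{
  "queryType": "groupBy",
  "dataSource": "trace_info",
  "granularity": "all",
  "intervals": ["2020-01-01/2020-02-01"],
  "virtualColumns": [
    {
      "type": "expression",
      "name": "tenant",
      "expression": "replace(array_offset(tags, array_offset_of(tagNames, 'tenant')), 'tenant:', '')",
      "outputType": "STRING"
    },
    {
      "type": "expression",
      "name": "rc",
      "expression": "replace(array_offset(tags, array_offset_of(tagNames, 'row_count')), 'row_count:', '')",
      "outputType": "LONG"
    }
  ],
  "dimensions": [
    { "type": "default", "dimension": "tenant", "outputType": "STRING" }
  ],
  "aggregations": [
    { "type": "longSum", "name": "trc", "fieldName": "rc" }
  ]
}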
See above (below is likely a non-performant way of working around the problem).
After some trial and error I have a workaround for this using extraction dimensions. Although I'm not sure, I suspect that this is a temporary issue in Druid 0.18.1. Hopefully grouping on virtual columns will work as advertised in future builds.
{
"queryType": "groupBy",
"dataSource": "trace_info",
"granularity": "none",
"virtualColumns": [
{
"type": "expression",
"name": "tenant",
"expression": "replace(array_offset(tags, array_offset_of(tagNames, 'tenant')), 'tenant:', '')"
},
{
"type": "expression",
"name": "rc",
"expression": "replace(array_offset(tags, array_offset_of(tagNames, 'row_count')), 'row_count:', '')"
}
],
"dimensions": [
{
"type": "extraction",
"dimension": "tenant",
"outputName": "t",
"extractionFn": {
"type" : "substring", "index" : 1
}
}
],
"aggregations": [
{
"type": "longSum",
"name": "trc",
"fieldName": "rc"
}
],
...
...
...
"intervals": [
"..."
]
}

How to use stored procedure as input dataset in ADF (How to assign database it uses)

I want to run a stored procedure against a linked service (Azure SQL database) and output the result of that stored procedure to a dataset (Azure SQL database).
Is this possible?
I have currently ended up with this:
Pipeline: it should use a stored procedure that is found on a database defined as a linked service and copy the result over to the output dataset (an Azure SQL database).
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-09-01/Microsoft.DataFactory.Pipeline.json",
"name": "CopyGetViewsByDateRange",
"properties": {
"description": "<Enter the Pipeline description here>",
"activities": [
{
"name": "CopyActivityTemplate",
"type": "Copy",
"inputs": [
{
"name": "InputDataset"
}
],
"outputs": [
{
"name": "OutputDataset"
}
],
"typeProperties": {
"source": {
"type": "SqlSource",
"sqlReaderStoredProcedureName": "Analytics_GetViewsByDateRange2",
"storedProcedureParameters": {
"clientid": { "value": "12345", "type": "Int" },
"startdateid": { "value": "20170421", "type": "Int" },
"enddateid": { "value": "20170514", "type": "Int" }
}
},
"sink": {
"type": "SqlSink"
}
},
"policy": {
"concurrency": 1,
"executionPriorityOrder": "OldestFirst",
"retry": 3,
"timeout": "01:00:00"
},
"scheduler": {
"frequency": "Minute",
"interval": "15"
}
}
],
"start": "2017-05-15T00:00:00Z",
"end": "2017-05-17T00:00:00Z"
}
}
Input dataset (Note the comments):
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-09-01/Microsoft.DataFactory.Table.json",
"name": "InputDataset",
"properties": {
"type": "AzureSqlTable", // This surely needs to be a stored procedure type
"linkedServiceName": "AnalyticsAMECDevDB",
"structure": [
{
"name": "Client_Id",
"type": "Int64"
},
{
"name": "DimDate_Id",
"type": "Int64"
},
{
"name": "TotalContentViews",
"type": "Int64"
} // The structure represents what the stored procedure is outputting
],
"typeProperties": {
"tableName": "Analytics.FactPageViews" // This is obviously not right
},
"availability": {
"frequency": "Minute",
"interval": "15"
},
"external": true
}
}
My stored procedure looks like this:
SELECT
@clientid as Client_Id,
[DimDateId] as DimDate_Id,
count(1) as TotalContentViews
FROM
[Analytics].[FactPageViews] as pageviews
inner join Analytics.DimPages as pages
on pageviews.dimpageid = pages.id
where
DimDateId between @startdateid and @enddateid
group by
dimdateid
order by
dimdateid
EDIT (got something to work at least)
I am currently managing it by defining a query and running the command there:
"activities": [
{
"type": "Copy",
"typeProperties": {
"source": {
"type": "SqlSource",
"sqlReaderQuery": "$$Text.Format('EXEC [dbo].[GetViewsByDateRange] 2, 20170421, 20170514', WindowStart, WindowEnd)"
},
"sink": {
"type": "SqlSink",
"writeBatchSize": 0,
"writeBatchTimeout": "00:00:00"
},
"translator": {
"type": "TabularTranslator",
"columnMappings": "Client_Id:Client_Id,DimDate_Id:DimDate_Id,TotalContentViews:TotalContentViews"
}
},
"inputs": [
{
"name": "InputDataset-0af"
}
],
"outputs": [
{
"name": "OutputDataset-0af"
}
],
I think you've got everything right. To answer your question: simply, you don't need to have an input dataset defined in your pipeline/activity. So yes, it's certainly possible.
Just have the output dataset defined as the result of the stored proc.
Hope this helps.
I'm not sure whether this will help you solve your problem.
Change your input and output datasets as below.
Input dataset
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-09-01/Microsoft.DataFactory.Table.json",
"name": "ofcspdataset",
"properties": {
"type": "AzureSqlTable",
"linkedServiceName": "sproctestout",
"typeProperties": {
"tableName": "dbo.emp" ==> >>need to refer any table be in the source database.
},
"external": true,
"availability": {
"frequency": "Day",
"interval": 1
}
}
}
Output Dataset:
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-09-01/Microsoft.DataFactory.Table.json",
"name": "OfficeTestOuputTable",
"properties": {
"published": false,
"type": "AzureSqlTable",
"linkedServiceName": "sproctestout",
"structure": [
{ "name": "Id" },
{ "name": "GroupId" }
],
"typeProperties": {
"tableName": "dbo.testAdf_temp"
},
"availability": {
"frequency": "Day",
"interval": 1
}
}
}
And I'm sure your pipeline is good. Just try to change the input and output datasets.
For me it works.

Orion notification to Cygnus

I have followed the official guide about entity creation/update and subscription in Orion; they work and I get success responses. But Orion doesn't send notifications to Cygnus.
Am I missing some step here?
These are the basic scripts I am using:
create entity
{
"contextElements": [{
"type": "Room",
"isPattern": "false",
"id": "2FebRoom",
"attributes": [{
"name": "temperature",
"type": "float",
"value": "888"
}]
}],
"updateAction": "APPEND"
}
subscribe entity http://orion.lab.fiware.org:1026/v1/subscribeContext
{
"entities": [
{
"type": "Room",
"isPattern": "false",
"id": "2FebRoom"
}
],
"attributes": [
"temperature"
],
"reference": "http://A.B.C.D:5050/notify",
"duration": "P1M",
"notifyConditions": [
{
"type": "ONCHANGE",
"condValues": [
"temperature"
]
}
],
"throttling": "PT5S"
}
update entity
{
"contextElements": [
{
"type": "Room",
"isPattern": "false",
"id": "2FebRoom",
"attributes": [
{
"name": "temperature",
"type": "float",
"value": "111"
}
]
}
],
"updateAction": "UPDATE"
}
I can query the new value in Orion after the update operation, but Cygnus doesn't receive any notification. What could be the problem?
Many thanks
The problem was caused by a temporary outage in outgoing notifications from orion.lab.fiware.org. The problem has been solved.