How to group by a single field and return more values together - group-by

I'm starting to use Apache Druid but am having some difficulty running native queries (and some SQL ones too).
1- Is it possible to group by a single column while also returning more channels?
2- How could I group by a single column while returning different grouped items on the same query/row?
The query I'm trying to use:
{
"queryType": "groupBy",
"dataSource": "my-data-source",
"granularity": "all",
"intervals": ["2022-06-27T03:00:00.000Z/2022-06-28T03:00:00.000Z"],
"context:": { "timeout: 30000 },
"dimensions": ["userId"],
"filter": {
"type": "and",
"fields": [
{
"type": "or",
"fields": [{...}]
}
]
},
"aggregations": [
{
"type": "count",
"name": "count"
}
]
}
I tried adding a filtered aggregator inside aggregations: [], but the results did not change.
"aggregations": [
{
"type: "count",
"name": "count"
},
{
"type": "filtered",
"filter": {
"type": "selector",
"dimension": "block_id",
"value": "block1"
},
"aggregator": {
"type": "count",
"name": "block1",
"fieldName": "block_id"
}
}
]
The grouping aggregator also didn't work.
"aggregations": [
{
"type": "count",
"name": "count"
},
{
"type": "grouping",
"name": "groupedData",
"groupings": ["block_id"]
}
],
Below is the image illustrating the results I'm trying to achieve.

Not sure yet how to get the results in the format you want, but as a start, something like this might be a step:
{
"queryType": "groupBy",
"dataSource": {
"type": "table",
"name": "dataTest"
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"filter": null,
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "d2_ts2",
"outputType": "STRING"
},
{
"type": "default",
"dimension": "d3_email",
"outputType": "STRING"
}
],
"aggregations": [
{
"type": "count",
"name": "myCount",
}
],
"descending": false
}

I'm curious, what is the use case?
Using a SQL query you can do it this way:
SELECT UserID,
sum(1) FILTER (WHERE BlockId = 'block1') as Block1,
sum(1) FILTER (WHERE BlockId = 'block2') as Block2,
sum(1) FILTER (WHERE BlockId = 'block3') as Block3
FROM inline_data
GROUP BY 1
The native query for this (from the explain) is:
{
"queryType": "topN",
"dataSource": {
"type": "table",
"name": "inline_data"
},
"virtualColumns": [
{
"type": "expression",
"name": "v0",
"expression": "1",
"outputType": "LONG"
}
],
"dimension": {
"type": "default",
"dimension": "UserID",
"outputName": "d0",
"outputType": "STRING"
},
"metric": {
"type": "dimension",
"previousStop": null,
"ordering": {
"type": "lexicographic"
}
},
"threshold": 101,
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"filter": null,
"granularity": {
"type": "all"
},
"aggregations": [
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a0",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block1",
"extractionFn": null
},
"name": "a0"
},
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a1",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block2",
"extractionFn": null
},
"name": "a1"
},
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a2",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block3",
"extractionFn": null
},
"name": "a2"
}
],
"postAggregations": [],
"context": {
"populateCache": false,
"sqlOuterLimit": 101,
"sqlQueryId": "bb92e899-c127-49b0-be1b-d4b38909d166",
"useApproximateCountDistinct": false,
"useApproximateTopN": false,
"useCache": false,
"useNativeQueryExplain": true
},
"descending": false
}
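Not from the explain output, just a hand-written sketch: if you don't want the topN threshold that the SQL plan produces, the same per-block filtered counts can be written as a native groupBy. Assuming the same inline_data datasource and the UserID / BlockId columns from the SQL above, something like this should be equivalent:
{
  "queryType": "groupBy",
  "dataSource": "inline_data",
  "granularity": "all",
  "intervals": ["-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"],
  "dimensions": ["UserID"],
  "aggregations": [
    { "type": "count", "name": "total" },
    {
      "type": "filtered",
      "filter": { "type": "selector", "dimension": "BlockId", "value": "block1" },
      "aggregator": { "type": "count", "name": "block1" }
    },
    {
      "type": "filtered",
      "filter": { "type": "selector", "dimension": "BlockId", "value": "block2" },
      "aggregator": { "type": "count", "name": "block2" }
    },
    {
      "type": "filtered",
      "filter": { "type": "selector", "dimension": "BlockId", "value": "block3" },
      "aggregator": { "type": "count", "name": "block3" }
    }
  ]
}
Each result row should then carry one count column per block for a single UserID, which is the per-user, per-block layout asked about in the question.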

Related

org.apache.avro.AvroTypeException: Unknown union branch EventId

I am trying to convert JSON to Avro using 'kafka-avro-console-producer' and publish it to a Kafka topic.
I am able to do that for flat JSON/schemas, but for the schema and JSON given below I am getting the
"org.apache.avro.AvroTypeException: Unknown union branch EventId" error.
Any help would be appreciated.
Schema :
{
"type": "record",
"name": "Envelope",
"namespace": "CoreOLTPEvents.dbo.Event",
"fields": [{
"name": "before",
"type": ["null", {
"type": "record",
"name": "Value",
"fields": [{
"name": "EventId",
"type": "long"
}, {
"name": "CameraId",
"type": ["null", "long"],
"default": null
}, {
"name": "SiteId",
"type": ["null", "long"],
"default": null
}],
"connect.name": "CoreOLTPEvents.dbo.Event.Value"
}],
"default": null
}, {
"name": "after",
"type": ["null", "Value"],
"default": null
}, {
"name": "op",
"type": "string"
}, {
"name": "ts_ms",
"type": ["null", "long"],
"default": null
}],
"connect.name": "CoreOLTPEvents.dbo.Event.Envelope"
}
And the JSON input is like below:
{
"before": null,
"after": {
"EventId": 12,
"CameraId": 10,
"SiteId": 11974
},
"op": "C",
"ts_ms": null
}
And in my case I can't alter the schema; I can only alter the JSON in such a way that it works.
If you are using the Avro JSON format, the input you have is slightly off. For unions, non-null values need to be specified such that the type information is listed: https://avro.apache.org/docs/current/spec.html#json_encoding
See below for an example which I think should work.
{
"before": null,
"after": {
"CoreOLTPEvents.dbo.Event.Value": {
"EventId": 12,
"CameraId": {
"long": 10
},
"SiteId": {
"long": 11974
}
}
},
"op": "C",
"ts_ms": null
}
Removing "connect.name": "CoreOLTPEvents.dbo.Event.Value" and "connect.name": "CoreOLTPEvents.dbo.Event.Envelope" as The RecordType can only contains {'namespace', 'aliases', 'fields', 'name', 'type', 'doc'} keys.
Could you try with below schema and see if you are able to produce the msg?
{
"type": "record",
"name": "Envelope",
"namespace": "CoreOLTPEvents.dbo.Event",
"fields": [
{
"name": "before",
"type": [
"null",
{
"type": "record",
"name": "Value",
"fields": [
{
"name": "EventId",
"type": "long"
},
{
"name": "CameraId",
"type": [
"null",
"long"
],
"default": "null"
},
{
"name": "SiteId",
"type": [
"null",
"long"
],
"default": "null"
}
]
}
],
"default": null
},
{
"name": "after",
"type": [
"null",
"Value"
],
"default": null
},
{
"name": "op",
"type": "string"
},
{
"name": "ts_ms",
"type": [
"null",
"long"
],
"default": null
}
]
}

How to add "and" and "or" filters at the same time in a druid query?

I am trying to write a Druid query (JSON file) which uses "and" and "or" filters at the same time. I tried this:
{
"queryType": "select",
"dataSource": "timeseries_db",
"intervals": ["2020-09-08T17:00:00\/2020-09-08T17:30:00"],
"pagingSpec":{ "threshold":100},
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "orgid",
"value": "864e1875-ea80-455b-a835-f75bed0df2a7"
},
"filter": {
"type": "or",
"fields": [
{
"type": "selector",
"dimension": "host_name",
"value": "testname.82fae43ca"
},
{
"type": "selector",
"dimension": "host_name",
"value": "testname.09db12d4a"
}
]
}
]
},
"granularity": "all"
}
I want to get the records whose orgid is 864e1875-ea80-455b-a835-f75bed0df2a7 and whose host_name is testname.82fae43ca or testname.09db12d4a.
When I run this query, it throws an error saying that "type" is missing. But I have included "type". Any help is appreciated.
You don't need the extra "filter" key inside the and filter. Try this:
{
"queryType": "select",
"dataSource": "timeseries_db",
"intervals": ["2020-09-08T17:00:00\/2020-09-08T17:30:00"],
"pagingSpec":{ "threshold":100},
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "orgid",
"value": "864e1875-ea80-455b-a835-f75bed0df2a7"
},
{
"type": "or",
"fields": [
{
"type": "selector",
"dimension": "host_name",
"value": "testname.82fae43ca"
},
{
"type": "selector",
"dimension": "host_name",
"value": "testname.09db12d4a"
}
]
}
]
},
"granularity": "all"
}
Another way to do this is to use an in filter instead of two selector filters inside an or:
{
"queryType": "select",
"dataSource": "timeseries_db",
"intervals": ["2020-09-08T17:00:00\/2020-09-08T17:30:00"],
"pagingSpec":{ "threshold":100},
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "orgid",
"value": "864e1875-ea80-455b-a835-f75bed0df2a7"
},
{
"type": "in",
"dimension": "host_name",
"values": ["testname.82fae43ca", "testname.09db12d4a"]
}
]
},
"granularity": "all"
}

How to add multiple Druid filters

For example I have the following Druid query:
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "city",
"value": "landon"
},
{
"type": "selector",
"dimension": "name",
"value": "Tom"
}
]
}
Now I'd like to add a NOT field; where should I insert it?
The equivalent SQL would be:
select * from User where city='landon' and name='Tom' and title<>'teacher'.
I got the answer, as follows:
"filter": {
"type": "and",
"fields": [
{
"type": "selector",
"dimension": "city",
"value": "landon"
},
{
"type": "selector",
"dimension": "name",
"value": "Tom"
},
{
"field": {
"type": "selector",
"dimension": "title",
"value": "teacher"
},
"type": "not"
}
]
}

Azure Data Factory start pipeline different from starting job

This issue is driving me crazy. I am running Azure Data Factory V1 and need to schedule a copy job every week from 01/03/2009 through 01/31/2009, so I defined this schedule on the pipeline:
"start": "2009-01-03T00:00:00Z",
"end": "2009-01-31T00:00:00Z",
"isPaused": false,
Monitoring the pipeline, Data Factory scheduled these dates:
12/29/2008
01/05/2009
01/12/2009
01/19/2009
01/26/2009
instead of the desired schedule:
01/03/2009
01/10/2009
01/17/2009
01/24/2009
01/31/2009
Why doesn't the start date defined on the pipeline correspond to the scheduled dates in the monitor?
Many thanks!
Here is the JSON Pipeline:
{
"name": "CopyPipeline-blob2datalake",
"properties": {
"description": "copy from blob storage to datalake directory structure",
"activities": [
{
"type": "DataLakeAnalyticsU-SQL",
"typeProperties": {
"scriptPath": "script/dat230.usql",
"scriptLinkedService": "AzureStorageLinkedService",
"degreeOfParallelism": 5,
"priority": 100,
"parameters": {
"salesfile": "$$Text.Format('/DAT230/{0:yyyy}/{0:MM}/{0:dd}.txt', Date.StartOfDay (SliceStart))",
"lineitemsfile": "$$Text.Format('/dat230/dataloads/{0:yyyy}/{0:MM}/{0:dd}/factinventory/fact.csv', Date.StartOfDay (SliceStart))"
}
},
"inputs": [
{
"name": "InputDataset-dat230"
}
],
"outputs": [
{
"name": "OutputDataset-dat230"
}
],
"policy": {
"timeout": "01:00:00",
"concurrency": 1,
"retry": 1
},
"scheduler": {
"frequency": "Day",
"interval": 7
},
"name": "DataLakeAnalyticsUSqlActivityTemplate",
"linkedServiceName": "AzureDataLakeAnalyticsLinkedService"
}
],
"start": "2009-01-03T00:00:00Z",
"end": "2009-01-11T00:00:00Z",
"isPaused": false,
"hubName": "edxlearningdf_hub",
"pipelineMode": "Scheduled"
}
}
and here the datasets:
{
"name": "InputDataset-dat230",
"properties": {
"structure": [
{
"name": "Date",
"type": "Datetime"
},
{
"name": "StoreID",
"type": "Int64"
},
{
"name": "StoreName",
"type": "String"
},
{
"name": "ProductID",
"type": "Int64"
},
{
"name": "ProductName",
"type": "String"
},
{
"name": "Color",
"type": "String"
},
{
"name": "Size",
"type": "String"
},
{
"name": "Manufacturer",
"type": "String"
},
{
"name": "OnHandQuantity",
"type": "Int64"
},
{
"name": "OnOrderQuantity",
"type": "Int64"
},
{
"name": "SafetyStockQuantity",
"type": "Int64"
},
{
"name": "UnitCost",
"type": "Double"
},
{
"name": "DaysInStock",
"type": "Int64"
},
{
"name": "MinDayInStock",
"type": "Int64"
},
{
"name": "MaxDayInStock",
"type": "Int64"
}
],
"published": false,
"type": "AzureBlob",
"linkedServiceName": "Source-BlobStorage-dat230",
"typeProperties": {
"fileName": "*.txt.gz",
"folderPath": "dat230/{year}/{month}/{day}/",
"format": {
"type": "TextFormat",
"columnDelimiter": "\t",
"firstRowAsHeader": true
},
"partitionedBy": [
{
"name": "year",
"value": {
"type": "DateTime",
"date": "WindowStart",
"format": "yyyy"
}
},
{
"name": "month",
"value": {
"type": "DateTime",
"date": "WindowStart",
"format": "MM"
}
},
{
"name": "day",
"value": {
"type": "DateTime",
"date": "WindowStart",
"format": "dd"
}
}
],
"compression": {
"type": "GZip"
}
},
"availability": {
"frequency": "Day",
"interval": 7
},
"external": true,
"policy": {}
}
}
{
"name": "OutputDataset-dat230",
"properties": {
"structure": [
{
"name": "Date",
"type": "Datetime"
},
{
"name": "StoreID",
"type": "Int64"
},
{
"name": "StoreName",
"type": "String"
},
{
"name": "ProductID",
"type": "Int64"
},
{
"name": "ProductName",
"type": "String"
},
{
"name": "Color",
"type": "String"
},
{
"name": "Size",
"type": "String"
},
{
"name": "Manufacturer",
"type": "String"
},
{
"name": "OnHandQuantity",
"type": "Int64"
},
{
"name": "OnOrderQuantity",
"type": "Int64"
},
{
"name": "SafetyStockQuantity",
"type": "Int64"
},
{
"name": "UnitCost",
"type": "Double"
},
{
"name": "DaysInStock",
"type": "Int64"
},
{
"name": "MinDayInStock",
"type": "Int64"
},
{
"name": "MaxDayInStock",
"type": "Int64"
}
],
"published": false,
"type": "AzureDataLakeStore",
"linkedServiceName": "Destination-DataLakeStore-dat230",
"typeProperties": {
"fileName": "txt.gz",
"folderPath": "dat230/dataloads/{year}/{month}/{day}/factinventory/",
"format": {
"type": "TextFormat",
"columnDelimiter": "\t"
},
"partitionedBy": [
{
"name": "year",
"value": {
"type": "DateTime",
"date": "WindowStart",
"format": "yyyy"
}
},
{
"name": "month",
"value": {
"type": "DateTime",
"date": "WindowStart",
"format": "MM"
}
},
{
"name": "day",
"value": {
"type": "DateTime",
"date": "WindowStart",
"format": "dd"
}
}
]
},
"availability": {
"frequency": "Day",
"interval": 7
},
"external": false,
"policy": {}
}
}
You need to look at the time slices for the datasets and their activity.
The pipeline schedule (badly named) only defines the start and end of the period within which activities can provision and run their time slices.
ADF v1 doesn't use a recurring schedule like the SQL Server Agent. Each execution has to be provisioned at an interval on the timeline (the schedule) you create.
For example, if your pipeline start and end span 1 year but your dataset and activity have a frequency of Month and an interval of 1, you will only get 12 executions of whatever the activity does.
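As a rough sketch of that monthly example (JSON fragments only; the values are assumptions, and the property names follow the ADF v1 definitions shown in the question), the dataset availability and the activity scheduler would both have to say the same thing:
"availability": {
  "frequency": "Month",
  "interval": 1
}
and on the activity:
"scheduler": {
  "frequency": "Month",
  "interval": 1
}
With that pairing, one time slice (and hence one execution) is provisioned per month across the pipeline's start/end window.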
Apologies, but the concept of time slices is a little difficult to explain if you aren't already familiar. Maybe read this post: https://blogs.msdn.microsoft.com/ukdataplatform/2016/05/03/demystifying-activity-scheduling-with-azure-data-factory/
Hope this helps.
Would you share with us the JSON for the datasets and the pipeline? It would be easier to help you with that in hand.
In the meantime, check whether you are using "style": "StartOfInterval" in the scheduler property of the activity, and also check whether you are using an offset.
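For the weekly case in the question, a sketch of what that could look like on the activity (the offset value here is only an assumption to illustrate shifting where the slice starts; it would have to be tuned so the slices line up with 2009-01-03):
"scheduler": {
  "frequency": "Day",
  "interval": 7,
  "style": "StartOfInterval",
  "offset": "5.00:00:00"
}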
Cheers!

What rest API can be used to get default values of fields for create issue?

I am using the Jira REST APIs in my application.
I have found the API for getting the metadata for creating a Jira issue, but that API doesn't return the default values of the fields. For example:
This is the request:
http://kelpie9:8081/rest/api/latest/issue/createmeta?projectKeys=QA&issuetypeNames=Bug&expand=project.issuetypes.fields
The default value of the priority field is set to "Major" and the description of priority is also customized, but the return from the API is:
{
"expand": "projects",
"projects": [
{
"expand": "issuetypes",
"self": "http://kelpie9:8081/rest/api/2/project/QA",
"id": "10010",
"key": "QA",
"name": "QA",
"avatarUrls": {
"16x16": "http://kelpie9:8081/secure/projectavatar?size=small&pid=10010&avatarId=10011",
"48x48": "http://kelpie9:8081/secure/projectavatar?pid=10010&avatarId=10011"
},
"issuetypes": [
{
"expand": "fields",
"self": "http://kelpie9:8081/rest/api/2/issuetype/1",
"id": 1,
"name": "Bug",
"iconUrl": "http://kelpie9:8081/images/icons/bug.gif",
"fields": {
"summary": {
"required": true,
"schema": {
"type": "string",
"system": "summary"
},
"operations": [
"set"
]
},
"timetracking": {
"required": false,
"operations": [ ]
},
"issuetype": {
"required": true,
"schema": {
"type": "issuetype",
"system": "issuetype"
},
"operations": [ ],
"allowedValues": [
{
"id": "1",
"name": "Bug",
"description": "A problem which impairs or prevents the functions of the product.",
"iconUrl": "http://kelpie9:8081/images/icons/bug.gif"
}
]
},
"priority": {
"required": false,
"schema": {
"type": "priority",
"system": "priority"
},
"name": "Priority",
"operations": [
"set"
],
"allowedValues": [
{
"self": "http://172.19.30.101:18080/rest/api/2/priority/1",
"iconUrl": "http://172.19.30.101:18080/images/icons/priority_blocker.gif",
"name": "Blocker",
"id": "1"
},
{
"self": "http://172.19.30.101:18080/rest/api/2/priority/2",
"iconUrl": "http://172.19.30.101:18080/images/icons/priority_critical.gif",
"name": "Critical",
"id": "2"
},
{
"self": "http://172.19.30.101:18080/rest/api/2/priority/3",
"iconUrl": "http://172.19.30.101:18080/images/icons/priority_major.gif",
"name": "Major",
"id": "3"
},
{
"self": "http://172.19.30.101:18080/rest/api/2/priority/4",
"iconUrl": "http://172.19.30.101:18080/images/icons/priority_minor.gif",
"name": "Minor",
"id": "4"
},
{
"self": "http://172.19.30.101:18080/rest/api/2/priority/5",
"iconUrl": "http://172.19.30.101:18080/images/icons/priority_trivial.gif",
"name": "Trivial",
"id": "5"
}
]
},
"customfield_10080": {
"required": false,
"schema": {
"type": "array",
"items": "string",
"custom": "com.atlassian.jira.plugin.system.customfieldtypes:labels",
"customId": 10080
},
"operations": [ ]
},
"customfield_10010": {
"required": false,
"schema": {
"type": "array",
"items": "string",
"custom": "com.atlassian.jira.plugin.system.customfieldtypes:labels",
"customId": 10010
},
"operations": [ ]
},
"customfield_10071": {
"required": false,
"schema": {
"type": "array",
"items": "string",
"custom": "com.atlassian.jira.plugin.system.customfieldtypes:textfield",
"customId": 10071
},
"operations": [ ]
}
}
}
]
}
]
}
There is nothing like a default value or description in the priority field. How will I get those values?