Hi all, I'm trying to put together a script to create multiple partitions within a tabular data model. I can create one at a time, but creating multiple partitions errors with the following message.
Unrecognized JSON property: partitions. Check path 'create.partitions'
I'm using the following (anonymised) generated script.
{
"create": {
"parentObject": {
"database": "MY_TABULAR",
"table": "MY_TABLE"
},
"partitions": [{
"name": "MY_TABLE 12 2018-09",
"source": {
"query": "SELECT * FROM [Fact].[MY_TABLE] WHERE PlanKey = 12 AND dateKey BETWEEN 20180901 AND 20180930",
"dataSource": "MY_DW"
}
},
{
"name": "MY_TABLE 12 2018-10",
"source": {
"query": "SELECT * FROM [Fact].[MY_TABLE] WHERE PlanKey = 12 AND dateKey BETWEEN 20181001 AND 20181031",
"dataSource": "MY_DW"
}
},
{
"name": "MY_TABLE 12 2018-11",
"source": {
"query": "SELECT * FROM [Fact].[MY_TABLE] WHERE PlanKey = 12 AND dateKey BETWEEN 20181101 AND 20181130",
"dataSource": "MY_DW"
}
}]
}
}
As far as I can tell from looking at the references this is correct, but SSMS doesn't appear to like it.
You can do this by using the Sequence command to execute multiple createOrReplace commands that create the partitions. The Sequence command does have an optional maxParallelism property; however, only refresh operations run in parallel (per MSDN). The example below details this further.
{
"sequence":
{
"operations": [
{
"createOrReplace": {
"object": {
"database": "YourTabularDatabase",
"table": "YourTable",
"partition": "Partition 1"
},
"partition": {
"name": "Partition 1",
"dataView": "full",
"source": {
"query": "SELECT * FROM [dbo].[SourceTable] where DateKey < 20180901",
"dataSource": "YourDataSource"
}
}
}
},
{
"createOrReplace": {
"object": {
"database": "YourTabularDatabase",
"table": "YourTable",
"partition": "Partition 2"
},
"partition": {
"name": "Partition 2,
"source": {
"query": "SELECT * FROM [dbo].[SourceTable] where DateKey >= 20180901",
"dataSource": "YourDataSource"
}
}
}
}]
}
}
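If you have many months to cover, you can also generate the sequence rather than hand-write it. Below is a minimal Python sketch; the database, table, source table, plan key and month list are the placeholder names from the example above, so substitute your own:

import json
from calendar import monthrange

# Placeholder names from the example above; substitute your own model objects.
DATABASE = "YourTabularDatabase"
TABLE = "YourTable"
DATA_SOURCE = "YourDataSource"
PLAN_KEY = 12
MONTHS = ["2018-09", "2018-10", "2018-11"]

def create_or_replace(month):
    """Build one createOrReplace command for a single monthly partition."""
    year, mon = (int(p) for p in month.split("-"))
    first = year * 10000 + mon * 100 + 1
    last = year * 10000 + mon * 100 + monthrange(year, mon)[1]
    name = f"{TABLE} {PLAN_KEY} {month}"
    return {
        "createOrReplace": {
            "object": {"database": DATABASE, "table": TABLE, "partition": name},
            "partition": {
                "name": name,
                "source": {
                    "query": (f"SELECT * FROM [dbo].[SourceTable] "
                              f"WHERE PlanKey = {PLAN_KEY} AND dateKey BETWEEN {first} AND {last}"),
                    "dataSource": DATA_SOURCE,
                },
            },
        }
    }

tmsl = {"sequence": {"operations": [create_or_replace(m) for m in MONTHS]}}
# Paste the printed JSON into an XMLA query window in SSMS and execute it.
print(json.dumps(tmsl, indent=2))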
I have the following output from a web activity.
{
"value": [
{
"id": "/subscriptions/xy_csv",
"name": "xy_csv",
"type": "Microsoft.code",
"etag": "6200",
"properties": {
"folder": {
"name": "samplecodes"
},
"content": {
"query": "select * from table 1",
"metadata": {
"language": "sql"
},
"currentConnection": {
"databaseName": "demo",
"poolName": "Built-in"
},
"resultLimit": 5000
},
"type": "SqlQuery"
}
},
{
"id": "/subscriptions/ab_csv",
"name": "ab_csv",
"type": "Microsoft.code",
"etag": "6200",
"properties": {
"folder": {
"name": "livecode"
},
"content": {
"query": "select * from table 2",
"metadata": {
"language": "sql"
},
"currentConnection": {
"databaseName": "demo",
"poolName": "Built-in"
},
"resultLimit": 5000
},
"type": "SqlQuery"
}
}
]
}
I would like to create a Filter activity after the web activity just to filter out the items that are saved under the folder name "livecode".
In the Filter activity's Items field I have: #activity('Web1').output.value
In the Condition field I have: #startswith(item().properties.folder.name,'livecode')
The web activity succeeds but the filter activity fails with this error.
{
"errorCode": "InvalidTemplate",
"message": "The execution of template action 'FilterFilter1' failed: The evaluation of 'query' action 'where' expression '#startswith(item().properties.folder.name,'sql')' failed: 'The expression 'startswith(item().properties.folder.name,'sql')' cannot be evaluated because property 'folder' doesn't exist, available properties are 'content, type'.",
"failureType": "UserError",
"target": "Filter1",
"details": ""
}
It feels like I am going wrong in how I have written the condition dynamic content to navigate to properties.folder.name. I am not sure what is missing in my condition. Can anyone help? Thanks, much appreciated.
The error occurs because the properties object in the web activity output might not contain the folder key for every item.
I have taken the following JSON and got the same error:
{
"value":[
{
"id":"/subscriptions/xy_csv",
"name":"xy_csv",
"type":"Microsoft.code",
"etag":"6200",
"properties":{
"content":{
"query":"select * from table 1",
"metadata":{
"language":"sql"
},
"currentConnection":{
"databaseName":"demo",
"poolName":"Built-in"
},
"resultLimit":5000
},
"type":"SqlQuery"
}
},
{
"id":"/subscriptions/ab_csv",
"name":"ab_csv",
"type":"Microsoft.code",
"etag":"6200",
"properties":{
"folder":{
"name":"livecode"
},
"content":{
"query":"select * from table 2",
"metadata":{
"language":"sql"
},
"currentConnection":{
"databaseName":"demo",
"poolName":"Built-in"
},
"resultLimit":5000
},
"type":"SqlQuery"
}
}
]
}
So, you have to modify the filter condition to check whether each item contains a folder key, using the following dynamic content. I have taken your web activity output as a parameter value and extracted the folder key from the properties object:
#startswith(if(contains(item().properties,'folder'),item().properties.folder.name,''),'livecode')
When I debug the pipeline, I get the desired result.
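To see why the contains() guard matters, here is the same filtering logic expressed in Python purely for illustration (the pipeline itself only needs the dynamic content expression above); the item list is a shortened stand-in for the web activity output:

# Shortened stand-in for the web activity output: the first item has no "folder" key.
items = [
    {"name": "xy_csv", "properties": {"content": {"query": "select * from table 1"}, "type": "SqlQuery"}},
    {"name": "ab_csv", "properties": {"folder": {"name": "livecode"},
                                      "content": {"query": "select * from table 2"}, "type": "SqlQuery"}},
]

# Equivalent of: #startswith(if(contains(item().properties,'folder'),
#                               item().properties.folder.name, ''), 'livecode')
filtered = [
    item for item in items
    if item["properties"].get("folder", {}).get("name", "").startswith("livecode")
]
print([item["name"] for item in filtered])  # ['ab_csv']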
Here is a template that works correctly and saves a query to Athena. But how do I save more than 1 query in a single template?
{
"Resources": {
"AthenaNamedQuery": {
"Type": "AWS::Athena::NamedQuery",
"Properties": {
"Database": "swfnetadata",
"Description": "A query that selects all aggregated data",
"Name": "MostExpensiveWorkflow",
"QueryString": "SELECT workflowname, AVG(activitytaskstarted) AS AverageWorkflow FROM swfmetadata WHERE year='17' AND GROUP BY workflowname ORDER BY AverageWorkflow DESC LIMIT 10"
}
}
}
}
Just stick another resource in the template:
{
"Resources": {
"AthenaNamedQuery": {
"Type": "AWS::Athena::NamedQuery",
"Properties": {
"Database": "swfnetadata",
"Description": "A query that selects all aggregated data",
"Name": "MostExpensiveWorkflow",
"QueryString": "SELECT workflowname, AVG(activitytaskstarted) AS AverageWorkflow FROM swfmetadata WHERE year='17' AND GROUP BY workflowname ORDER BY AverageWorkflow DESC LIMIT 10"
}
},
"AnotherAthenaNamedQuery": {
"Type": "AWS::Athena::NamedQuery",
"Properties": {
"Database": "swfnetadata",
"Description": "Another query",
"Name": "AnotherQuery",
"QueryString": "SELECT 1"
}
}
}
}
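As a usage note, once both NamedQuery resources are in the Resources block you deploy the template as a single stack; for example with boto3 (the stack and file names here are made up):

import boto3

# Hypothetical stack/file names; both named queries are created by the one stack.
cloudformation = boto3.client("cloudformation")
with open("athena_queries.json") as template:
    cloudformation.create_stack(
        StackName="athena-named-queries",
        TemplateBody=template.read(),
    )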
I am trying to execute an unload command on Redshift via Data Pipeline. The script looks something like:
unload ($$ SELECT *, count(*) FROM (SELECT APP_ID, CAST(record_date AS DATE) WHERE len(APP_ID)>0 AND CAST(record_date as DATE)=$1) GROUP BY APP_ID $$) to 's3://test/unload/' iam_role 'arn:aws:iam::xxxxxxxxxxx:role/Test' delimiter ',' addquotes;
The pipeline looks something like this:
{
"objects": [
{
"role": "DataPipelineDefaultRole",
"subject": "SuccessNotification",
"name": "SNS",
"id": "ActionId_xxxxx”,
"message": "SUCCESS: #{format(minusDays(node.#scheduledStartTime,1),'MM-dd-YYYY')}",
"type": "SnsAlarm",
"topicArn": "arn:aws:sns:us-west-2:xxxxxxxxxx:notification"
},
{
"connectionString": “connection-url”,
"password": “password”,
"name": “Test”,
"id": "DatabaseId_xxxxx”,
"type": "RedshiftDatabase",
"username": “username”
},
{
"subnetId": "subnet-xxxxxx”,
"resourceRole": "DataPipelineDefaultResourceRole",
"role": "DataPipelineDefaultRole",
"name": "EC2",
"id": "ResourceId_xxxxx”,
"type": "Ec2Resource"
},
{
"failureAndRerunMode": "CASCADE",
"resourceRole": "DataPipelineDefaultResourceRole",
"role": "DataPipelineDefaultRole",
"pipelineLogUri": "s3://test/logs/",
"scheduleType": "ONDEMAND",
"name": "Default",
"id": "Default"
},
{
"database": {
"ref": "DatabaseId_xxxxxx”
},
"scriptUri": "s3://test/script.sql",
"name": "SqlActivity",
"scriptArgument": "#{format(minusDays(node.#scheduledStartTime,1),"MM-dd-YYYY”)}”,
"id": "SqlActivityId_xxxxx”,
"runsOn": {
"ref": "ResourceId_xxxx”
},
"type": "SqlActivity",
"onSuccess": {
"ref": "ActionId_xxxxx”
}
}
],
"parameters": []
}
However, I keep getting the error: The column index is out of range: 1, number of columns: 0.
I just can't get it to work. I have tried using ?, $1 and I even tried putting the expression #{format(minusDays(node.#scheduledStartTime,1),'MM-dd-YYYY')} directly in the script. None of them works.
I have looked at the answers to Amazon Data Pipline: How to use a script argument in a SqlActivity? but none of them are helpful.
Does anyone have an idea how to use a script argument in a SQL script in AWS Data Pipeline?
I've created an index on Elasticsearch as below:
"settings" : {
"number_of_shards": 1,
"number_of_replicas": 0,
"analysis": {
"filter": {
"trigrams_filter": {
"type": "ngram",
"min_gram": 3,
"max_gram": 3
}
},
"analyzer": {
"trigrams": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"lowercase",
"trigrams_filter"
]
}
}
}
},
"mappings": {
"issue": {
"properties": {
"description": {
"type": "string",
"analyzer": "trigrams"
}
}
}
}
}
My test items are below:
"alici onay verdi basarili satisiniz gerceklesti diyor ama hesabima para transferi gerceklesmemis"
"otomatik onay işlemi gecikmiş"
"************* nolu iade islemi urun kargoya verilmedi zamaninda iade islemlerinde urun erorr hata veriyor"
I've tested this index with the below query:
GET issue/_search
{
"query": {
"match": {
"description":{
"query": "otomatik onay istemi zamaninda gerceklesmemis"
}
}
}
}
And result:
{
....
"hits": {
....
"max_score": 2.3507352,
"hits": [
{
....
"_score": 2.3507352,
"_source": {
"issue_id": "*******",
"description": "alici onay verdi basarili satisiniz gerceklesti diyor ama hesabima para transferi gerceklesmemis"
}
}
]
}
}
But the same data on PostgreSQL with the below SQL returns a different result:
SELECT
public.tbl_issue_descriptions_big.description,
similarity(description, 'otomatik onay islemi zamaninda gerceklesmemis') AS sml
FROM
public.tbl_issue_descriptions_big
WHERE
description % 'otomatik onay islemi zamaninda gerceklesmemis'
ORDER BY
sml DESC
LIMIT 10
Result is:
description | sml
======================================================|======
otomatik onay islemi gecikmis |0,351852
Why does this difference occur?
I don't know enough about Postgres to give a qualified answer there (as this also depends on the documents that are indexed and on whether the scoring formulas are exactly the same, which I doubt), but Elasticsearch has an explain API and an explain parameter on the search request that help you find out why a certain document was scored this way.
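For example, re-running the search from the question with the explain flag set returns an "_explanation" tree for each hit, showing how much each analyzed trigram contributed to the 2.3507352 score:

GET issue/_search
{
  "explain": true,
  "query": {
    "match": {
      "description": {
        "query": "otomatik onay istemi zamaninda gerceklesmemis"
      }
    }
  }
}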
Consider the following Elasticsearch (v5.4) object (an "award" doc type):
{
"name": "Gold 1000",
"date": "2017-06-01T16:43:00.000+00:00",
"recipient": {
"name": "James Conroy",
"date_of_birth": "1991-05-30"
}
}
The mapping type for both award.date and award.recipient.date_of_birth is "date".
I want to perform a range aggregation to get a list of the age ranges of the recipients of this award ("Under 18", "18-24", "24-30", "30+"), at the time of their award. I tried the following aggregation query:
{
"size": 0,
"query": {"match_all": {}},
"aggs": {
"recipients": {
"nested": {
"path": "recipient"
},
"aggs": {
"age_ranges": {
"range": {
"script": {
"inline": "doc['date'].date - doc['recipient.date_of_birth'].date"
},
"keyed": true,
"ranges": [{
"key": "Under 18",
"from": 0,
"to": 18
}, {
"key": "18-24",
"from": 18,
"to": 24
}, {
"key": "24-30",
"from": 24,
"to": 30
}, {
"key": "30+",
"from": 30,
"to": 100
}]
}
}
}
}
}
}
Problem 1
But I get the following error due to the comparison of dates in the script portion:
Cannot apply [-] operation to types [org.joda.time.DateTime] and [org.joda.time.MutableDateTime].
The DateTime object is the award.date field, and the MutableDateTime object is the award.recipient.date_of_birth field. I've tried doing something like doc['recipient.date_of_birth'].date.toDateTime() (which doesn't work despite the Joda docs claiming that MutableDateTime has this method inherited from a parent class). I've also tried doing something further like this:
"script": "ChronoUnit.YEARS.between(doc['date'].date, doc['recipient.date_of_birth'].date)"
Which sadly also doesn't work :(
Problem 2
I notice if I do this:
"aggs": {
"recipients": {
"nested": {
"path": "recipient"
},
"aggs": {
"award_years": {
"terms": {
"script": {
"inline": "doc['date'].date.year"
}
}
}
}
}
}
I get 1970 with a doc_count that happens to equal the total number of docs in ES. This leads me to believe that accessing a property outside of the nested object simply does not work and gives me back some default like the epoch datetime. And if I do the opposite (aggregating dates of birth without nesting), I get the exact same thing for all the dates of birth instead (1970, epoch datetime). So how can I compare those two dates?
I am racking my brain here, and I feel like there's some clever solution that is just beyond my current expertise with Elasticsearch. Help!
If you want to set up a quick environment for this to help me out, here is some curl goodness:
curl -XDELETE http://localhost:9200/joelinux
curl -XPUT http://localhost:9200/joelinux -d "{\"mappings\": {\"award\": {\"properties\": {\"name\": {\"type\": \"string\"}, \"date\": {\"type\": \"date\", \"format\": \"yyyy-MM-dd'T'HH:mm:ss.SSSSSSZ\"}, \"recipient\": {\"type\": \"nested\", \"properties\": {\"name\": {\"type\": \"string\"}, \"date_of_birth\": {\"type\": \"date\", \"format\": \"yyyy-MM-dd\"}}}}}}}"
curl -XPUT http://localhost:9200/joelinux/award/1 -d '{"name": "Gold 1000", "date": "2016-06-01T16:43:00.000000+00:00", "recipient": {"name": "James Conroy", "date_of_birth": "1991-05-30"}}'
curl -XPUT http://localhost:9200/joelinux/award/2 -d '{"name": "Gold 1000", "date": "2017-02-28T13:36:00.000000+00:00", "recipient": {"name": "Martin McNealy", "date_of_birth": "1983-01-20"}}'
That should give you a "joelinux" index with two "award" docs to test this out ("James Conroy" and "Martin McNealy"). Thanks in advance!
Unfortunately, you can't access nested and non-nested fields within the same context. As a workaround, you can change your mapping to automatically copy the date of birth from the nested document to the root context using the copy_to option:
{
"mappings": {
"award": {
"properties": {
"name": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"date": {
"type": "date"
},
"date_of_birth": {
"type": "date" // will be automatically filled when indexing documents
},
"recipient": {
"properties": {
"name": {
"fields": {
"keyword": {
"ignore_above": 256,
"type": "keyword"
}
},
"type": "text"
},
"date_of_birth": {
"type": "date",
"copy_to": "date_of_birth" // copy value to root document
}
},
"type": "nested"
}
}
}
}
}
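Note that copy_to is applied at index time, so any documents indexed before this mapping change have to be reindexed for the root-level date_of_birth field to be populated.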
After that you can access the date of birth via the root-level date_of_birth field, though the calculation to get the number of years between the dates is slightly tricky:
Period.between(LocalDate.ofEpochDay(doc['date_of_birth'].date.getMillis() / 86400000L), LocalDate.ofEpochDay(doc['date'].date.getMillis() / 86400000L)).getYears()
Here I convert the original Joda-Time date objects to java.time.LocalDate objects:
Get the number of milliseconds since 1970-01-01
Convert to the number of days since 1970-01-01 by dividing by 86400000L (the number of ms in one day)
Convert to a LocalDate object
Create a date-based Period object from the two dates
Get the number of years between the two dates.
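For the two sample documents above this evaluates to 25 for James Conroy (born 1991-05-30, awarded 2016-06-01) and 34 for Martin McNealy (born 1983-01-20, awarded 2017-02-28), so they land in the "24-30" and "30+" buckets respectively.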
So, the final aggregation query looks like this:
{
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"age_ranges": {
"range": {
"script": {
"inline": "Period.between(LocalDate.ofEpochDay(doc['date_of_birth'].date.getMillis() / 86400000L), LocalDate.ofEpochDay(doc['date'].date.getMillis() / 86400000L)).getYears()"
},
"keyed": true,
"ranges": [
{
"key": "Under 18",
"from": 0,
"to": 18
},
{
"key": "18-24",
"from": 18,
"to": 24
},
{
"key": "24-30",
"from": 24,
"to": 30
},
{
"key": "30+",
"from": 30,
"to": 100
}
]
}
}
}
}
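Because date_of_birth is now copied to the root of the document, the nested wrapper aggregation from the original query is not needed here; the range script reads both doc['date'] and doc['date_of_birth'] from the root context.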