PySpark: MutableLong cannot be cast to MutableInt (no long in dataframe) - pyspark

I'm trying to read a profiles table from Athena in PySpark using the Glue client from boto3, and then check whether it is empty. Why does Spark fail with a cast error between Long and Int, given that the table I am reading has no Long column? I couldn't find anything on Google or Stack Overflow that answers this problem.
Here is a summary of the code:
dataframe = GlueContext(session.sparkContext).create_dynamic_frame.from_catalog(
database="xxx",
table_name="profiles",
catalog_id="xxx"
).toDF()
if dataframe.rdd.isEmpty():
dataframe = session.sparkContext.emptyRDD().toDF(schema)
I'm getting the error:
ERROR GlueExceptionAnalysisListener: [Glue Exception Analysis] Event: GlueETLJobExceptionEvent
[...]
File "/myScript.py", line 246, in load_table
if dataframe.rdd.isEmpty()
[...]
Caused by: org.apache.parquet.io.ParquetDecodingException: Can not read value at 1 in block 0 in file s3://bucket/path/to/profiles/vault=c27/subgroup=1/part-00003-a97d95f5-713c-4756-808b-38c3866842cb.c000.snappy.parquet
[...]
Caused by: java.lang.ClassCastException: org.apache.spark.sql.catalyst.expressions.MutableLong cannot be cast to org.apache.spark.sql.catalyst.expressions.MutableInt
Here is the Athena DDL:
CREATE EXTERNAL TABLE `profiles`(
`id` string,
`anonymousids` array<string>,
`lastconsentinsightusage` boolean,
`lastconsentactivationusage` boolean,
`gender` string,
`age` int,
`iata` string,
`continent` string,
`country` string,
`city` string,
`state` string,
`brandvisit` int,
`knownprofileinmarket` date,
`devicebrowser` string,
`devicebrand` string,
`deviceos` string,
`returnedinmarket` date)
PARTITIONED BY (
`vault` varchar(5),
`subgroup` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
's3://bucket/path/to/profiles'
TBLPROPERTIES (
'classification'='parquet',
'transient_lastDdlTime'='1645604528')
And here is the parquet schema:
{
"type" : "record",
"name" : "spark_schema",
"fields" : [ {
"name" : "Id",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "anonymousIds",
"type" : [ "null", {
"type" : "array",
"items" : {
"type" : "record",
"name" : "list",
"fields" : [ {
"name" : "element",
"type" : [ "null", "string" ],
"default" : null
} ]
}
} ],
"default" : null
}, {
"name" : "lastConsentInsightUsage",
"type" : [ "null", "boolean" ],
"default" : null
}, {
"name" : "lastConsentActivationUsage",
"type" : [ "null", "boolean" ],
"default" : null
}, {
"name" : "gender",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "age",
"type" : [ "null", "int" ],
"default" : null
}, {
"name" : "iata",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "continent",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "country",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "city",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "state",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "brandVisit",
"type" : [ "null", "int" ],
"default" : null
}, {
"name" : "knownProfileInMarket",
"type" : [ "null", {
"type" : "int",
"logicalType" : "date"
} ],
"default" : null
}, {
"name" : "deviceBrowser",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "deviceBrand",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "deviceOs",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "returnedInMarket",
"type" : [ "null", {
"type" : "int",
"logicalType" : "date"
} ],
"default" : null
} ]
}
And a line of the parquet file:
{"Id": "34e9bbcd3dd577d6bc3f9b82d9dd99666dafa0203486d2a604f59b7702d50d7d", "anonymousIds": [{"element": "5510"}], "lastConsentInsightUsage": true, "lastConsentActivationUsage": true, "gender": "F", "age": 40, "iata": "", "continent": "EU", "country": "DE", "city": "Frankfurt (Oder)", "state": "BB", "brandVisit": 9, "knownProfileInMarket": 18765, "deviceBrowser": "Googlebot", "deviceBrand": "Spider", "deviceOs": "Other", "returnedInMarket": 18765}

Related

MongoDB doesn't group properly

I am using mongoDB with .NET and for over a year, the same query worked properly.
For a single date range (04.06.22 - 05.06.22), the query malfunctioned: it failed to group one product by ID, so I am getting 2 products with the same ID.
Query (I have removed unnecessary parameters to make it easier to understand):
.FilterByCustomer(filter.CustomerId)
.FilterByTerminals(filter.Terminals)
.FilterByDateRange("FromLocalDateTime", "ToLocalDateTime", filter.LocalDateRange)
.Unwind("SalesForSingleProductReports", false)
.Unwind("SalesForSingleProductReports.RevenueAndSalesPerCurrency", false)
.Group(
#" _id : {
ProductInfo: '$SalesForSingleProductReports.ProductInfo',
CurrencyInfo: '$SalesForSingleProductReports.RevenueAndSalesPerCurrency.CurrencyInfo'
},
RevenueAndSalesPerCurrency: {
$push: {
Revenue: {$sum: '$SalesForSingleProductReports.RevenueAndSalesPerCurrency.Revenue'},
}
}
}")
.Group(#"_id : '$_id.ProductInfo',
RevenueAndSalesPerCurrency: {$push: {
CurrencyInfo: '$_id.CurrencyInfo',
Revenue: {$sum: '$RevenueAndSalesPerCurrency.Revenue'},
}")
.Project(#"_id:0,
ProductInfo:'$_id',
'RevenueAndSalesPerCurrency': 1")
.Compile();
The result I am getting contains multiple products, but one of them is duplicated: both entries have the same ID but different revenue values:
[{
"productInfo": {
"productId": "id-111",
"productName": null,
"productInternalCode": "internal-111",
"productCategoryId": "prod-cat-111",
"productCode": null,
"brand": null,
"countryCode": null
},
"revenueAndSalesPerCurrency": [
{
"currencyInfo": {
"currencyId": "curr-Id",
"code": "EUR",
"symbol": "€"
},
"revenue": 1680,
}
]
},
{
"productInfo": {
"productId": "id-111",
"productName": null,
"productInternalCode": "internal-111",
"productCategoryId": "prod-cat-111",
"productCode": null,
"brand": null,
"countryCode": null
},
"revenueAndSalesPerCurrency": [
{
"currencyInfo": {
"currencyId": "curr-id",
"code": "EUR",
"symbol": "€"
},
"revenue": 3080,
}
]
}
]
Here's one record from the collection on which actions are done:
"_id" : ObjectId("id-string"),
"CreatedAt" : ISODate("2021-11-10T10:14:10.116Z"),
"UpdatedAt" : ISODate("2021-11-10T10:14:10.116Z"),
"CreatedByUserId" : null,
"UpdatedByUserId" : null,
"Version" : null,
"IsDeleted" : false,
"ReportType" : "AggregatedSingleProductSalesReportForTerminalAndDate",
"ReportId" : "report-id,
"CustomerId" : "cust-id",
"FromUtcDateTime" : ISODate("2021-11-01T23:00:00.000Z"),
"ToUtcDateTime" : ISODate("2021-11-02T22:59:59.999Z"),
"FromLocalDateTime" : ISODate("2021-11-02T00:00:00.000Z"),
"ToLocalDateTime" : ISODate("2021-11-02T23:59:59.999Z"),
"TransactionIds" : [
"trans-id-1",
"trans-id-2",
],
"TerminalId" : "terminal-id",
"LocalDate" : "2021-11-02",
"SalesForSingleProductReports" : [
{
"ProductInfo" : {
"ProductId" : "prod-id",
"ProductName" : null,
"ProductInternalCode" : "internal-string",
"ProductCategoryId" : "prod-category-id"
},
"RevenueAndSalesPerCurrency" : [
{
"CurrencyInfo" : {
"CurrencyId" : "euro-curr",
"Code" : "GBP",
"Symbol" : "£"
},
"Revenue" : 180,
}
]
},
{
"ProductInfo" : {
"ProductId" : "prod-id-2",
"ProductName" : null,
"ProductInternalCode" : "internal-string-2",
"ProductCategoryId" : "prod-category-id-2"
},
"RevenueAndSalesPerCurrency" : [
{
"CurrencyInfo" : {
"CurrencyId" : "euro-curr",
"Code" : "GBP",
"Symbol" : "£"
},
"Revenue" : 90,
}
]
}
]
}

How to pull an element in sub of sub array in MongoDB?

I want to pull an object from com_address where "add_id":111:
{
"_id" : ObjectId("5f7b6b4e327e2111883909f3"),
"FirstName" : "abc",
"LastName" : "abc",
"DateOfBirth" : "05/09/2020",
"gender" : "M",
"address" : {
"country" : "string",
"state" : "string",
"city" : [
{
"id" : 15,
"type" : "string",
"com_address" : [
{
"add_id" : 113,
"street" : "string",
"house_no" : "string",
"landmark" : "string"
},
{
"add_id" : 114,
"street" : "string",
"house_no" : "string",
"landmark" : "string"
}
]
},
{
"id" : 16,
"type" : "string",
"com_address" : [
{
"add_id" : 110,
"street" : "string",
"house_no" : "string",
"landmark" : "string"
},
{
"add_id" : 111,
"street" : "string",
"house_no" : "string",
"landmark" : "string"
}
]
}
]
}
}
This is my query:
db.getCollection('student').update({"_id" : ObjectId("5f7b6b4e327e2111883909f3")},{$pull:{"address.city":{"com_address.add_id":111}}})
With this query, the whole object inside city with id 16 gets deleted, instead of only the matching object being pulled from com_address.
How can I pull an object from com_address by its add_id?
Try this, which uses the all-positional operator $[]:
db.getCollection('student').update({
"_id" : ObjectId("5f7b6b4e327e2111883909f3")
},
{
$pull: { "address.city.$[].com_address": { "add_id": 111 }}
})

Find a nested object field inside an array in mongodb aggregate

I have this object as below.
{
"_id" : ObjectId("5ec80a981e89a84b19934039"),
"status" : "active",
"organizationId" : "1",
"productId" : "1947",
"name" : "BOOKEND & PAPER WEIGHT SET – ZODIAC PIG – RED COPPER + PLATINUM",
"description" : "This global exclusive Zodiac bookend and paperweight set from Zuny will stand auspiciously on your bookcase and table, spreading good luck and fortune throughout your home just in time for the Year of the Pig.",
"brand" : "ZUNY",
"created" : "2018-09-28 00:00:00",
"updated" : "2020-05-22 09:19:07",
"mainImage" : "https://",
"availableOnline" : true,
"colors" : [
{
"images" : [
{
"type" : "studio",
"url" : "https://"
},
{
"type" : "studio",
"url" : "https://"
},
{
"type" : "studio",
"url" : "https://"
}
],
"extraInfo" : [
{
"type" : "text-tag",
"title" : "CATEGORY",
"tags" : [
"HOME FURNISHING & DÉCOR",
"LIFESTYLE"
]
},
{
"type" : "text-tag",
"title" : "BRAND",
"tags" : [
"ZUNY"
]
},
{
"type" : "text-tag",
"title" : "COLOUR",
"tags" : [
"GOLD",
"ROSE GOLD"
]
},
{
"type" : "text-tag",
"title" : "SEASON",
"tags" : [
"AW(2018)"
]
},
{
"type" : "text-tag",
"title" : "HASHTAG",
"tags" : [
"BOOKCASES",
"BOOKEND",
"COLOUR",
"EXCLUSIVE",
"GLOBAL EXCLUSIVE",
"HOME",
"LEATHER",
"MOTIF",
"OBJECTS",
"PAPER",
"PAPERWEIGHT",
"PLATINUM",
"SET",
"SYNTHETIC",
"ZODIAC",
"HANDMADE",
"time"
]
}
],
"_id" : ObjectId("5ec80a981e89a84b1993403a"),
"colorId" : "1",
"color" : "ROSE GOLD",
"status" : "active",
"sizes" : [
{
"extraInfo" : [
{
"type" : "text-block",
"title" : "Size And Fit",
"text" : ""
},
{
"type" : "text-block",
"title" : "Information",
"text" : "Global exclusive. Colour: Copper/Platinum. Set includes: Zodiac Pig bookend (x 1), Zodiac Pig paperweight (x 1). Metallic copper- and platinum-tone synthetic leather. Pig motif. Iron pellet filling. Handmade"
}
],
"_id" : ObjectId("5ec80a981e89a84b1993403b"),
"sizeId" : "1",
"neo" : "0210111790664",
"size" : "*",
"originalPrice" : "1060.00",
"sellingPrice" : "1060.00",
"discountPercent" : "0.00",
"url" : "https://",
"status" : "active",
"currency" : "HK$",
"stores" : [
{
"storeId" : "1",
"quantity" : 70,
"_id" : ObjectId("5ec80a981e89a84b1993403c"),
"available" : 70,
"reserved" : 0,
"name" : "Park Street",
"status" : "active"
},
{
"storeId" : "2",
"quantity" : 95,
"_id" : ObjectId("5ec80a981e89a84b1993403d"),
"name" : "Rashbehari",
"status" : "active"
}
]
}
]
}
],
"__v" : 0
}
I want the output as follows
{
"name": "Mock Collection",
"collectionId": "92",
"products": [
{
"title": "GLOBAL EXCLUSIVE OFF-SHOULDER SHIRT DRESS",
"imageUrl": "https://",
"productId": "21174",
"currency": "" // This should be this.colors[0].sizes[0].currency
},
]
}
How do I get the nested field? I tried using arrayElemAt, which got me as far as colors[0], but I am confused about how to get inside the nested sizes object from there. Also, the currency node should hold the exact value; currently it comes out as currency:{currency: value}, which I don't want.
Please help!
I'm not sure how you got that output, but to extract currency from the first object of sizes, try this:
db.collection.aggregate([
{
$project: {
currency: {
$arrayElemAt: [
{
$arrayElemAt: [ "$colors.sizes.currency", 0 ] // gives an array of currency values, in your case since you've only one object just an array of one value
},
0
]
}
}
}
])
Test : mongoplayground

Unable to load data in druid

I am a newbie to Druid. I am trying to load some very simple data in JSON format into Druid. The data contains just one dimension, one metric and a timestamp. I have successfully loaded a different dataset into Druid, but for this dataset I am getting errors.
This is my index file:
{
"type" : "index",
"spec" : {
"dataSchema" : {
"dataSource" : "datatemplate",
"parser" : {
"type" : "string",
"parseSpec" : {
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
"Loc"
]
},
"timestampSpec" : {
"format" : "auto",
"column" : "Timestamp"
}
}
},
"metricsSpec" : [{"name" : "Qty","type" : "doubleSum","fieldName" : "Qty"}],
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
"queryGranularity" : "none",
"intervals" : ["2016-01-01T00:00:00Z/2030-06-30T00:00:00Z"],
"rollup" : true
}
},
"ioConfig" : {
"type" : "index",
"firehose" : {
"type" : "local",
"baseDir" : "datatemplate/",
"filter" : "datatemplate.json"
},
"appendToExisting" : false
},
"tuningConfig" : {
"type" : "index",
"targetPartitionSize" : 10000000,
"maxRowsInMemory" : 40000,
"forceExtendableShardSpecs" : true
}
}
}
Also here is my dataset in JSON format:
{"Loc": "A", "Qty": "1", "Timestamp": "2017-12-01T00:00:00Z"}
{"Loc": "A", "Qty": "1", "Timestamp": "2017-12-01T00:00:00Z"}
{"Loc": "B", "Qty": "2", "Timestamp": "2017-12-01T00:00:00Z"}
{"Loc": "B", "Qty": "1", "Timestamp": "2017-12-01T00:00:00Z"}

swagger file for all possible 'properties'

I need to call an API to load data on an ongoing basis. The API returns different properties for each event. When I create the swagger file, it contains the properties from the sample return, but in the long run the source system may add more properties, and those will not be in the swagger file.
Is there any way to dynamically recreate the swagger file with the additional properties before each data load?
The swagger file is generated by Informatica Cloud based on a sample return while testing the connection.
The properties list has a different number of entries depending on the event type.
swagger file:
{"swagger" : "2.0",
"info" : {
"description" : null,
"version" : "1.0.0",
"title" : null,
"termsOfService" : null,
"contact" : null,
"license" : null
},
"host" : "<host>.com",
"basePath" : "/api",
"schemes" : [ "https" ],
"paths" : {
"/2.0" : {
"post" : {
"tags" : [ "events" ],
"summary" : null,
"description" : null,
"operationId" : "events",
"produces" : [ "application/json" ],
"consumes" : [ "application/json" ],
"parameters" : [ {
"name" : "script",
"in" : "query",
"description" : null,
"required" : false,
"type" : "string"
}, {
"name" : "Authorization",
"in" : "header",
"description" : null,
"required" : false,
"type" : "string"
} ],
"responses" : {
"200" : {
"description" : "successful operation",
"schema" : {
"$ref" : "#/definitions/events"
}
}
}
}
}
},
"definitions" : {
"events##properties" : {
"properties" : {
"$app_build_number" : {
"type" : "string"
},
"$app_version_string" : {
"type" : "string"
},
"$carrier" : {
"type" : "string"
},
"$lib_version" : {
"type" : "string"
},
"$manufacturer" : {
"type" : "string"
},
"$model" : {
"type" : "string"
},
"$os" : {
"type" : "string"
},
"$os_version" : {
"type" : "string"
},
"$radio" : {
"type" : "string"
},
"$region" : {
"type" : "string"
},
"$screen_height" : {
"type" : "number",
"format" : "int32"
},
"$screen_width" : {
"type" : "number",
"format" : "int32"
},
"Home Step Enabled" : {
"type" : "string"
},
"Number Of Lifetime Logins" : {
"type" : "number",
"format" : "int32"
},
"Sessions" : {
"type" : "number",
"format" : "int32"
},
"mp_country_code" : {
"type" : "string"
},
"mp_lib" : {
"type" : "string"
}
}
},
"events" : {
"properties" : {
"name" : {
"type" : "string"
},
"distinct_id" : {
"type" : "string"
},
"labels" : {
"type" : "string"
},
"time" : {
"type" : "number",
"format" : "int64"
},
"sampling_factor" : {
"type" : "number",
"format" : "int32"
},
"dataset" : {
"type" : "string"
},
"properties" : {
"$ref" : "#/definitions/events##properties"
}
}
}
}
}
sample return:
"name": "Session",
"distinct_id": "1234567890",
"labels": [],
"time": 1520072505000,
"sampling_factor": 1,
"dataset": "$event_data_set",
"properties": {
"$app_build_number": "900",
"$app_version_string": "1.9",
"$carrier": "AT&T",
"$lib_version": "2.0.1",
"$manufacturer": "Apple",
"$model": "iPhone10,6",
"$os": "iOS",
"$os_version": "11.2.6",
"$radio": "LTE",
"$region": "Florida",
"$screen_height": 667,
"$screen_width": 375,
"Number Of Lifetime Logins": 2,
"Session Length": "00h:00m:08s",
"Sessions": 43,
"mp_country_code": "US",
"mp_lib": "swift"
}
}