debezium - schema registry issue - apache-kafka

I'm using the AWS Glue Schema Registry with Debezium.
In the Debezium connector config I set the server name to mysql-db01, so Debezium creates a topic with this server name to record metadata about the server and schema changes.
When I deployed the connector, the schema registry received a schema like this:
{
"type": "record",
"name": "SchemaChangeKey",
"namespace": "io.debezium.connector.mysql",
"fields": [
{
"name": "databaseName",
"type": "string"
}
],
"connect.name": "io.debezium.connector.mysql.SchemaChangeKey"
}
Then another version was immediately created, like this:
{
"type": "record",
"name": "SchemaChangeValue",
"namespace": "io.debezium.connector.mysql",
"fields": [
{
"name": "source",
"type": {
"type": "record",
"name": "Source",
"fields": [
{
"name": "version",
"type": "string"
},
{
"name": "connector",
"type": "string"
},
{
"name": "name",
"type": "string"
},
{
"name": "ts_ms",
"type": "long"
},
{
"name": "snapshot",
"type": [
{
"type": "string",
"connect.version": 1,
"connect.parameters": {
"allowed": "true,last,false"
},
"connect.default": "false",
"connect.name": "io.debezium.data.Enum"
},
"null"
],
"default": "false"
},
{
"name": "db",
"type": "string"
},
{
"name": "sequence",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "table",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "server_id",
"type": "long"
},
{
"name": "gtid",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "file",
"type": "string"
},
{
"name": "pos",
"type": "long"
},
{
"name": "row",
"type": "int"
},
{
"name": "thread",
"type": [
"null",
"long"
],
"default": null
},
{
"name": "query",
"type": [
"null",
"string"
],
"default": null
}
],
"connect.name": "io.debezium.connector.mysql.Source"
}
},
{
"name": "databaseName",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "schemaName",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "ddl",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "tableChanges",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "Change",
"namespace": "io.debezium.connector.schema",
"fields": [
{
"name": "type",
"type": "string"
},
{
"name": "id",
"type": "string"
},
{
"name": "table",
"type": {
"type": "record",
"name": "Table",
"fields": [
{
"name": "defaultCharsetName",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "primaryKeyColumnNames",
"type": [
"null",
{
"type": "array",
"items": "string"
}
],
"default": null
},
{
"name": "columns",
"type": {
"type": "array",
"items": {
"type": "record",
"name": "Column",
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "jdbcType",
"type": "int"
},
{
"name": "nativeType",
"type": [
"null",
"int"
],
"default": null
},
{
"name": "typeName",
"type": "string"
},
{
"name": "typeExpression",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "charsetName",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "length",
"type": [
"null",
"int"
],
"default": null
},
{
"name": "scale",
"type": [
"null",
"int"
],
"default": null
},
{
"name": "position",
"type": "int"
},
{
"name": "optional",
"type": [
"null",
"boolean"
],
"default": null
},
{
"name": "autoIncremented",
"type": [
"null",
"boolean"
],
"default": null
},
{
"name": "generated",
"type": [
"null",
"boolean"
],
"default": null
}
],
"connect.name": "io.debezium.connector.schema.Column"
}
}
}
],
"connect.name": "io.debezium.connector.schema.Table"
}
}
],
"connect.name": "io.debezium.connector.schema.Change"
}
}
}
],
"connect.name": "io.debezium.connector.mysql.SchemaChangeValue"
These two schemas are not compatible, so the AWS Schema Registry does not allow the connector to register the second version, even though the second version is the actual schema for the connector.
To work around this, I deleted the schema in the schema registry, deleted the connector, and re-deployed it; then it worked.
But I'm trying to understand why the schema gets two different versions in the first place.
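For anyone hitting the same thing, this is roughly how I inspected and cleaned up the registry from a script instead of the console. It is only a sketch: the schema name is a guess (the AWS converter typically registers the schema under the topic name, here the schema-changes topic mysql-db01), while the registry name and region come from my converter config below.
import boto3

# Sketch only: schema name is an assumption (usually the topic name);
# registry name and region are taken from the converter config below.
glue = boto3.client("glue", region_name="ap-south-1")
schema_id = {"RegistryName": "bhuvi-debezium", "SchemaName": "mysql-db01"}

# List the versions that were auto-registered for this schema.
print(glue.list_schema_versions(SchemaId=schema_id))

# The manual cleanup described above, done via the API instead of the console:
# glue.delete_schema(SchemaId=schema_id)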

I have used the following key/value converters on the source and sink connectors to make it work.
"key.converter": "org.apache.kafka.connect.storage.StringConverter",
"key.converter.schemas.enable": "false",
"internal.key.converter": "com.amazonaws.services.schemaregistry.kafkaconnect.AWSKafkaAvroConverter",
"internal.key.converter.schemas.enable": "false",
"internal.value.converter": "com.amazonaws.services.schemaregistry.kafkaconnect.AWSKafkaAvroConverter",
"internal.value.converter.schemas.enable": "false",
"value.converter": "com.amazonaws.services.schemaregistry.kafkaconnect.AWSKafkaAvroConverter",
"value.converter.schemas.enable": "true",
"value.converter.region": "ap-south-1",
"key.converter.schemaAutoRegistrationEnabled": "true",
"value.converter.schemaAutoRegistrationEnabled": "true",
"key.converter.avroRecordType": "GENERIC_RECORD",
"value.converter.avroRecordType": "GENERIC_RECORD",
"key.converter.registry.name": "bhuvi-debezium",
"value.converter.registry.name": "bhuvi-debezium",

Related

How to group by single field and return more values together

I'm starting to use Apache Druid but am having some difficulty running native queries (and some SQL too).
1- Is it possible to group by a single column while also returning more values?
2- How could I group by a single column while returning different grouped items in the same query/row?
The query I'm trying to use:
{
"queryType": "groupBy",
"dataSource": "my-data-source",
"granularity": "all",
"intervals": ["2022-06-27T03:00:00.000Z/2022-06-28T03:00:00.000Z"],
"context:": { "timeout: 30000 },
"dimensions": ["userId"],
"filter": {
"type": "and",
"fields": [
{
"type": "or",
"fields": [{...}]
}
]
},
"aggregations": [
{
"type": "count",
"name": "count"
}
]
}
I tried adding a filtered aggregator inside aggregations: [], but nothing changed.
"aggregations": [
{
"type: "count",
"name": "count"
},
{
"type": "filtered",
"filter": {
"type": "selector",
"dimension": "block_id",
"value": "block1"
},
"aggregator": {
"type": "count",
"name": "block1",
"fieldName": "block_id"
}
}
]
The grouping aggregator didn't work either.
"aggregations": [
{
"type": "count",
"name": "count"
},
{
"type": "grouping",
"name": "groupedData",
"groupings": ["block_id"]
}
],
Below is the image illustrating the results I'm trying to achieve.
Not sure yet how to get the results in the format you want, but as a start, something like this might be a step:
{
"queryType": "groupBy",
"dataSource": {
"type": "table",
"name": "dataTest"
},
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"filter": null,
"granularity": {
"type": "all"
},
"dimensions": [
{
"type": "default",
"dimension": "d2_ts2",
"outputType": "STRING"
},
{
"type": "default",
"dimension": "d3_email",
"outputType": "STRING"
}
],
"aggregations": [
{
"type": "count",
"name": "myCount",
}
],
"descending": false
}
I'm curious, what is the use case?
Using a SQL query you can do it this way:
SELECT UserID,
sum(1) FILTER (WHERE BlockId = 'block1') as Block1,
sum(1) FILTER (WHERE BlockId = 'block2') as Block2,
sum(1) FILTER (WHERE BlockId = 'block3') as Block3
FROM inline_data
GROUP BY 1
The Native Query for this (from the explain) is:
{
"queryType": "topN",
"dataSource": {
"type": "table",
"name": "inline_data"
},
"virtualColumns": [
{
"type": "expression",
"name": "v0",
"expression": "1",
"outputType": "LONG"
}
],
"dimension": {
"type": "default",
"dimension": "UserID",
"outputName": "d0",
"outputType": "STRING"
},
"metric": {
"type": "dimension",
"previousStop": null,
"ordering": {
"type": "lexicographic"
}
},
"threshold": 101,
"intervals": {
"type": "intervals",
"intervals": [
"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z"
]
},
"filter": null,
"granularity": {
"type": "all"
},
"aggregations": [
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a0",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block1",
"extractionFn": null
},
"name": "a0"
},
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a1",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block2",
"extractionFn": null
},
"name": "a1"
},
{
"type": "filtered",
"aggregator": {
"type": "longSum",
"name": "a2",
"fieldName": "v0",
"expression": null
},
"filter": {
"type": "selector",
"dimension": "BlockId",
"value": "block3",
"extractionFn": null
},
"name": "a2"
}
],
"postAggregations": [],
"context": {
"populateCache": false,
"sqlOuterLimit": 101,
"sqlQueryId": "bb92e899-c127-49b0-be1b-d4b38909d166",
"useApproximateCountDistinct": false,
"useApproximateTopN": false,
"useCache": false,
"useNativeQueryExplain": true
},
"descending": false
}
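In case it helps, here is a minimal sketch of submitting the SQL version above over HTTP via Druid's SQL endpoint (the router URL is an assumption; /druid/v2/sql is the standard SQL API path):
import requests

# Sketch only: assumes a local router on the default port 8888.
sql = """
SELECT UserID,
       SUM(1) FILTER (WHERE BlockId = 'block1') AS Block1,
       SUM(1) FILTER (WHERE BlockId = 'block2') AS Block2,
       SUM(1) FILTER (WHERE BlockId = 'block3') AS Block3
FROM inline_data
GROUP BY 1
"""

resp = requests.post(
    "http://localhost:8888/druid/v2/sql",
    json={"query": sql, "resultFormat": "object"},
)
print(resp.json())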

org.apache.avro.AvroTypeException: Unknown union branch EventId

I am trying to convert JSON to Avro using 'kafka-avro-console-producer' and publish it to a Kafka topic.
I am able to do that for flat JSON/schemas, but for the schema and JSON given below I am getting an
"org.apache.avro.AvroTypeException: Unknown union branch EventId" error.
Any help would be appreciated.
Schema:
{
"type": "record",
"name": "Envelope",
"namespace": "CoreOLTPEvents.dbo.Event",
"fields": [{
"name": "before",
"type": ["null", {
"type": "record",
"name": "Value",
"fields": [{
"name": "EventId",
"type": "long"
}, {
"name": "CameraId",
"type": ["null", "long"],
"default": null
}, {
"name": "SiteId",
"type": ["null", "long"],
"default": null
}],
"connect.name": "CoreOLTPEvents.dbo.Event.Value"
}],
"default": null
}, {
"name": "after",
"type": ["null", "Value"],
"default": null
}, {
"name": "op",
"type": "string"
}, {
"name": "ts_ms",
"type": ["null", "long"],
"default": null
}],
"connect.name": "CoreOLTPEvents.dbo.Event.Envelope"
}
And the JSON input is like below:
{
"before": null,
"after": {
"EventId": 12,
"CameraId": 10,
"SiteId": 11974
},
"op": "C",
"ts_ms": null
}
And in my case I can't alter the schema; I can only alter the JSON in such a way that it works.
If you are using the Avro JSON format, the input you have is slightly off. For unions, non-null values need to be specified with their type listed: https://avro.apache.org/docs/current/spec.html#json_encoding
See below for an example which I think should work.
{
"before": null,
"after": {
"CoreOLTPEvents.dbo.Event.Value": {
"EventId": 12,
"CameraId": {
"long": 10
},
"SiteId": {
"long": 11974
}
}
},
"op": "C",
"ts_ms": null
}
Removing "connect.name": "CoreOLTPEvents.dbo.Event.Value" and "connect.name": "CoreOLTPEvents.dbo.Event.Envelope" as The RecordType can only contains {'namespace', 'aliases', 'fields', 'name', 'type', 'doc'} keys.
Could you try with below schema and see if you are able to produce the msg?
{
"type": "record",
"name": "Envelope",
"namespace": "CoreOLTPEvents.dbo.Event",
"fields": [
{
"name": "before",
"type": [
"null",
{
"type": "record",
"name": "Value",
"fields": [
{
"name": "EventId",
"type": "long"
},
{
"name": "CameraId",
"type": [
"null",
"long"
],
"default": "null"
},
{
"name": "SiteId",
"type": [
"null",
"long"
],
"default": "null"
}
]
}
],
"default": null
},
{
"name": "after",
"type": [
"null",
"Value"
],
"default": null
},
{
"name": "op",
"type": "string"
},
{
"name": "ts_ms",
"type": [
"null",
"long"
],
"default": null
}
]
}

Conversion of JSON to Avro failed: Failed to convert JSON to Avro: Unknown union branch

I am trying to send a JSON message to a Kafka topic, using the Kafka-rest service to serialize the JSON as an Avro object, but the message is rejected by Kafka-rest with the following error:
Conversion of JSON to Avro failed: Failed to convert JSON to Avro: Unknown union branch postId
I suspect there is an issue with the Avro schema I am using, as it is a nested record type with nullable fields.
Avro schema:
{
"type": "record",
"name": "ExportRequest",
"namespace": "com.example.avro.model",
"fields": [
{
"name": "context",
"type": {
"type": "map",
"values": {
"type": "string",
"avro.java.string": "String"
},
"avro.java.string": "String"
}
},
{
"name": "exportInfo",
"type": {
"type": "record",
"name": "ExportInfo",
"fields": [
{
"name": "exportId",
"type": {
"type": "string",
"avro.java.string": "String"
}
},
{
"name": "exportType",
"type": {
"type": "string",
"avro.java.string": "String"
}
},
{
"name": "exportQuery",
"type": {
"type": "record",
"name": "ExportQuery",
"fields": [
{
"name": "postExport",
"type": [
"null",
{
"type": "record",
"name": "PostExport",
"fields": [
{
"name": "postId",
"type": {
"type": "string",
"avro.java.string": "String"
}
},
{
"name": "isCommentIncluded",
"type": "boolean"
}
]
}
],
"default": null
},
{
"name": "feedExport",
"type": [
"null",
{
"type": "record",
"name": "FeedExport",
"fields": [
{
"name": "accounts",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "recordTypes",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "actions",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "contentTypes",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "startDate",
"type": "long"
},
{
"name": "endDate",
"type": "long"
},
{
"name": "advancedSearch",
"type": [
"null",
{
"type": "record",
"name": "AdvancedSearchExport",
"fields": [
{
"name": "allOfTheWords",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "anyOfTheWords",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "noneOfTheWords",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "hashtags",
"type": {
"type": "array",
"items": {
"type": "string",
"avro.java.string": "String"
}
}
},
{
"name": "keyword",
"type": {
"type": "string",
"avro.java.string": "String"
}
},
{
"name": "exactPhrase",
"type": {
"type": "string",
"avro.java.string": "String"
}
}
]
}
],
"default": null
}
]
}
],
"default": null
}
]
}
}
]
}
}
]
}
JSON message:
{
"context": {
"user_id": "1",
"group_id": "1",
"organization_id": "1"
},
"exportInfo": {
"exportId": "93874dd7-35d7-4f1f-8cf8-051c606d920b",
"exportType": "json",
"exportQuery": {
"postExport": {
"postId": "dd",
"isCommentIncluded": false
},
"feedExport": {
"accounts": [
"1677143852565319"
],
"recordTypes": [],
"actions": [],
"contentTypes": [],
"startDate": 0,
"endDate": 0,
"advancedSearch": {
"allOfTheWords": [
"string"
],
"anyOfTheWords": [
"string"
],
"noneOfTheWords": [
"string"
],
"hashtags": [
"string"
],
"keyword": "string",
"exactPhrase": "string"
}
}
}
}
}
I would appreciate it if someone could help me to understand what the issue is.
Both your JSON and your Avro schema look fine.
You are facing this issue because the JSON doesn't conform to Avro's JSON encoding spec.
If you convert your JSON accordingly, it will look something like this:
{
"context": {
"user_id": "1",
"group_id": "1",
"organization_id": "1"
},
"exportInfo": {
"exportId": "93874dd7-35d7-4f1f-8cf8-051c606d920b",
"exportType": "json",
"exportQuery": {
"postExport": {
"com.example.avro.model.PostExport": {
"postId": "dd",
"isCommentIncluded": false
}
},
"feedExport": {
"com.example.avro.model.FeedExport": {
"accounts": [
"1677143852565319"
],
"recordTypes": [],
"actions": [],
"contentTypes": [],
"startDate": 0,
"endDate": 0,
"advancedSearch": {
"com.example.avro.model.AdvancedSearchExport": {
"allOfTheWords": [
"string"
],
"anyOfTheWords": [
"string"
],
"noneOfTheWords": [
"string"
],
"hashtags": [
"string"
],
"keyword": "string",
"exactPhrase": "string"
}
}
}
}
}
}
}

How to create a stream in KSQL from a topic with a decimal type column

I want to create a stream from a Kafka topic that monitors a MySQL table. The MySQL table has columns of type decimal(16,4), and when I create the stream with this command:
create stream test with (KAFKA_TOPIC='dbServer.Kafka.DailyUdr',VALUE_FORMAT='AVRO');
the stream is created and runs, but the columns of type decimal(16,4) don't appear in the resulting stream.
Source topic value schema:
{
"type": "record",
"name": "Envelope",
"namespace": "dbServer.Kafka.DailyUdr",
"fields": [
{
"name": "before",
"type": [
"null",
{
"type": "record",
"name": "Value",
"fields": [
{
"name": "UserId",
"type": "int"
},
{
"name": "NationalCode",
"type": "string"
},
{
"name": "TotalInputOcted",
"type": "int"
},
{
"name": "TotalOutputOcted",
"type": "int"
},
{
"name": "Date",
"type": "string"
},
{
"name": "Service",
"type": "string"
},
{
"name": "decimalCol",
"type": [
"null",
{
"type": "bytes",
"scale": 4,
"precision": 16,
"connect.version": 1,
"connect.parameters": {
"scale": "4",
"connect.decimal.precision": "16"
},
"connect.name": "org.apache.kafka.connect.data.Decimal",
"logicalType": "decimal"
}
],
"default": null
}
],
"connect.name": "dbServer.Kafka.DailyUdr.Value"
}
],
"default": null
},
{
"name": "after",
"type": [
"null",
"Value"
],
"default": null
},
{
"name": "source",
"type": {
"type": "record",
"name": "Source",
"namespace": "io.debezium.connector.mysql",
"fields": [
{
"name": "version",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "connector",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "name",
"type": "string"
},
{
"name": "server_id",
"type": "long"
},
{
"name": "ts_sec",
"type": "long"
},
{
"name": "gtid",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "file",
"type": "string"
},
{
"name": "pos",
"type": "long"
},
{
"name": "row",
"type": "int"
},
{
"name": "snapshot",
"type": [
{
"type": "boolean",
"connect.default": false
},
"null"
],
"default": false
},
{
"name": "thread",
"type": [
"null",
"long"
],
"default": null
},
{
"name": "db",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "table",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "query",
"type": [
"null",
"string"
],
"default": null
}
],
"connect.name": "io.debezium.connector.mysql.Source"
}
},
{
"name": "op",
"type": "string"
},
{
"name": "ts_ms",
"type": [
"null",
"long"
],
"default": null
}
],
"connect.name": "dbServer.Kafka.DailyUdr.Envelope"
}
My problem is with the decimalCol column.
KSQL does not yet support the DECIMAL data type.
There is an issue here that you can track and upvote if you think it would be useful.
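Not part of the original answer, but one workaround (assuming you control the Debezium connector) is to avoid emitting the Connect Decimal type at the source: Debezium's decimal.handling.mode option can emit DECIMAL columns as double or string, which KSQL can read. A rough sketch of updating the connector over the Kafka Connect REST API follows; the Connect URL and connector name are placeholders.
import requests

# Sketch only: the Connect URL and connector name are placeholders.
config = {
    "connector.class": "io.debezium.connector.mysql.MySqlConnector",
    # ... the rest of the existing connector settings ...
    # Emit decimal(16,4) columns as plain doubles instead of the
    # org.apache.kafka.connect.data.Decimal (bytes) type shown above.
    "decimal.handling.mode": "double",
}

resp = requests.put(
    "http://localhost:8083/connectors/mysql-source/config",
    json=config,
)
print(resp.status_code, resp.json())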

avro.io.AvroTypeException: The datum [object] is not an example of the schema

I have been struggling with this issue for quite some time. I am working with AvroProducer (confluent-kafka) and getting an error related to the schema I defined.
Here is the complete stack trace of the issue I am getting:
raise AvroTypeException(self.writer_schema, datum)
avro.io.AvroTypeException: The datum {'totalDifficulty': 2726165051, 'stateRoot': '0xf09bd6730b3ae7f5728836564837d7f776a8f7333628c8b84cb57d7c6d48ebba', 'sha3Uncles': '0x1dcc4de8dec75d7aab85b567b6ccd41ad312451b948a7413f0a142fd40d49347', 'size': 538, 'logs': [], 'gasLimit': 8000000, 'mixHash': '0x410b2b19519be16496727c93515f399072ffecf06defe4913d00eb4d10bb7351', 'logsBloom': '0x00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000', 'nonce': '0x18dc6c0d30839c91', 'proofOfAuthorityData': '0xd883010817846765746888676f312e31302e34856c696e7578', 'number': 5414, 'timestamp': 1552577641, 'difficulty': 589091, 'gasUsed': 0, 'miner': '0x48FA5EBc2f0D82B5D52faAe624Fa2426998ab492', 'hash': '0x71259991acb407a85befa8b3c5df26a94a11a6c08f92f3e3b7c9c0e8e1f5916d', 'transactionsRoot': '0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421', 'receiptsRoot': '0x56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421', 'transactions': [], 'parentHash': '0x9f0c25eeab86fc144296cb034c94857beed331936016d60c0986a35ac07d9c68', 'uncles': []} is not an example of the schema {
"type": "record",
"name": "value",
"namespace": "exporter.value.opsnetBlock",
"fields": [
{
"type": "int",
"name": "difficulty"
},
{
"type": "string",
"name": "proofOfAuthorityData"
},
{
"type": "int",
"name": "gasLimit"
},
{
"type": "int",
"name": "gasUsed"
},
{
"type": "string",
"name": "hash"
},
{
"type": "string",
"name": "logsBloom"
},
{
"type": "int",
"name": "size"
},
{
"type": "string",
"name": "miner"
},
{
"type": "string",
"name": "mixHash"
},
{
"type": "string",
"name": "nonce"
},
{
"type": "int",
"name": "number"
},
{
"type": "string",
"name": "parentHash"
},
{
"type": "string",
"name": "receiptsRoot"
},
{
"type": "string",
"name": "sha3Uncles"
},
{
"type": "string",
"name": "stateRoot"
},
{
"type": "int",
"name": "timestamp"
},
{
"type": "int",
"name": "totalDifficulty"
},
{
"type": "string",
"name": "transactionsRoot"
},
{
"type": {
"type": "array",
"items": "string"
},
"name": "transactions"
},
{
"type": {
"type": "array",
"items": "string"
},
"name": "uncles"
},
{
"type": {
"type": "array",
"items": {
"type": "record",
"name": "Child",
"namespace": "exporter.value.opsnetBlock",
"fields": [
{
"type": "string",
"name": "address"
},
{
"type": "string",
"name": "blockHash"
},
{
"type": "int",
"name": "blockNumber"
},
{
"type": "string",
"name": "data"
},
{
"type": "int",
"name": "logIndex"
},
{
"type": "boolean",
"name": "removed"
},
{
"type": {
"type": "array",
"items": "string"
},
"name": "topics"
},
{
"type": "string",
"name": "transactionHash"
},
{
"type": "int",
"name": "transactionIndex"
}
]
}
},
"name": "logs"
}
]
}
Can anybody please tell me where I am going wrong here?
Thanks in advance.
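Not an answer, but a quick way to reproduce the same check locally while debugging. This is only a sketch: the file names are made up, and it assumes the classic avro Python package where the exception is exposed as avro.io.AvroTypeException (newer releases expose it as avro.errors.AvroTypeException).
import io
import json

import avro.schema
from avro.io import AvroTypeException, BinaryEncoder, DatumWriter

# Made-up file names: save the schema and the failing datum shown above.
schema = avro.schema.parse(open("opsnet_block_value.avsc").read())
datum = json.load(open("block_datum.json"))

writer = DatumWriter(schema)
try:
    # Same write path the AvroProducer goes through, so it raises the same
    # AvroTypeException when a field doesn't match the schema.
    writer.write(datum, BinaryEncoder(io.BytesIO()))
    print("datum matches the schema")
except AvroTypeException as exc:
    print("schema mismatch:", exc)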