Syncing Postgres & Elasticsearch with Kafka Connect (Debezium) - postgresql

I have set up a Postgres image as well as an Elasticsearch one using Docker.
What I'm trying to achieve: I have a Vehicle entity (on a microservice with Spring Data JPA), as well as a Vehicle document (on a microservice with Spring Data Elasticsearch).
import java.util.UUID;

import org.springframework.data.annotation.Id;
import org.springframework.data.elasticsearch.annotations.Document;
import org.springframework.data.elasticsearch.annotations.Field;
import lombok.Builder;
import lombok.Data;

@Document(indexName = "vehicles")
@Builder
@Data
public class Vehicle {
    @Id
    private UUID id;
    @Field(name = "vin")
    private String vin;
    @Field(name = "brand")
    private String brand;
    @Field(name = "model")
    private String model;
}
I also have JSON configs for Kafka Connect, one for Elasticsearch and one for Postgres:
{
  "name": "eh-vehicles-sink",
  "config": {
    "connector.class": "io.confluent.connect.elasticsearch.ElasticsearchSinkConnector",
    "tasks.max": "1",
    "topics": "vehicles",
    "connection.url": "http://elasticsearch:9200",
    "key.ignore": "true",
    "type.name": "vehicles",
    "index.mapping.dynamic": false,
    "key.converter": "org.apache.kafka.connect.json.JsonConverter",
    "value.converter": "org.apache.kafka.connect.json.JsonConverter",
    "key.converter.schemas.enable": "false",
    "value.converter.schemas.enable": "false"
  }
}
{
  "name": "postgres-vehicles-source",
  "config": {
    "connector.class": "io.debezium.connector.postgresql.PostgresConnector",
    "tasks.max": "1",
    "plugin.name": "pgoutput",
    "database.hostname": "postgres",
    "database.port": "5432",
    "database.user": "postgres",
    "database.password": "postgres",
    "database.dbname": "postgres",
    "schema.include.list": "public",
    "include.schema.changes": "true",
    "database.server.name": "Vehicles",
    "database.server.id": "5401",
    "database.history.kafka.bootstrap.servers": "kafka:9092",
    "database.history.kafka.topic": "public.history",
    "key.converter": "org.apache.kafka.connect.json.JsonConverter",
    "value.converter": "org.apache.kafka.connect.json.JsonConverter",
    "key.converter.schemas.enable": "false",
    "value.converter.schemas.enable": "false",
    "transforms": "Reroute",
    "transforms.Reroute.type": "io.debezium.transforms.ByLogicalTableRouter",
    "transforms.Reroute.topic.regex": "(.*)vehicles",
    "transforms.Reroute.topic.replacement": "vehicles",
    "transforms.Reroute.key.field.name": "id",
    "transforms.Reroute.key.enforce.uniqueness": "false"
  }
}
The problem is that after an entity is persisted in Postgres, Kafka will send it to Elasticsearch, but it will be stored in the following format:
"hits": [
{
"_index": "vehicles",
"_type": "_doc",
"_id": "vehicles+0+0",
"_score": 1,
"_source": {
"op": "c",
"before": null,
"after": {
"generation": "F10",
"cylindrical_capacity": 2000,
"country": "Germany",
"tva_deductible": false,
"km": "100000",
"fuel": "Diesel",
"first_owner": "John Doe",
"production_date": 15126,
"created_at": null,
"traction": "ALLWHEELS",
"owner_account_id": "4a2ac2a2-3323-4b42-8960-b044897a180c",
"first_registration_date": 15354,
"colour": "Black",
"soft_deleted": false,
"transmission": "AUTOMATIC",
"accident_free": true,
"vin": "WBAJB51090B513560",
"model": "5 Series",
"id": "739c9d56-d50b-4b17-a86a-e2561f54c1a9",
"power": 180,
"brand": "BMW",
"favorite_accounts": null
},
"source": {
"schema": "public",
"sequence": "[\"24015744\",\"24015744\"]",
"xmin": null,
"connector": "postgresql",
"lsn": 24015744,
"name": "Vehicles",
"txId": 506,
"version": "1.8.1.Final",
"ts_ms": 1676855693566,
"snapshot": "false",
"db": "postgres",
"table": "vehicles"
},
"ts_ms": 1676855694066,
"transaction": null
}
}
]
This is a problem when fetching it in the Elasticsearch microservice, because the payload is wrapped in another object and the wrong id is fetched, unless I do some additional processing, which I'd rather avoid since it feels like boilerplate.
How can I configure Kafka/Debezium so that only the entity is stored in the vehicles index, without additional metadata like "after"?
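For what it's worth, Debezium ships a single message transform, io.debezium.transforms.ExtractNewRecordState, that unwraps the change-event envelope and forwards only the "after" state. Below is a minimal sketch of how the source connector's transforms chain might look with it added in front of the existing reroute; the drop.tombstones setting is an extra assumption on my part, and everything should be checked against the Debezium version in use:

"transforms": "unwrap,Reroute",
"transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
"transforms.unwrap.drop.tombstones": "false",
"transforms.Reroute.type": "io.debezium.transforms.ByLogicalTableRouter",
"transforms.Reroute.topic.regex": "(.*)vehicles",
"transforms.Reroute.topic.replacement": "vehicles",
"transforms.Reroute.key.field.name": "id",
"transforms.Reroute.key.enforce.uniqueness": "false"

On the sink side, the "vehicles+0+0" style _id comes from "key.ignore": "true", which derives document ids from topic+partition+offset. Setting key.ignore to false and pulling the id out of the record key with the stock org.apache.kafka.connect.transforms.ExtractField$Key SMT is one way to make Elasticsearch use the entity's UUID as _id; treat that as a direction to verify rather than a drop-in config.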

Related

How can I save Kafka message Key in document for MongoDB Sink?

Right now I have a MongoDB Sink and it saves the value of incoming AVRO messages correctly.
I need it to save the Kafka Message Key in the document.
I have tried org.apache.kafka.connect.transforms.HoistField$Key in order to add the key to the value that is being saved, but this did nothing. It did work when using ProvidedInKeyStrategy, but I don't want my _id to be the Kafka message Key.
My configuration:
"config": {
"connector.class": "com.mongodb.kafka.connect.MongoSinkConnector",
"connection.uri": "mongodb://mongo1",
"database": "mongodb",
"collection": "sink",
"topics": "topics.foo",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://schema-registry:8081",
"key.converter": "io.confluent.connect.avro.AvroConverter",
"key.converter.schema.registry.url": "http://schema-registry:8081",
"transforms": "hoistKey",
"transforms.hoistKey.type":"org.apache.kafka.connect.transforms.HoistField$Key",
"transforms.hoistKey.field":"kafkaKey"
}
Kafka message schema:
{
"type": "record",
"name": "Smoketest",
"namespace": "some_namespace",
"fields": [
{
"name": "timestamp",
"type": "int",
"logicalType": "timestamp-millis"
}
]
}
Kafka key schema:
[
{
"type": "enum",
"name": "EnvironmentType",
"namespace": "some_namespace",
"doc": "DEV",
"symbols": [
"Dev",
"Test",
"Accept",
"Sandbox",
"Prod"
]
},
{
"type": "record",
"name": "Key",
"namespace": "some_namespace",
"doc": "The standard Key type that is used as key",
"fields": [
{
"name": "conversation_id",
"doc": "The first system producing an event sets this field",
"type": "string"
},
{
"name": "broker_key",
"doc": "The key of the broker",
"type": "string"
},
{
"name": "user_id",
"doc": "User identification",
"type": [
"null",
"string"
]
},
{
"name": "application",
"doc": "The name of the application",
"type": [
"null",
"string"
]
},
{
"name": "environment",
"doc": "The type of environment",
"type": "type.EnvironmentType"
}
]
}
]
Using https://github.com/f0xdx/kafka-connect-wrap-smt I can now wrap all the data from the Kafka message into a single document to save in my MongoDB sink.
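For context, wiring that SMT into the sink config above would look roughly like the fragment below. This is only a sketch: the transform class name is my assumption based on the project's package naming, so check the repository's README for the exact class and its options.

"transforms": "wrap",
"transforms.wrap.type": "com.github.f0xdx.Wrap$Value"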

Debezium with MongoDB - Produced record's payload contains backslash

I'm implementing a data extract using the Debezium MongoDB connector, building upon the official documentation: https://debezium.io/documentation/reference/stable/connectors/mongodb.html
Everything is working quite fine, except that the payload contains backslashes, as you can see in the after attribute. Oddly enough, the source attribute is fine.
{
"after": "{\"_id\": {\"$oid\": \"63626d5993801d8fd1140993\"},\"document\": \"29973569000204\",\"document_type\": \"CNPJ\"}",
"patch": null,
"filter": null,
"source": {
"version": "1.7.1.Final",
"connector": "mongodb",
"name": "xxxxxxxxxx",
"ts_ms": 8466513,
"snapshot": "false",
"db": "database",
"sequence": null,
"rs": "atlas-iurhise-shard-0",
"collection": "mongo_collection",
"ord": 1,
"h": null,
"tord": 4,
"stxnid": "281f4230-d8cc-3d23-a556-89923b45e25f:168"
},
"op": "c",
"ts_ms": 1667394905422,
"transaction": null
}
I tried this solution, but it doesn't work for me: Debezium Outbox Pattern property transforms.outbox.table.expand.json.payload not working
These are my settings:
{
"name": "DebeziumDataExtract",
"config": {
"connector.class": "io.debezium.connector.mongodb.MongoDbConnector",
"tasks.max": "3",
"mongodb.hosts": "removed",
"mongodb.name": "removed",
"mongodb.user": "removed",
"mongodb.password": "removed",
"mongodb.ssl.enabled": "true",
"collection.whitelist": "removed",
"key.converter": "org.apache.kafka.connect.json.JsonConverter",
"value.converter": "org.apache.kafka.connect.storage.StringConverter",
"hstore.handling.mode": "json",
"decimal.handling.mode": "string",
"key.converter.schemas.enable": "false",
"value.converter.schemas.enable": "false",
"heartbeat.interval.ms": "1000",
"heartbeat.topics.prefix": "removed",
"topic.creation.default.replication.factor": 3,
"topic.creation.default.partitions": 1,
"topic.creation.default.cleanup.policy": "compact",
"topic.creation.default.compression.type": "lz4",
"transforms": "unwrap",
"transforms.unwrap.collection.expand.json.payload": "true"
}
}
I was expecting a payload like this:
{
"after": {
"_id": {
"$oid": "63626d5993801d8fd1140993"
},
"document": "29973585214796",
"document_type": "CNPJ"
},
"patch": null,
"filter": null,
"source": {
"version": "1.7.1.Final",
"connector": "mongodb",
"name": "xxxxxxxxxx",
"ts_ms": 8466513,
"snapshot": "false",
"db": "database",
"sequence": null,
"rs": "atlas-iurhise-shard-0",
"collection": "mongo_collection",
"ord": 1,
"h": null,
"tord": 4,
"stxnid": "281f4230-d8cc-3d23-a556-89923b45e25f:168"
},
"op": "c",
"ts_ms": 1667394905422,
"transaction": null
}
Could someone help me?
########## UPDATES ##########
After @onecricketeer's comments I tried this:
{
"name": "DebeziumTransportPlanner",
"config": {
"connector.class": "io.debezium.connector.mongodb.MongoDbConnector",
"tasks.max": "3",
"mongodb.hosts": "stg-transport-planner-0-shard-00-00-00.xmapa.mongodb.net,stg-transport-planner-0-shard-00-01.xmapa.mongodb.net,stg-transport-planner-0-shard-00-02.xmapa.mongodb.net",
"mongodb.name": "stg-transport-planner-01",
"mongodb.user": "oploguser-stg",
"mongodb.password": "vCh1NtV4PoY8PeSJ",
"mongodb.ssl.enabled": "true",
"collection.whitelist": "stg-transport-planner-01[.]aggregated_transfers",
"key.converter": "org.apache.kafka.connect.json.JsonConverter",
"value.converter": "org.apache.kafka.connect.json.JsonConverter",
"hstore.handling.mode": "json",
"decimal.handling.mode": "string",
"key.converter.schemas.enable": "false",
"value.converter.schemas.enable": "false",
"heartbeat.interval.ms": "1000",
"heartbeat.topics.prefix": "__debeziumtransport-planner-heartbeat",
"topic.creation.default.replication.factor": 3,
"topic.creation.default.partitions": 1,
"topic.creation.default.cleanup.policy": "compact",
"topic.creation.default.compression.type": "lz4",
"transforms": "unwrap",
"transforms.unwrap.type":"io.debezium.connector.mongodb.transforms.ExtractNewDocumentState",
"transforms.unwrap.collection.expand.json.payload": "true",
"transforms.unwrap.collection.fields.additional.placement": "route_external_id:header,transfer_index:header"
}
}
You need to use JsonConverter instead of StringConverter if you want the data to be a JSON object rather than a String.
Also, you are missing transforms.unwrap.type

Optimal Kafka Connect Hourly S3AvroSink Config

{
"name":"{{name}}",
"tasks.max": "6", //have 6 partitions for this topic
"topics": "{{topic}}",
"connector.class": "io.confluent.connect.s3.S3SinkConnector",
"key.converter": "io.confluent.connect.avro.AvroConverter",
"key.converter.schemas.enable": "true",
"key.converter.schema.registry.url": "xx",
"key.converter.key.subject.name.strategy": "io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schemas.enable": "true",
"value.converter.schema.registry.url": "xx",
"value.converter.value.subject.name.strategy": "io.confluent.kafka.serializers.subject.TopicRecordNameStrategy",
"errors.retry.timeout":"600000",
"errors.log.enable":"true",
"errors.log.include.messages":"true",
"schema.compatibility": "BACKWARD",
"format.class": "io.confluent.connect.s3.format.avro.AvroFormat",
"flush.size": "100000",
"rotate.schedule.interval.ms": "3600000",
"rotate.interval.ms": "3600000",
"enhanced.avro.schema.support": "true",
"connect.meta.data": "false",
"partitioner.class": "{{partitioner}}somepartitioner",
"partition.duration.ms": "3600000",
"path.format": "'avro/event=?eventClass?/tenant=?tenant?/date'=YYYY-MM-dd/'hour'=HH",
"locale": "en",
"timezone": "UTC",
"timestamp.extractor": "RecordField",
"timestamp.field": "{{timestampField}}",
"storage.class": "io.confluent.connect.s3.storage.S3Storage",
"s3.bucket.name": "somebucket",
"s3.region": "region",
"s3.part.size": "5242880",
"offset.flush.interval.ms": "1200000"
}
The record count for the topic is around 739,180 and the size is about 1.1 GB.
I'm not fully sure whether my config is correct or whether I can improve it somewhere. I want to flush in two cases: hourly, or if the size hits 5 GB.
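As far as I know, the S3 sink has no byte-size trigger: flush.size is a record count (not bytes), rotate.interval.ms rotates based on the record timestamp returned by timestamp.extractor, and rotate.schedule.interval.ms rotates on wall-clock time (which is why timezone is required); s3.part.size only controls the multipart-upload chunk size. A minimal sketch of the properties that would drive purely hourly, wall-clock commits, with flush.size set high enough that the schedule normally wins, is below; the exact count is an assumption to be tuned against the ~739,180 records / 1.1 GB per hour:

"flush.size": "1000000",
"rotate.schedule.interval.ms": "3600000",
"timezone": "UTC",
"timestamp.extractor": "RecordField",
"timestamp.field": "{{timestampField}}"

Note that offset.flush.interval.ms is a Connect worker setting rather than a connector property, so it is most likely ignored in this config.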

Kafka sink to MongoDB: how do I set the "_id" field to an existing value in one of the columns in my topic?

I have the following topic (JSON, not Avro) generated by Debezium:
"payload": {"id": 1, "name": "test", "uuid": "f9a96ea4-3ff9-480f-bf8a-ee53a1e6e583"}
How do I set the "_id" field (in the Mongo collection) to the same value as "uuid"?
This is my SINK config:
{
"name": "mongo-sink",
"config": {
"connector.class": "com.mongodb.kafka.connect.MongoSinkConnector",
"tasks.max": 3,
"key.converter": "org.apache.kafka.connect.json.JsonConverter",
"value.converter": "org.apache.kafka.connect.json.JsonConverter",
"topics": "s4farm.animal",
"connection.uri": "mongodb://user:password#host:port/?authSource=database",
"database": "database",
"collection": "s4farm_animal",
"document.id.strategy": "com.mongodb.kafka.connect.sink.processor.id.strategy.PartialValueStrategy",
"value.projection.list": "id",
"value.projection.type": "whitelist",
"writemodel.strategy": "com.mongodb.kafka.connect.sink.writemodel.strategy.ReplaceOneBusinessKeyStrategy"
}
}
Can you help me?
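One direction that might be worth trying, as a sketch rather than a verified config: with PartialValueStrategy the _id is derived from the fields listed in value.projection.list, so projecting uuid instead of id should key documents off that column. Whether the resulting _id ends up as the bare string or a small wrapping document is something to confirm against the MongoDB sink documentation.

"document.id.strategy": "com.mongodb.kafka.connect.sink.processor.id.strategy.PartialValueStrategy",
"value.projection.list": "uuid",
"value.projection.type": "whitelist",
"writemodel.strategy": "com.mongodb.kafka.connect.sink.writemodel.strategy.ReplaceOneBusinessKeyStrategy"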

Is there a way to transform key field values to lower case in the Debezium SQL Server source connector? [closed]

I want to transform SQL Server column names to lower case while storing them in a Kafka topic. I am using Debezium as my source connector.
This can be done using the Kafka Connect Common Transformations by Jeremy Custenborder.
SQL Server table:
Id   Name  Description  Weight  Pro_Id
101  aaa   Sample_Test  3.14    2020-02-21 13:32:06.5900000
102  eee   testdata1    3.14    2020-02-21 13:32:06.5900000
Step 1: Download the Kafka Connect Common Transformations JAR file by Jeremy Custenborder from Confluent Hub via this link
Step 2: Place the JAR file in /usr/share/java or /kafka/libs, depending on your Kafka environment
Step 3: Create the Debezium SQL Server source connector
{
"name": "sqlserver_src_connector",
"config": {
"connector.class": "io.debezium.connector.sqlserver.SqlServerConnector",
"database.server.name": "sqlserver",
"database.hostname": "*.*.*.*",
"database.port": "1433",
"database.user": "username",
"database.password": "password",
"database.dbname": "db_name",
"table.whitelist": "dbo.tablename",
"transforms": "unwrap,changeCase",
"transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
"transforms.changeCase.type" : "com.github.jcustenborder.kafka.connect.transform.common.ChangeCase$Value",
"transforms.changeCase.from" : "UPPER_UNDERSCORE",
"transforms.changeCase.to" : "LOWER_UNDERSCORE",
"database.history.kafka.bootstrap.servers": "*.*.*.*",
"database.history.kafka.topic": "schema-changes-tablename"
}
}
Step 4: Kafka topic data
{
"schema": {
"type": "struct",
"fields": [
{
"type": "int32",
"optional": false,
"field": "id"
},
{
"type": "string",
"optional": false,
"field": "name"
},
{
"type": "string",
"optional": true,
"field": "description"
},
{
"type": "double",
"optional": true,
"field": "weight"
},
{
"type": "int64",
"optional": false,
"name": "io.debezium.time.NanoTimestamp",
"version": 1,
"field": "pro_id"
}
],
"optional": true,
"name": "sqlserver.dbo.tablename"
},
"payload": {
"id": 101,
"name": "aaa",
"description": "Sample_Test",
"weight": 3.14,
"pro_id": 1582291926590000000
}
}
{
"schema": {
"type": "struct",
"fields": [
{
"type": "int32",
"optional": false,
"field": "id"
},
{
"type": "string",
"optional": false,
"field": "name"
},
{
"type": "string",
"optional": true,
"field": "description"
},
{
"type": "double",
"optional": true,
"field": "weight"
},
{
"type": "int64",
"optional": false,
"name": "io.debezium.time.NanoTimestamp",
"version": 1,
"field": "pro_id"
}
],
"optional": true,
"name": "sqlserver.dbo.tablename"
},
"payload": {
"id": 102,
"name": "eee",
"description": "testdata1",
"weight": 3.14,
"pro_id": 1582291926590000000
}
}
Thanks for the help, Jiri Pechanec and Chris Cranford (@Naros) from the Debezium community.