How can I get a specific header from the response body in Kafka Connect - apache-kafka

There is a response that belongs to Apache Kafka Connect, and I want to get just the "Id" from that body. Is there any way to eliminate the rest of the body?
{"schema":{"type":"struct","fields":[{"type":"int32","optional":false,"field":"Id"},
{"type":"string","optional":false,"field":"__dbz__physicalTableIdentifier"}],
"optional":false,"name":"****.Key"},
"payload":{"Id":20030726,"__dbz__physicalTableIdentifier":
"test.***.dbo.Log"}} Timestamp: 2023-02-16 11:05:22.496 Headers: empty
{
"Id": 20030726,
"Date": 1652485947593,
"Thread": "ClusteredScheduler_Worker-9",
"Level": "DEBUG",
"Logger": "Quartz.Core.JobRunShell",
"Message": "Trigger instruction : NoInstruction",
"Exception": "",
"HostName": "****",
"Domain": "*****",
"Identity": "",
"UserName": "*****\\SYSTEM",
"Tier": "AppServer",
"ActivityId": "(null)",
"SessionId": "(null)",
"RequestPath": "(null)",
"DiagnosticsData": "(null)"
}
I used the request below to create the connector.
curl -X POST -H "Accept:application/json" -H "Content-Type:application/json"
localhost:8083/connectors/ -d '{
"name": "****",
"config": {
"connector.class" : "io.debezium.connector.sqlserver.SqlServerConnector",
"tasks.max" : "1",
"database.server.name" : "test",
"database.hostname" : "****",
"database.port" : "1433",
"database.user" : "*****",
"database.password" : "*****",
"database.dbname" : "******",
"table.include.list" : "dbo.Merchant",
"database.history.kafka.bootstrap.servers" : "******:9092",
"database.history.kafka.topic" : "schema-changes.****",
"tombstones.on.delete" : "false",
"transforms": "Reroute, unwrap",
"transforms.Reroute.type": "io.debezium.transforms.ByLogicalTableRouter",
"transforms.Reroute.topic.regex": "(.*)",
"transforms.Reroute.topic.replacement": "test.****",
"transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState"
}
}'

You can use the ExtractField transform to keep only one field.
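For example, a minimal sketch of how your existing transform chain could be extended; the extractId alias is just an illustrative name, and it has to run after unwrap so the value is already flattened (use ExtractField$Key instead if you want the Id from the record key):
"transforms": "Reroute, unwrap, extractId",
"transforms.extractId.type": "org.apache.kafka.connect.transforms.ExtractField$Value",
"transforms.extractId.field": "Id"
After this, each record's value is just the Id rather than the whole struct.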

Related

Kafka Connect SftpCSVSourceConnector schema configuration

I'm trying to set up an SftpCSVSourceConnector in my local env and I'm having some trouble setting a schema for the connector. This is what I'm trying to do:
curl -i -X PUT -H "Accept:application/json" \
-H "Content-Type:application/json" http://localhost:8083/connectors/nc-csv-02/config \
-d '{
"tasks.max" : "1",
"connector.class" : "io.confluent.connect.sftp.SftpCsvSourceConnector",
"kafka.topic": "sftp-csv-00",
"cleanup.policy":"NONE",
"behavior.on.error":"IGNORE",
"key.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"input.path" : "/",
"csv.separator.char" : 59,
"finished.path" : "/finished",
"error.path" : "/error",
"schema.generation.key.fields" : "msisdn",
"input.file.pattern" : ".*\\.dat",
"schema.generation.enabled" : "false",
"csv.first.row.as.header" : "true",
"key.schema":"{\"fields\":[{\"default\":null,\"name\":\"msisdn\",\"type\":[\"null\",\"string\"]}],\"name\":\"NCKeySchema\",\"type\":\"record\"}",
"value.schema":"{\"name\":\"NCPortabilityMovementEvent\",\"type\":\"record\",\"fields\":[{\"default\":null,\"name\":\"action\",\"type\":[\"null\",\"string\"]},{\"default\":null,\"name\":\"msisdn\",\"type\":[\"null\",\"string\"]},{\"default\":null,\"name\":\"previousNRN\",\"type\":[\"null\",\"string\"]},{\"default\":null,\"name\":\"newNRN\",\"type\":[\"null\",\"string\"]},{\"default\":null,\"name\":\"effectiveDate\",\"type\":[\"null\",\"string\"]},{\"default\":null,\"name\":\"referenceID\",\"type\":[\"null\",\"string\"]}]}",
"sftp.username":"tester",
"sftp.password":"password",
"sftp.host":"192.168.1.2",
"sftp.port":"22"
}'
The exception I see in the worker task is
org.apache.kafka.common.config.ConfigException: Invalid value com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException: Unrecognized field "fields" (class com.github.jcustenborder.kafka.connect.utils.jackson.SchemaSerializationModule$Storage), not marked as ignorable (10 known properties: "defaultValue", "valueSchema", "doc", "type", "name", "keySchema", "version", "parameters", "isOptional", "fieldSchemas"])
at [Source: (String)"{"fields":[{"default":null,"name":"msisdn","type":["null","string"]}],"name":"NCKeySchema","type":"record"}"; line: 1, column: 12] (through reference chain: com.github.jcustenborder.kafka.connect.utils.jackson.SchemaSerializationModule$Storage["fields"]) for configuration Could not read schema from 'key.schema'
at io.confluent.connect.sftp.source.SftpSourceConnectorConfig.readSchema(SftpSourceConnectorConfig.java:334)
at io.confluent.connect.sftp.source.SftpSourceConnectorConfig.<init>(SftpSourceConnectorConfig.java:117)
at io.confluent.connect.sftp.source.SftpCsvSourceConnectorConfig.<init>(SftpCsvSourceConnectorConfig.java:156)
at io.confluent.connect.sftp.SftpCsvSourceConnector.start(SftpCsvSourceConnector.java:44)
at org.apache.kafka.connect.runtime.WorkerConnector.doStart(WorkerConnector.java:185)
at org.apache.kafka.connect.runtime.WorkerConnector.start(WorkerConnector.java:210)
at org.apache.kafka.connect.runtime.WorkerConnector.doTransitionTo(WorkerConnector.java:349)
at org.apache.kafka.connect.runtime.WorkerConnector.doTransitionTo(WorkerConnector.java:332)
at org.apache.kafka.connect.runtime.WorkerConnector.doRun(WorkerConnector.java:141)
at org.apache.kafka.connect.runtime.WorkerConnector.run(WorkerConnector.java:118)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
The schemas I'm trying to use for key and value are
{
"fields": [
{
"default": null,
"name": "msisdn",
"type": [
"null",
"string"
]
}
],
"name": "NCKeySchema",
"type": "record"
}
and
{
"name" : "NCPortabilityMovementEvent",
"type" : "record",
"fields" : [
{
"default" : null,
"name" : "action",
"type" : [
"null",
"string"
]
},
{
"default" : null,
"name" : "msisdn",
"type" : [
"null",
"string"
]
},
{
"default" : null,
"name" : "previousNRN",
"type" : [
"null",
"string"
]
},
{
"default" : null,
"name" : "newNRN",
"type" : [
"null",
"string"
]
},
{
"default" : null,
"name" : "effectiveDate",
"type" : [
"null",
"string"
]
},
{
"default" : null,
"name" : "referenceID",
"type" : [
"null",
"string"
]
}
]
}
What am I doing wrong here?
When I tried this with schema.generation.enabled=true and removed key.schema and value.schema, the connector worked just fine.
You're providing Avro schemas, which are not correct. You'll need to define Connect schemas, which are type=STRUCT with fieldSchemas. The format itself is not well documented, but there are examples here https://docs.confluent.io/kafka-connect-sftp/current/source-connector/csv_source_connector.html#sftp-connector-csv-with-schema-example
You can find the source code of the schema json deserializer here - https://github.com/jcustenborder/connect-utils/tree/master/connect-utils-jackson/src/main/java/com/github/jcustenborder/kafka/connect/utils/jackson
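As a rough sketch of the difference (based on the property names listed in the exception and the linked example, so treat it as an approximation rather than the definitive format), the key schema would look something like this in the Connect JSON format:
{
  "name" : "NCKeySchema",
  "type" : "STRUCT",
  "isOptional" : false,
  "fieldSchemas" : {
    "msisdn" : {
      "type" : "STRING",
      "isOptional" : true
    }
  }
}
It would then be passed to key.schema as an escaped JSON string, just like the Avro schema was.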

MongoDB as sink connector not capturing data as expected - kafka?

I am currently using a MySQL database as a source connector with the config below. I want to monitor changes to the database and send them to MongoDB.
Here's my source connector config:
curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '''{
"name": "source_mysql_connector",
"config": {
"connector.class": "io.debezium.connector.mysql.MySqlConnector",
"tasks.max": "1",
"database.hostname": "host.docker.internal",
"database.port": "3306",
"database.user": "test",
"database.password": "$apr1$o7RbW.GvrPIY1",
"database.server.id": "8111999",
"database.server.name": "db_source",
"database.include.list": "example",
"database.history.kafka.bootstrap.servers": "broker:29092",
"database.history.kafka.topic": "schema-changes.example",
"database.allowPublicKeyRetrieval":"true",
"include.schema.changes": "true"
}
}'''
Here's my sink connector (MongoDB) config:
curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '''{
"name": "sink_mongodb_connector",
"config": {
"connector.class": "com.mongodb.kafka.connect.MongoSinkConnector",
"tasks.max":"1",
"topics":"db_source.example.employees",
"connection.uri":"mongodb://172.17.0.1:27017/example?w=1&journal=true",
"database":"example",
"collection":"employees",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://schema-registry:8081"
}
}'''
Using this I was able to establish the connection, capture the data changes, and store them in a MongoDB collection for a table called employees.
But the problem is that when I checked the collection in MongoDB, the documents were saved like this:
{ "_id" : ObjectId("60d0e6939e00e22f274ccac1"), "before" : null, "after" : { "id" : NumberLong(11), "name" : "Steve Shining", "team" : "DevOps", "birthday" : 11477 }, "source" : { "version" : "1.5.0.Final", "connector" : "mysql", "name" : "db_source", "ts_ms" : NumberLong("1624303251000"), "snapshot" : "false", "db" : "example", "sequence" : null, "table" : "employees", "server_id" : NumberLong(6030811), "gtid" : null, "file" : "mysql-bin.000003", "pos" : NumberLong(5445), "row" : 2, "thread" : null, "query" : null }, "op" : "c", "ts_ms" : NumberLong("1624303251190"), "transaction" : null }
{ "_id" : ObjectId("60d0e6939e00e22f274ccac2"), "before" : null, "after" : { "id" : NumberLong(12), "name" : "John", "team" : "Support", "birthday" : 6270 }, "source" : { "version" : "1.5.0.Final", "connector" : "mysql", "name" : "db_source", "ts_ms" : NumberLong("1624303251000"), "snapshot" : "false", "db" : "example", "sequence" : null, "table" : "employees", "server_id" : NumberLong(6030811), "gtid" : null, "file" : "mysql-bin.000003", "pos" : NumberLong(5445), "row" : 3, "thread" : null, "query" : null }, "op" : "c", "ts_ms" : NumberLong("1624303251190"), "transaction" : null }
But my MySQL database looks like this:
mysql> select * from employees;
+----+---------------+---------+------------+
| id | name          | team    | birthday   |
+----+---------------+---------+------------+
|  1 | Peter Smith   | DevOps  | 2003-07-21 |
| 11 | Steve Shining | DevOps  | 2001-06-04 |
| 12 | John          | Support | 1987-03-03 |
+----+---------------+---------+------------+
I want my collections to look like this,
{ "_id" : ObjectId("60d0e6939e00e22f274ccac2"), "name" : "John", "team" : "Support", "birthday" : "1987-03-03 "}
What am I doing wrong here? Even the delete message is stored in the collection like this; it is not able to identify the message at all. How do I fix it? Even the dates are not stored properly.
Updated:
curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" localhost:8083/connectors/ -d '''{
"name": "sink_mongodb_connector",
"config": {
"connector.class": "com.mongodb.kafka.connect.MongoSinkConnector",
"tasks.max":"1",
"topics":"db_source.example.employees",
"connection.uri":"mongodb://172.17.0.1:27017/example?w=1&journal=true",
"database":"example",
"collection":"employees",
"value.converter": "io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://schema-registry:8081",
"transforms": "unwrap",
"transforms.unwrap.type": "io.debezium.transforms.ExtractNewRecordState",
"transforms.unwrap.drop.tombstones": "false",
"transforms.unwrap.delete.handling.mode": "rewrite"
}
}'''
The problem is not related to Mongo, but to the default Debezium format.
What you see is the before, after, and additional CDC event metadata.
not able to identify the message
It is, though ... "after" : { "id" : NumberLong(12), "name" : "John", "team" : "Support", "birthday" : 6270 }
You need to extract/flatten the event so that you only get the "after" field
https://debezium.io/documentation/reference/configuration/event-flattening.html
Regarding the birthday / date values, that seems to be a separate issue: Debezium represents MySQL DATE columns as the number of days since the epoch (6270 corresponds to 1987-03-03), so those need their own conversion.

How to do a range filter on Elasticsearch using the "REST request URI"?

I have this simple data on ES:
curl -XPUT localhost:9200/dt/art/1 -d '{ "age": 77 }'
curl -XPUT localhost:9200/dt/art/2 -d '{ "age": 19 }'
curl -XPUT localhost:9200/dt/art/3 -d '{ "age": 42 }'
curl -XPUT localhost:9200/dt/art/4 -d '{ "age": 33 }'
Is it possible to do a range filter on the "age" using the "REST request URI"?
For now, I only get this with the "REST request body":
$ curl "localhost:9200/dt/art/_search?pretty" -d'{
"query": {
"bool": { "must": { "match_all": {} },
"filter": {
"range": {
"age": { "gte": 20, "lte": 50}}}}}}'
{ "took" : 8, "timed_out" : false,
"_shards" : { "total" : 5, "successful" : 5, "skipped" : 0, "failed" : 0 },
"hits" : {
"total" : 2,
"max_score" : 1.0,
"hits" : [
{ "_index" : "dt", "_type" : "art", "_id" : "4", "_score" : 1.0,
"_source" : { "age" : 33 } },
{ "_index" : "dt", "_type" : "art", "_id" : "3", "_score" : 1.0,
"_source" : { "age" : 42 } }
]
}
}
$
Yes.
GET dt/art/_search?q=age:[19+TO+50]
For dates you can use something like this:
_search?q=metrictype:os+datetime:[\"2016-08-30 10:00:00\"+TO+\"2016-08-30 10:30:00\"]+fields:datetime,cpu,disk
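Wrapped in curl like the examples above (a sketch; the Lucene range syntax needs an uppercase TO, spaces in the URI can be encoded as +, and -g stops curl from treating the brackets as a glob):
curl -g "localhost:9200/dt/art/_search?q=age:[20+TO+50]&pretty"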

Meteor, MongoDB - db.collection.find() for OR condition

In MongoDB, I have the following JSONs in a collection named "Jobs"
{
"userId": "testUser1",
"default": "true",
"someData": "data"
},
{
"userId": "testUser1",
"default": "false",
"someData": "data"
},
{
"userId": "testUser2",
"default": "true",
"someData": "data"
},
{
"userId": "testUser2",
"default": "false",
"someData": "data"
}
In Meteor, I am trying to select based on two conditions: select documents where the given userId matches OR default is true.
I have the following code in meteor:
Jobs.find({$or:[{userid:"testUser1"}, {default:"true"}]});
But it is selecting only these two documents:
{
"userId": "testUser1",
"default": "true",
"someData": "data"
},
{
"userId": "testUser1",
"default": "false",
"someData": "data"
}
and it's NOT giving the below document in the response:
{
"userId": "testUser2",
"default": "true",
"someData": "data"
}
I experimented with $where but even that is not working.
How do I retrieve the right documents from MongoDB?
Try without $or
Jobs.find({userId: "testUser2", "default": "true"});
Just to be clear, you're trying to get all three of the records you mention, right? If so, I think your issue is that the 'true' values are strings, not bools, and I'm guessing that you're searching on a bool. Try this:
db.Jobs.find()
{"userId" : "testUser1", "default" : "true", "someData" : "data" }
{"userId" : "testUser1", "default" : "false", "someData" : "data" }
{"userId" : "testUser2", "default" : "true", "someData" : "data" }
{"userId" : "testUser2", "default" : "false", "someData" : "data" }
db.Jobs.find({ $or: [{ userId: 'testUser1' }, { default : 'true' } ] })
{"userId" : "testUser1", "default" : "true", "someData" : "data" }
{"userId" : "testUser1", "default" : "false", "someData" : "data" }
{"userId" : "testUser2", "default" : "true", "someData" : "data" }

Elasticsearch autocomplete or autosuggest by token

I want to create suggestions for completing a term based on tokens, similar to Google-style autocomplete but with only a single token or word.
I'd like to search across filenames, which will be tokenized. E.g. "BRAND_Connect_A1233.jpg" gets tokenized into "brand", "connect", "a1233" and "jpg".
Now I'd like to ask for suggestions for e.g. "Con".
The suggestion should deliver the complete matching tokens, not the full filename:
Connect
Contour
Concept
...
The suggestion for "A12" should be "A1234", "A1233", "A1233" ...
Example
Working with queries, facets and filters works fine.
First I created a mapping including a tokenizer and a filter:
curl -XPUT 'localhost:9200/files/?pretty=1' -d '
{
"settings" : {
"analysis" : {
"analyzer" : {
"filename_search" : {
"tokenizer" : "filename",
"filter" : ["lowercase"]
},
"filename_index" : {
"tokenizer" : "filename",
"filter" : ["lowercase","edge_ngram"]
}
},
"tokenizer" : {
"filename" : {
"pattern" : "[^[;_\\.\\/]\\d]+",
"type" : "pattern"
}
},
"filter" : {
"edge_ngram" : {
"side" : "front",
"max_gram" : 20,
"min_gram" : 2,
"type" : "edgeNGram"
}
}
}
},
"mappings" : {
"file" : {
"properties" : {
"filename" : {
"type" : "string",
"search_analyzer" : "filename_search",
"index_analyzer" : "filename_index"
}
}
}
}
}'
Both analyzers work pretty well:
curl -XGET 'localhost:9200/files/_analyze?pretty=1&text=BRAND_ConnectBlue_A1234.jpg&analyzer=filename_search'
curl -XGET 'localhost:9200/files/_analyze?pretty=1&text=BRAND_ConnectBlue_A1234.jpg&analyzer=filename_index'
Now I added some example data
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConnectBlue_A1234.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_Connect_A1233.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConceptSpace_A1244.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Connect_A1222.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Concept_A1233.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Connect_B1234_.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Contour21_B1233.jpg"}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_ConceptCube_B2233.jpg"}'
curl -X POST "localhost:9200/files/_refresh"
Various approaches to get the desired suggestions did not deliver the expected results. I tried naming the analyzers explicitly and tried various combinations of analyzers and wildcards.
curl -XGET 'localhost:9200/files/_suggest?pretty=true' -d '{
"text" : "con",
"simple_phrase" : {
"phrase" : {
"field" : "filename",
"size" : 15,
"real_word_error_likelihood" : 0.75,
"max_errors" : 0.1,
"gram_size" : 3
}
}
}'
curl -XGET 'localhost:9200/files/_suggest?pretty=true' -d '{
"my-suggestion" : {
"text" : "con",
"term" : {
"field" : "filename",
"analyzer": "filename_index"
}
}
}'
You need to add a special mapping to use the completion suggester, as documented in the official Elasticsearch docs. I've modified your example to show how it works.
First create the index. Note the filename_suggest mapping.
curl -XPUT 'localhost:9200/files/?pretty=1' -d '
{
"settings" : {
"analysis" : {
"analyzer" : {
"filename_search" : {
"tokenizer" : "filename",
"filter" : ["lowercase"]
},
"filename_index" : {
"tokenizer" : "filename",
"filter" : ["lowercase","edge_ngram"]
}
},
"tokenizer" : {
"filename" : {
"pattern" : "[^[;_\\.\\/]\\d]+",
"type" : "pattern"
}
},
"filter" : {
"edge_ngram" : {
"side" : "front",
"max_gram" : 20,
"min_gram" : 2,
"type" : "edgeNGram"
}
}
}
},
"mappings" : {
"file" : {
"properties" : {
"filename" : {
"type" : "string",
"analyzer": "filename_index",
"search_analyzer" : "filename_search"
},
"filename_suggest": {
"type": "completion",
"analyzer": "simple",
"search_analyzer": "simple",
"payloads": true
}
}
}
}
}'
Add some data. Note how the filename_suggest has the input field, which contains the keywords to match on.
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConnectBlue_A1234.jpg", "filename_suggest": { "input": ["BRAND", "ConnectBlue", "A1234", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_Connect_A1233.jpg", "filename_suggest": { "input": ["BRAND", "Connect", "A1233", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "BRAND_ConceptSpace_A1244.jpg", "filename_suggest": { "input": ["BRAND", "ConceptSpace", "A1244", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Connect_A1222.jpg", "filename_suggest": { "input": ["COMPANY", "Connect", "A1222", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "COMPANY_Concept_A1233.jpg", "filename_suggest": { "input": ["COMPANY", "Concept", "A1233", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Connect_B1234_.jpg", "filename_suggest": { "input": ["DEALER", "Connect", "B1234", "jpg"], "payload": {} } }'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_Contour21_B1233.jpg", "filename_suggest": { "input": ["DEALER", "Contour21", "B1233", "jpg"], "payload": {} }}'
curl -X POST "localhost:9200/files/file" -d '{ "filename" : "DEALER_ConceptCube_B2233.jpg", "filename_suggest": { "input": ["DEALER", "ConceptCube", "B2233", "jpg"], "payload": {} }}'
curl -X POST "localhost:9200/files/_refresh"
Now perform the query:
curl -XPOST 'localhost:9200/files/_suggest?pretty=true' -d '{
"filename_suggest" : {
"text" : "con",
"completion": {
"field": "filename_suggest", "size": 10
}
}
}'
Results:
{
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"filename_suggest" : [ {
"text" : "con",
"offset" : 0,
"length" : 3,
"options" : [ {
"text" : "Connect",
"score" : 2.0,
"payload":{}
}, {
"text" : "Concept",
"score" : 1.0,
"payload":{}
}, {
"text" : "ConceptSpace",
"score" : 1.0,
"payload":{}
}, {
"text" : "ConnectBlue",
"score" : 1.0,
"payload":{}
}, {
"text" : "Contour21",
"score" : 1.0,
"payload":{}
} ]
} ]
}