Flink SQL Client connect to secured kafka cluster - apache-kafka

I want to execute a query on Flink SQL Table backed by kafka topic of secured kafka cluster. I'm able to execute the query programmatically but unable to do the same through Flink SQL client. I'm not sure on how to pass JAAS config (java.security.auth.login.config) and other system properties through Flink SQL client.
Flink SQL query programmatically
private static void simpleExec_auth() {
// Create the execution environment.
final EnvironmentSettings settings = EnvironmentSettings.newInstance()
.inStreamingMode()
.withBuiltInCatalogName(
"default_catalog")
.withBuiltInDatabaseName(
"default_database")
.build();
System.setProperty("java.security.auth.login.config","client_jaas.conf");
System.setProperty("sun.security.jgss.native", "true");
System.setProperty("sun.security.jgss.lib", "/usr/libexec/libgsswrap.so");
System.setProperty("javax.security.auth.useSubjectCredsOnly","false");
TableEnvironment tableEnvironment = TableEnvironment.create(settings);
String createQuery = "CREATE TABLE test_flink11 ( " + "`keyid` STRING, " + "`id` STRING, "
+ "`name` STRING, " + "`age` INT, " + "`color` STRING, " + "`rowtime` TIMESTAMP(3) METADATA FROM 'timestamp', " + "`proctime` AS PROCTIME(), " + "`address` STRING) " + "WITH ( "
+ "'connector' = 'kafka', "
+ "'topic' = 'test_flink10', "
+ "'scan.startup.mode' = 'latest-offset', "
+ "'properties.bootstrap.servers' = 'kafka01.nyc.com:9092', "
+ "'value.format' = 'avro-confluent', "
+ "'key.format' = 'avro-confluent', "
+ "'key.fields' = 'keyid', "
+ "'value.fields-include' = 'EXCEPT_KEY', "
+ "'properties.security.protocol' = 'SASL_PLAINTEXT', 'properties.sasl.kerberos.service.name' = 'kafka', 'properties.sasl.kerberos.kinit.cmd' = '/usr/local/bin/skinit --quiet', 'properties.sasl.mechanism' = 'GSSAPI', "
+ "'key.avro-confluent.schema-registry.url' = 'http://kafka-schema-registry:5037', "
+ "'key.avro-confluent.schema-registry.subject' = 'test_flink6', "
+ "'value.avro-confluent.schema-registry.url' = 'http://kafka-schema-registry:5037', "
+ "'value.avro-confluent.schema-registry.subject' = 'test_flink4')";
System.out.println(createQuery);
tableEnvironment.executeSql(createQuery);
TableResult result = tableEnvironment
.executeSql("SELECT name,rowtime FROM test_flink11");
result.print();
}
This is working fine.
Flink SQL query through SQL client
Running this giving the following error.
Flink SQL> CREATE TABLE test_flink11 (`keyid` STRING,`id` STRING,`name` STRING,`address` STRING,`age` INT,`color` STRING) WITH('connector' = 'kafka', 'topic' = 'test_flink10','scan.startup.mode' = 'earliest-offset','properties.bootstrap.servers' = 'kafka01.nyc.com:9092','value.format' = 'avro-confluent','key.format' = 'avro-confluent','key.fields' = 'keyid', 'value.avro-confluent.schema-registry.url' = 'http://kafka-schema-registry:5037', 'value.avro-confluent.schema-registry.subject' = 'test_flink4', 'value.fields-include' = 'EXCEPT_KEY', 'key.avro-confluent.schema-registry.url' = 'http://kafka-schema-registry:5037', 'key.avro-confluent.schema-registry.subject' = 'test_flink6', 'properties.security.protocol' = 'SASL_PLAINTEXT', 'properties.sasl.kerberos.service.name' = 'kafka', 'properties.sasl.kerberos.kinit.cmd' = '/usr/local/bin/skinit --quiet', 'properties.sasl.mechanism' = 'GSSAPI');
Flink SQL> select * from test_flink11;
[ERROR] Could not execute SQL statement. Reason:
java.lang.IllegalArgumentException: Could not find a 'KafkaClient' entry in the JAAS configuration. System property 'java.security.auth.login.config' is /tmp/jaas-6309821891889949793.conf
There is nothing in /tmp/jaas-6309821891889949793.conf except the following comment
# We are using this file as an workaround for the Kafka and ZK SASL implementation
# since they explicitly look for java.security.auth.login.config property
# Please do not edit/delete this file - See FLINK-3929
SQL client run command
bin/sql-client.sh embedded --jar flink-sql-connector-kafka_2.11-1.12.0.jar --jar flink-sql-avro-confluent-registry-1.12.0.jar
Flink cluster command
bin/start-cluster.sh
How to pass this java.security.auth.login.config and other system properties (that I'm setting in the above java code snippet), for SQL client?

flink-conf.yaml
security.kerberos.login.use-ticket-cache: true
security.kerberos.login.principal: XXXXX#HADOOP.COM
security.kerberos.login.use-ticket-cache: false
security.kerberos.login.keytab: /path/to/kafka.keytab
security.kerberos.login.principal: XXXX#HADOOP.COM
security.kerberos.login.contexts: Client,KafkaClient
I haven't really tested whether this solution is feasible, you can try it out, hope it will help you.

Related

Flink table sink doesn't work with debezium-avro-confluent source

I'm using Flink SQL to read debezium avro data from Kafka and store as parquet files in S3. Here is my code,
import os
from pyflink.datastream import StreamExecutionEnvironment, FsStateBackend
from pyflink.table import TableConfig, DataTypes, BatchTableEnvironment, StreamTableEnvironment, \
ScalarFunction
exec_env = StreamExecutionEnvironment.get_execution_environment()
exec_env.set_parallelism(1)
# start a checkpoint every 12 s
exec_env.enable_checkpointing(12000)
t_config = TableConfig()
t_env = StreamTableEnvironment.create(exec_env, t_config)
INPUT_TABLE = 'source'
KAFKA_TOPIC = os.environ['KAFKA_TOPIC']
KAFKA_BOOTSTRAP_SERVER = os.environ['KAFKA_BOOTSTRAP_SERVER']
OUTPUT_TABLE = 'sink'
S3_BUCKET = os.environ['S3_BUCKET']
OUTPUT_S3_LOCATION = os.environ['OUTPUT_S3_LOCATION']
ddl_source = f"""
CREATE TABLE {INPUT_TABLE} (
`event_time` TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL,
`id` BIGINT,
`price` DOUBLE,
`type` INT,
`is_reinvite` INT
) WITH (
'connector' = 'kafka',
'topic' = '{KAFKA_TOPIC}',
'properties.bootstrap.servers' = '{KAFKA_BOOTSTRAP_SERVER}',
'scan.startup.mode' = 'earliest-offset',
'format' = 'debezium-avro-confluent',
'debezium-avro-confluent.schema-registry.url' = 'http://kafka-production-schema-registry:8081'
)
"""
ddl_sink = f"""
CREATE TABLE {OUTPUT_TABLE} (
`event_time` TIMESTAMP,
`id` BIGINT,
`price` DOUBLE,
`type` INT,
`is_reinvite` INT
) WITH (
'connector' = 'filesystem',
'path' = 's3://{S3_BUCKET}/{OUTPUT_S3_LOCATION}',
'format' = 'parquet'
)
"""
t_env.sql_update(ddl_source)
t_env.sql_update(ddl_sink)
t_env.execute_sql(f"""
INSERT INTO {OUTPUT_TABLE}
SELECT *
FROM {INPUT_TABLE}
""")
When I submit the job, I get the following error message,
pyflink.util.exceptions.TableException: Table sink 'default_catalog.default_database.sink' doesn't support consuming update and delete changes which is produced by node TableSourceScan(table=[[default_catalog, default_database, source]], fields=[id, price, type, is_reinvite, timestamp])
I'm using Flink 1.12.1. The source is working properly and I have tested it using a 'print' connector in the sink. Here is a sample data set which was extracted from the task manager logs when using 'print' connector in the table sink,
-D(2021-02-20T17:07:27.298,14091764,26.0,9,0)
-D(2021-02-20T17:07:27.298,14099765,26.0,9,0)
-D(2021-02-20T17:07:27.299,14189806,16.0,9,0)
-D(2021-02-20T17:07:27.299,14189838,37.0,9,0)
-D(2021-02-20T17:07:27.299,14089840,26.0,9,0)
-D(2021-02-20T17:07:27.299,14089847,26.0,9,0)
-D(2021-02-20T17:07:27.300,14189859,26.0,9,0)
-D(2021-02-20T17:07:27.301,14091808,37.0,9,0)
-D(2021-02-20T17:07:27.301,14089911,37.0,9,0)
-D(2021-02-20T17:07:27.301,14099937,26.0,9,0)
-D(2021-02-20T17:07:27.302,14091851,37.0,9,0)
How can I make my table sink work with the filesystem connector ?
What happens is that:
when receiving the Debezium records, Flink updates a logical table by adding, removing and suppressing Flink rows based on their primary key.
the only sinks that can handle that kind of information are those that have a concept of update by key. Jdbc would be a typical example, in which case it's straightforward for Flink to translate the concept of "a Flink row with key foo has been updated to bar" into "JDBC row with key foo should be updated with value bar", or something. filesystem sink do not support that kind of operation since files are append-only.
See also Flink documentation on append and update queries
In practice, in order to do the conversion, we first have to decide what is it exactly we want to have in this append-only file.
If what we want is to have in the file the latest version of each item any time an id is updated, then to my knowledge the way to go would be to convert it to a stream first, and then output that with a FileSink. Note that in that case, the result contains a boolean saying if the row is updated or deleted, and we have to decide how we want this information to be visible in the resulting file.
Note: I used this other CDC example from the Flink SQL cookbook to reproduce a similar setup:
// assuming a Flink retract table of claims build from a CDC stream:
tableEnv.executeSql("" +
" CREATE TABLE accident_claims (\n" +
" claim_id INT,\n" +
" claim_total FLOAT,\n" +
" claim_total_receipt VARCHAR(50),\n" +
" claim_currency VARCHAR(3),\n" +
" member_id INT,\n" +
" accident_date VARCHAR(20),\n" +
" accident_type VARCHAR(20),\n" +
" accident_detail VARCHAR(20),\n" +
" claim_date VARCHAR(20),\n" +
" claim_status VARCHAR(10),\n" +
" ts_created VARCHAR(20),\n" +
" ts_updated VARCHAR(20)" +
") WITH (\n" +
" 'connector' = 'postgres-cdc',\n" +
" 'hostname' = 'localhost',\n" +
" 'port' = '5432',\n" +
" 'username' = 'postgres',\n" +
" 'password' = 'postgres',\n" +
" 'database-name' = 'postgres',\n" +
" 'schema-name' = 'claims',\n" +
" 'table-name' = 'accident_claims'\n" +
" )"
);
// convert it to a stream
Table accidentClaims = tableEnv.from("accident_claims");
DataStream<Tuple2<Boolean, Row>> accidentClaimsStream = tableEnv
.toRetractStream(accidentClaims, Row.class);
// and write to file
final FileSink<Tuple2<Boolean, Row>> sink = FileSink
// TODO: adapt the output format here:
.forRowFormat(new Path("/tmp/flink-demo"),
(Encoder<Tuple2<Boolean, Row>>) (element, stream) -> stream.write((element.toString() + "\n").getBytes(StandardCharsets.UTF_8)))
.build();
ordersStreams.sinkTo(sink);
streamEnv.execute();
Note that during the conversion, you obtain a boolean telling you whether that row is a new value for that accident claim, or a deletion of such claim. My basic FileSink config there is just including that boolean in the output, although how to handle deletions is to be decided case by case.
The result in the file then looks like this:
head /tmp/flink-demo/2021-03-09--09/.part-c7cdb74e-893c-4b0e-8f69-1e8f02505199-0.inprogress.f0f7263e-ec24-4474-b953-4d8ef4641998
(true,1,4153.92,null,AUD,412,2020-06-18 18:49:19,Permanent Injury,Saltwater Crocodile,2020-06-06 03:42:25,IN REVIEW,2021-03-09 06:39:28,2021-03-09 06:39:28)
(true,2,8940.53,IpsumPrimis.tiff,AUD,323,2019-03-18 15:48:16,Collision,Blue Ringed Octopus,2020-05-26 14:59:19,IN REVIEW,2021-03-09 06:39:28,2021-03-09 06:39:28)
(true,3,9406.86,null,USD,39,2019-04-28 21:15:09,Death,Great White Shark,2020-03-06 11:20:54,INITIAL,2021-03-09 06:39:28,2021-03-09 06:39:28)
(true,4,3997.9,null,AUD,315,2019-10-26 21:24:04,Permanent Injury,Saltwater Crocodile,2020-06-25 20:43:32,IN REVIEW,2021-03-09 06:39:28,2021-03-09 06:39:28)
(true,5,2647.35,null,AUD,74,2019-12-07 04:21:37,Light Injury,Cassowary,2020-07-30 10:28:53,REIMBURSED,2021-03-09 06:39:28,2021-03-09 06:39:28)

Failed to connect to Confluent Platform Schema Registry - Apache Flink SQL Confluent Avro Format

I am using Confluent managed Kafka cluster, Schema Registry service and trying to process Debezium messages in a Flink job. The job is configured to use Table & SQL Connectors and Confluent Avro Format.
However the job is not able to connect to Schema Registry and raises 401 error.
Table Connector configurations
tEnv.executeSql("CREATE TABLE flink_test_1 (\n" +
" ORDER_ID STRING,\n" +
" ORDER_TYPE STRING,\n" +
" USER_ID STRING,\n" +
" ORDER_SUM BIGINT\n" +
") WITH (\n" +
" 'connector' = 'kafka',\n" +
" 'topic' = 'flink_test_1',\n" +
" 'scan.startup.mode' = 'earliest-offset',\n" +
" 'format' = 'avro-confluent',\n" +
" 'avro-confluent.schema-registry.url' = 'https://<SR_ENDPOINT>',\n" +
" 'avro-confluent.schema-registry.subject' = 'flink_test_1-value',\n" +
" 'properties.basic.auth.credentials.source' = 'USER_INFO',\n" +
" 'properties.basic.auth.user.info' = '<SR_API_KEY>:<SR_API_SECRET>',\n" +
" 'properties.bootstrap.servers' = '<CLOUD_BOOTSTRAP_SERVER_ENDPOINT>:9092',\n" +
" 'properties.security.protocol' = 'SASL_SSL',\n" +
" 'properties.ssl.endpoint.identification.algorithm' = 'https',\n" +
" 'properties.sasl.mechanism' = 'PLAIN',\n" +
" 'properties.sasl.jaas.config' = 'org.apache.kafka.common.security.plain.PlainLoginModule required username=\"<CLUSTER_API_KEY>\" password=\"<CLUSTER_API_SECRET>\";'\n" +
")");
Error Message
Caused by: java.io.IOException: Failed to deserialize Avro record.
at org.apache.flink.formats.avro.AvroRowDataDeserializationSchema.deserialize(AvroRowDataDeserializationSchema.java:101)
at org.apache.flink.formats.avro.AvroRowDataDeserializationSchema.deserialize(AvroRowDataDeserializationSchema.java:44)
at org.apache.flink.api.common.serialization.DeserializationSchema.deserialize(DeserializationSchema.java:82)
at org.apache.flink.streaming.connectors.kafka.table.DynamicKafkaDeserializationSchema.deserialize(DynamicKafkaDeserializationSchema.java:113)
at org.apache.flink.streaming.connectors.kafka.internals.KafkaFetcher.partitionConsumerRecordsHandler(KafkaFetcher.java:179)
at org.apache.flink.streaming.connectors.kafka.internals.KafkaFetcher.runFetchLoop(KafkaFetcher.java:142)
at org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumerBase.run(FlinkKafkaConsumerBase.java:826)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:110)
at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:66)
at org.apache.flink.streaming.runtime.tasks.SourceStreamTask$LegacySourceFunctionThread.run(SourceStreamTask.java:241)
Caused by: java.io.IOException: Could not find schema with id 100256 in registry
at org.apache.flink.formats.avro.registry.confluent.ConfluentSchemaRegistryCoder.readSchema(ConfluentSchemaRegistryCoder.java:77)
at org.apache.flink.formats.avro.RegistryAvroDeserializationSchema.deserialize(RegistryAvroDeserializationSchema.java:70)
at org.apache.flink.formats.avro.AvroRowDataDeserializationSchema.deserialize(AvroRowDataDeserializationSchema.java:98)
... 9 more
Caused by: io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException: Unauthorized; error code: 401
at io.confluent.kafka.schemaregistry.client.rest.RestService.sendHttpRequest(RestService.java:292)
at io.confluent.kafka.schemaregistry.client.rest.RestService.httpRequest(RestService.java:352)
at io.confluent.kafka.schemaregistry.client.rest.RestService.getId(RestService.java:660)
at io.confluent.kafka.schemaregistry.client.rest.RestService.getId(RestService.java:642)
at io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient.getSchemaByIdFromRegistry(CachedSchemaRegistryClient.java:217)
at io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient.getSchemaBySubjectAndId(CachedSchemaRegistryClient.java:291)
at io.confluent.kafka.schemaregistry.client.CachedSchemaRegistryClient.getSchemaById(CachedSchemaRegistryClient.java:276)
at io.confluent.kafka.schemaregistry.client.SchemaRegistryClient.getById(SchemaRegistryClient.java:64)
at org.apache.flink.formats.avro.registry.confluent.ConfluentSchemaRegistryCoder.readSchema(ConfluentSchemaRegistryCoder.java:74)
... 11 more
I successfully tested the connection to Schema Registry by:
curl -u <SR_API_KEY>:<SR_API_SECRET> https://<SR_ENDPOINT>
It seem like the error message "io.confluent.kafka.schemaregistry.client.rest.exceptions.RestClientException: Unauthorized; error code: 401" clearly says that <SR_API_KEY>:<SR_API_SECRET> were not passed to the Confluent Schema Registry.
I checked the documentation here https://ci.apache.org/projects/flink/flink-docs-release-1.12/dev/table/connectors/formats/avro-confluent.html, where only 3 Format options described: ["format", "avro-confluent.schema-registry.url", "avro-confluent.schema-registry.subject"] and no options for specifying SR_API_KEY and SR_API_SECRET.
I can't figure out how to successfully connect to the secure schema registry from the Flink program.
Is this connection type supported by Flink?
Does anyone know what the correct connection configuration should look like?
Thanks.
I got the same issue.
After some investigation, I found a Jira ticket about this issue.
If you can't upgrade your flink version, you can first use DataStream API
to consume data and then convert it to Table.

Scala no configuration key found

I could extract value for
val ranking_score_path = cf.getString(stg + ".input.path.ranking_score")
.replaceAll("_replace_date_", this_date)
and
val output_path = cf.getString(stg + ".output.path.hdfs") + tomz_date + "/"
but not
val AS_HOST = cf.getString(stg + ".output.path.aerospike.host")
println("AS_HOST = " + AS_HOST)
I have tried
replacing . with _,
adding commas
but didnt work.
error log
Exception in thread "main" com.typesafe.config.ConfigException$Missing: No configuration setting found for key 'production.output.path.aerospike'
at com.typesafe.config.impl.SimpleConfig.findKeyOrNull(SimpleConfig.java:152)
at ...
application.conf
production {
input {
path {
local = "/home/aduser/tmp/"
hdfs = "/user/aduser/tmp_vincent/CPA/_replace_date_/intermediate/l1/"
ranking_score = "/home/aduser/plt/item_performance/pipeline/cpa/output/_replace_date_/predict_output/ranking_score.csv"
}
}
output {
path {
local = "/home/aduser/tmp/"
hdfs = "/user/aduser/dyson/display/"
aerospike {
host = "0.0.0.0"
port = 3000
namespace = "test"
set = "spark-test2"
}
}
}
}
Reply # Comment #1
the cf is very long but the important part is as follows
... ore.csv"}},"output":{"path":{"hdfs":"/user/aduser/dyson/display/","local":"/home/aduser/tmp/"}}},"sun":{"arch": ...
Effort #1: Replaced part of the application.conf
path {
local = "/home/aduser/tmp/"
hdfs = "/user/aduser/dyson/display/"
ae_host = "0.0.0.0"
ae_port = 3000
ae_namespace = "test"
ae_set = "spark-test2"
}
and changed the calling method
val AS_HOST = cf.getString(stg + ".output.path.ae_host")
println("AS_HOST = " + AS_HOST)
but still getting errors
Exception in thread "main" com.typesafe.config.ConfigException$Missing: No configuration setting found for key 'production.output.path.ae_host'
at com.typesafe.config.impl.SimpleConfig.findKeyOrNull(SimpleConfig.java:152)
I have found the problem after several hours, my application.conf was at the same level with src.main.scala and it worked partially for reasons which I don't know. It worked perfectly after creating src.main.resources and putting application.conf inside.

parameterize queries in OrientDB using placeholders

I am using pyorient and and want to parameterize queries -- I am assuming that command() allows placeholders, but am not able to find any documentation. IN particular I'd like to use dict() args as per postgres' %(name)s construct, but could make tuples/lists work as well.
I tried your case with my python_test database:
Dataset:
I used two parameters:
name ---> string;
surname ---> string;
and I passed them into the command() function.
PyOrient Code:
import pyorient
db_name = 'python_test'
user = 'root'
pwd = 'root'
print("Connecting...")
client = pyorient.OrientDB("localhost",2424)
session_id = client.connect(user, pwd)
print("OK - sessionID: ",session_id,"\n")
if client.db_exists( db_name, pyorient.STORAGE_TYPE_PLOCAL ):
print("DB "+db_name+" already exists\n")
client.db_open(db_name, user, pwd, pyorient.DB_TYPE_GRAPH, pyorient.STORAGE_TYPE_PLOCAL)
name = 'test name'
surname = 'test surname'
vertexes = client.command("SELECT FROM TestClass WHERE name = '" + name + "' AND surname = '" + surname + "'")
for vertex in vertexes:
print(vertex)
else:
print("Creating DB "+ db_name + "...")
client.db_create( db_name, pyorient.DB_TYPE_GRAPH, pyorient.STORAGE_TYPE_PLOCAL )
print("DB " + db_name + " created\n")
client.db_close()
Output:
Connecting...
OK - sessionID: 40
DB python_test already exists
{'#TestClass':{'surname': 'test surname', 'name': 'test name'},'version':1,'rid':'#12:0'}
Hope it helps

CQL SELECT fails with prepareStatement

In my sample app following snippet works fine.
SELECT_CQL = "SELECT * FROM " + STREAM_NAME_IN_CASSANDRA + " WHERE '" + CONTEXT_ID_COLUMN + "'=?";
Connection connection = getConnection();
statement = connection.prepareStatement(SELECT_CQL);
statement.setString(1, "123");
resultSet = statement.executeQuery();
But when I try to add another parameter to the where clause query returns nothing!!
SELECT_CQL = "SELECT * FROM " + STREAM_NAME_IN_CASSANDRA + " WHERE '" + CONTEXT_ID_COLUMN + "'=? AND '"+TIMESTAMP_COLUMN+"'=?";
Connection connection = getConnection();
statement = connection.prepareStatement(SELECT_CQL);
statement.setString(1, "123");
statement.setString(2, "1390996577514");
resultSet = statement.executeQuery();
When I try the exact query within cqlsh terminal, it works fine.
statement.setString(2, "1390996577514");
Double-check the datatype of your TIMESTAMP_COLUMN and make sure that it's a string. Otherwise you'll need to use the appropriate "set" method. Ex:
statement.setLong(2, 1390996577514L);