The code below is from the Google doc: https://cloud.google.com/pubsub/lite/docs/write-messages-apache-spark
I'm trying to publish to a Pub/Sub Lite topic from PySpark.
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, create_map, col, lit, when
from pyspark.sql.types import BinaryType, StringType
import uuid
# TODO(developer):
project_number = xxx
location = "us-central1"
topic_id = "kosmin"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.google.cloud:pubsublite-spark-sql-streaming:0.4.1,com.google.cloud:google-cloud-pubsublite:1.6.1 pyspark-shell'
spark = SparkSession.builder.appName("write-app").getOrCreate()
# Create a streaming DataFrame of (timestamp, value) rows from the rate source, as in the linked doc.
sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()
sdf = (
    sdf.withColumn("key", lit("example").cast(BinaryType()))
    .withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
    .withColumnRenamed("timestamp", "event_timestamp")
    # Populate the attributes field. For example, an even value will
    # have {"key1", [b"even"]}.
    .withColumn(
        "attributes",
        create_map(
            lit("key1"),
            array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
        ),
    )
    .drop("value")
)
query = (
    sdf.writeStream.format("pubsublite")
    .option(
        "pubsublite.topic",
        f"projects/{project_number}/locations/{location}/topics/{topic_id}",
    )
    # Required. Use a unique checkpoint location for each job.
    .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
    .outputMode("append")
    # .trigger(processingTime="1 second")
    .start()
)
# Wait 60 seconds to terminate the query.
query.awaitTermination(60)
query.stop()
but I'm getting:
22/07/29 19:09:38 ERROR Utils: Aborting task
com.google.api.gax.rpc.ApiException:
at com.google.cloud.pubsublite.internal.CheckedApiException.<init>(CheckedApiException.java:51)
at com.google.cloud.pubsublite.internal.CheckedApiException.<init>(CheckedApiException.java:55)
at com.google.cloud.pubsublite.internal.ExtractStatus.toCanonical(ExtractStatus.java:53)
at com.google.cloud.pubsublite.spark.PslWriteDataSourceOptions.newServiceClient(PslWriteDataSourceOptions.java:131)
......
Caused by: java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;CLjava/lang/Object;)V
at io.grpc.Metadata$Key.validateName(Metadata.java:754)
at io.grpc.Metadata$Key.<init>(Metadata.java:762)
at io.grpc.Metadata$Key.<init>(Metadata.java:671)
at io.grpc.Metadata$AsciiKey.<init>(Metadata.java:971)
at io.grpc.Metadata$AsciiKey.<init>(Metadata.java:966)
at io.grpc.Metadata$Key.of(Metadata.java:708)
at io.grpc.Metadata$Key.of(Metadata.java:704)
at com.google.api.gax.grpc.GrpcHeaderInterceptor.<init>(GrpcHeaderInterceptor.java:60)
....
What am I missing? The credentials are set; maybe I need a different package at submit time?
It worked with:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars=pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar pyspark-shell'
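For context, the java.lang.NoSuchMethodError on com.google.common.base.Preconditions.checkArgument is the usual symptom of a Guava version clash between what --packages pulls in and the Guava bundled with Spark; the "with dependencies" jar ships a compatible set, which is presumably why it works. A minimal sketch of wiring it into the same script (the jar path is a placeholder for wherever the jar was downloaded; the env var must be set before the SparkSession is created):
import os

# Sketch: hand Spark the downloaded shaded jar instead of resolving
# individual artifacts with --packages (the path is a placeholder).
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars=/path/to/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar "
    "pyspark-shell"
)

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("write-app").getOrCreate()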
I am trying to create an AWS Glue ETL job to move data from Aurora RDS to Redshift, but I cannot figure out how to get the timestamp fields properly mapped. All stages of the job show a valid preview of the expected data, but the job always fails with the following error:
py4j.protocol.Py4JJavaError: An error occurred while calling o179.pyWriteDynamicFrame.
: java.sql.SQLException:
Error (code 1206) while loading data into Redshift: "Invalid timestamp format or value [YYYY-MM-DD HH24:MI:SS]"
Table name: public.stage_table_ae89e9dffe974b649bbf4852e49a4b12
Column name: updated_at
Column type: timestamp(0)
Raw line: 1234,5341,1121,0,2022-01-06 16:29:55.000000000,2022-01-06 16:29:55.000000000,1,1,Suzy
Raw field value: 0
I have tried reformatting the dates to remove the microseconds and forcing quotes around the date fields; nothing works.
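(For reference, the microsecond-trimming attempt was roughly along these lines; this is a sketch rather than the exact code, applied to the frame before the Redshift write:)
from pyspark.sql.functions import col, date_format

# Sketch of the trimming attempt: format the timestamps down to whole seconds,
# then cast back to timestamp before handing the frame to the Redshift write.
def trim_micros(df, cols=("created_at", "updated_at")):
    for c in cols:
        df = df.withColumn(c, date_format(col(c), "yyyy-MM-dd HH:mm:ss").cast("timestamp"))
    return df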
Here is the generated script:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame, DynamicFrameCollection
# Script generated for node Custom transform
def CastIntsTransform(glueContext, dfc) -> DynamicFrameCollection:
    df = dfc.select(list(dfc.keys())[0])
    df_resolved = (
        df.resolveChoice(specs=[("id", "cast:bigint")])
        .resolveChoice(specs=[("user_id", "cast:bigint")])
        .resolveChoice(specs=[("connected_user_id", "cast:bigint")])
        .resolveChoice(specs=[("mg_id", "cast:bigint")])
        .resolveChoice(specs=[("access_level", "cast:tinyint")])
        .resolveChoice(specs=[("status", "cast:tinyint")])
    )
    return DynamicFrameCollection({"CustomTransform0": df_resolved}, glueContext)
def sparkSqlQuery(glueContext, query, mapping, transformation_ctx) -> DynamicFrame:
    for alias, frame in mapping.items():
        frame.toDF().createOrReplaceTempView(alias)
    result = spark.sql(query)
    return DynamicFrame.fromDF(result, glueContext, transformation_ctx)
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Script generated for node JDBC Connection
JDBCConnection_node1 = glueContext.create_dynamic_frame.from_catalog(
    database="ABC123",
    table_name="user_connections",
    transformation_ctx="JDBCConnection_node1",
)
# Script generated for node SQL
SqlQuery0 = """
select
    id,
    user_id,
    connected_user_id,
    COALESCE(mg_id, 0) mg_id,
    created_at,
    updated_at,
    updated_at,
    access_level,
    status,
    COALESCE(nickname, '') nickname
from
    apiData
"""
SQL_node1647619002820 = sparkSqlQuery(
    glueContext,
    query=SqlQuery0,
    mapping={"apiData": JDBCConnection_node1},
    transformation_ctx="SQL_node1647619002820",
)
# Script generated for node Custom transform
Customtransform_node1647612655336 = CastIntsTransform(
    glueContext,
    DynamicFrameCollection(
        {"SQL_node1647619002820": SQL_node1647619002820}, glueContext
    ),
)
# Script generated for node Select From Collection
SelectFromCollection_node1647613332516 = SelectFromCollection.apply(
    dfc=Customtransform_node1647612655336,
    key=list(Customtransform_node1647612655336.keys())[0],
    transformation_ctx="SelectFromCollection_node1647613332516",
)
# Script generated for node ApplyMapping
ApplyMapping_node2 = ApplyMapping.apply(
    frame=SelectFromCollection_node1647613332516,
    mappings=[
        ("id", "bigint", "id", "bigint"),
        ("user_id", "bigint", "user_id", "bigint"),
        ("connected_user_id", "bigint", "connected_user_id", "bigint"),
        ("mg_id", "bigint", "mg_id", "bigint"),
        ("created_at", "timestamp", "created_at", "timestamp"),
        ("updated_at", "timestamp", "updated_at", "timestamp"),
        ("access_level", "tinyint", "access_level", "tinyint"),
        ("status", "tinyint", "status", "tinyint"),
        ("nickname", "varchar", "nickname", "varchar"),
    ],
    transformation_ctx="ApplyMapping_node2",
)
# Script generated for node Amazon Redshift
pre_query = "drop table if exists public.stage_table_cd5d65739d334453938f090ea1cb2d6e;create table public.stage_table_cd5d65739d334453938f090ea1cb2d6e as select * from public.test_user_connections where 1=2;"
post_query = "begin;delete from public.test_user_connections using public.stage_table_cd5d65739d334453938f090ea1cb2d6e where public.stage_table_cd5d65739d334453938f090ea1cb2d6e.id = public.test_user_connections.id; insert into public.test_user_connections select * from public.stage_table_cd5d65739d334453938f090ea1cb2d6e; drop table public.stage_table_cd5d65739d334453938f090ea1cb2d6e; end;"
AmazonRedshift_node1647612972417 = glueContext.write_dynamic_frame.from_jdbc_conf(
    frame=ApplyMapping_node2,
    catalog_connection="ABC123",
    connection_options={
        "database": "test",
        "dbtable": "public.stage_table_cd5d65739d334453938f090ea1cb2d6e",
        "preactions": pre_query,
        "postactions": post_query,
    },
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="AmazonRedshift_node1647612972417",
)
job.commit()
Using an existing working Java example, I am trying to write a Python equivalent of the producer using the kafka-python and confluent_kafka libraries. How do I configure sasl.jaas.config in Python with information like that in the Java code below?
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
...
Properties props = new Properties();
...
props.put("sasl.jaas.config", "org.apache.kafka.common.security.scram.ScramLoginModule required username=\"<Kafka_Username>\" password=\"<Kafka_Password>\";");
Producer<String, String> producer = new KafkaProducer<>(props);
This works for me:
from confluent_kafka import Producer
SECURITY_PROTOCOL = "SASL_SSL"
SASL_MECHANISM = "PLAIN"
conf = {
    'bootstrap.servers': SERVERS,
    'sasl.mechanisms': SASL_MECHANISM,
    'security.protocol': SECURITY_PROTOCOL,
    'sasl.username': SASL_USERNAME,
    'sasl.password': SASL_PASSWORD,
    ...
}
producer = Producer(conf)
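If you want the same thing with kafka-python (the other library you mentioned), a sketch along these lines should work; the servers, credentials, topic, and the exact SCRAM variant are placeholders to match your cluster, since the Java example's ScramLoginModule maps to one of the SCRAM mechanisms:
from kafka import KafkaProducer

# Sketch: kafka-python equivalent of the Java SASL settings (placeholders throughout).
producer = KafkaProducer(
    bootstrap_servers=["<broker:9092>"],
    security_protocol="SASL_SSL",
    sasl_mechanism="SCRAM-SHA-256",  # or SCRAM-SHA-512 / PLAIN, to match the cluster
    sasl_plain_username="<Kafka_Username>",
    sasl_plain_password="<Kafka_Password>",
)
producer.send("<topic>", b"hello")
producer.flush()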
I am trying to delete a document I created through my script, using deleteOne as well as deleteMany, with the following code:
import com.mongodb.MongoClientSettings;
import com.mongodb.MongoCredential;
import com.mongodb.ServerAddress;
import com.mongodb.client.MongoClient;
import com.mongodb.client.MongoClients;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
import org.bson.Document;
import org.bson.types.ObjectId;
import java.util.Arrays;
import java.util.Collections;
import static com.mongodb.client.model.Filters.*;
try {
    String mongoUser = vars.get("mongouser");
    String userDB = vars.get("userdb");
    char[] password = vars.get("password").toCharArray();
    MongoCredential credential = MongoCredential.createCredential(mongoUser, userDB, password);
    MongoClientSettings settings = MongoClientSettings.builder()
            .applyToClusterSettings { builder ->
                builder.hosts(Collections.singletonList(new ServerAddress(vars.get("mongohost"), vars.get("mongoPort") as int)))
            }
            .credential(credential)
            .build();
    MongoClient mongoClient = MongoClients.create(settings);
    MongoDatabase database = mongoClient.getDatabase(vars.get("databaseName"));
    MongoCollection<Document> collection = database.getCollection(vars.get("collectionName1"));
    vars.putObject("collection1", collection);
    collection.deleteone(eq("EmployeeName", "Test Automation through Jmeter"));
    return "Document deleted";
}
catch (Exception e) {
    SampleResult.setSuccessful(false);
    SampleResult.setResponseCode("500");
    SampleResult.setResponseMessage("Exception: " + e);
}
I am getting this error:
Response message: Exception: groovy.lang.MissingMethodException: No signature of method: com.mongodb.client.internal.MongoCollectionImpl.deleteone() is applicable for argument types: (com.mongodb.client.model.Filters$
SimpleEncodingFilter) values: [Filter{fieldName='EmployeeName', value=Test Automation through Jmeter}]
Possible solutions: deleteOne(org.bson.conversions.Bson), deleteOne(com.mongodb.client.ClientSession, org.bson.conversions.Bson), deleteOne(org.bson.conversions.Bson, com.mongodb.client.model.DeleteOptions), deleteOne(com.mongodb.client.ClientSession, org.bson.conversions.Bson, com.mongodb.client.model.DeleteOptions), deleteMany(org.bson.conversions.Bson), deleteMany(com.mongodb.client.ClientSession, org.bson.conversions.Bson)
What am I doing wrong?
I believe the method is called deleteOne, not deleteone, so the correct way to call it should be:
collection.deleteOne(eq("EmployeeName", "Test Automation through Jmeter"));
See also the javadocs for the MongoCollection: https://mongodb.github.io/mongo-java-driver/3.6/javadoc/?com/mongodb/client/MongoCollection.html
I am trying to connect to Neo4j from Spark using neo4j-spark-connector. I am facing an authentication issue when I try to connect to Neo4j:
org.neo4j.driver.v1.exceptions.AuthenticationException: Unsupported authentication token, scheme='none' only allowed when auth is disabled: { scheme='none' }
I have checked, and the credentials I am passing are correct. I am not sure why it is failing.
import org.neo4j.spark._
import org.apache.spark._
import org.graphframes._
import org.apache.spark.sql.SparkSession
import org.neo4j.driver.v1.GraphDatabase
import org.neo4j.driver.v1.AuthTokens
val config = new SparkConf()
config.set(Neo4jConfig.prefix + "url", "bolt://localhost")
config.set(Neo4jConfig.prefix + "user", "neo4j")
config.set(Neo4jConfig.prefix + "password", "root")
val sparkSession: SparkSession = SparkSession.builder.config(config).getOrCreate()
val neo = Neo4j(sparkSession.sparkContext)
val graphFrame = neo.pattern(("Person","id"),("KNOWS","null"), ("Employee","id")).partitions(3).rows(1000).loadGraphFrame
println("**********Graphframe Vertices Count************")
graphFrame.vertices.count
println("**********Graphframe Edges Count************")
graphFrame.edges.count
val pageRankFrame = graphFrame.pageRank.maxIter(5).run()
val ranked = pageRankFrame.vertices
ranked.printSchema()
val top3 = ranked.orderBy(ranked.col("pagerank").desc).take(3)
Can someone please have a look and let me know the reason?
It might be a configuration issue with your neo4j.conf file. Is this line commented out?
dbms.security.auth_enabled=false
I had a similar problem; creating the following Spring beans fixed the issue.
@Bean
public org.neo4j.ogm.config.Configuration getConfiguration() {
    return new org.neo4j.ogm.config.Configuration.Builder()
            .credentials("neo4j", "secret")
            .uri("bolt://localhost:7687").build();
}

@Bean
public SessionFactory sessionFactory(org.neo4j.ogm.config.Configuration configuration) {
    return new SessionFactory(configuration, "<your base package>");
}