XML streaming using Autoloader in Azure Databricks - PySpark

I am trying to use readStream with Auto Loader to ingest XML files in Azure Databricks: I infer the schema with a batch spark-xml read, persist it, and then reuse it in the streaming read, where each file is read as whole text and parsed with from_xml.
rootTag = "Message"
inputPath ='/mnt/xyz//1.0/20220401/*.xml'
df = spark.read.format('com.databricks.spark.xml').option("rowtag" , rootTag).option("mode", "PERMISSIVE").load(inputPath)
dfschema = spark.createDataFrame([df.schema.json()],StringType())
dfschema.write.text("dbfs:/mnt/xyz/.txt")
schemaJson = spark.read.text(schemapath).first()[0]
schemaStruct = StructType.fromJson(json.loads(schemaJson))
inputPath = '/mnt/xyz/1.0/20230118/*.xml'
dfn=spark.readStream.format("cloudFiles") \
.option("cloudFiles.useNotifications", "false") \
.option("cloudFiles.validateOptions", "false") \
.option("cloudFiles.format", "text") \
.option("delimiter", "") \
.option("wholetext", "true") \
.load(inputPath) \
.withColumn("FileName", input_file_name()) \
.withColumn("fromXML", ext_from_xml(col("value"), schemaStruct)) \
.drop("value")
display(dfn)
Output: the fromXML struct columns come back as null (screenshot omitted).
This is the ext_from_xml helper used above; it wraps spark-xml's Scala from_xml function so it can be applied to a column:
from pyspark.sql.column import Column, _to_java_column
from pyspark.sql.types import _parse_datatype_json_string

def ext_from_xml(xml_column, schema, options={}):
    # Convert the Python column and schema to their JVM counterparts
    java_column = _to_java_column(xml_column.cast('string'))
    java_schema = spark._jsparkSession.parseDataType(schema.json())
    scala_map = spark._jvm.org.apache.spark.api.python.PythonUtils.toScalaMap(options)
    # Call com.databricks.spark.xml.functions.from_xml on the JVM side
    jc = spark._jvm.com.databricks.spark.xml.functions.from_xml(
        java_column, java_schema, scala_map)
    return Column(jc)
What could be the reason for getting nulls in the parsed values? The value column contains the raw XML, so how do I get the data into the struct columns?
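Not an answer, but a minimal debugging sketch (reusing the ext_from_xml helper, schemaStruct and inputPath defined above) that runs the same parse as a plain batch read; if the struct fields come back null here as well, the problem is the schema/rootTag handling rather than the Auto Loader stream:
# Batch sanity check: read each file as whole text and apply the same from_xml parse
batch_check = spark.read.text(inputPath, wholetext=True) \
    .withColumn("fromXML", ext_from_xml(col("value"), schemaStruct))
batch_check.select("fromXML.*").show(truncate=False)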

Related

Create a Spark connection as part of a Python function

I am trying to create a Spark connection inside a spark_conn() function and reuse that connection in the other functions. For example, in the code below, I use the Spark connection created by spark_conn() inside the read_data() function. Is my approach correct?
from pyspark.sql import SparkSession

def spark_conn():
    spark = SparkSession \
        .builder \
        .appName("sparkConnection") \
        .getOrCreate()
    return spark

def read_data(spark, SNOWFLAKE_SOURCE_NAME, snowflake_options, loadtime, sftable):
    df = spark.read \
        .format(SNOWFLAKE_SOURCE_NAME) \
        .options(**snowflake_options) \
        .option("query", "SELECT * FROM sftable WHERE SNAPSHOT==loadtime") \
        .load()

if __name__ == "__main__":
    conn = spark_conn()
    rd = read_data(conn, SNOWFLAKE_SOURCE_NAME, snowflake_options, loadtime, sftable)
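The general pattern of building the SparkSession once and passing it into other functions is fine. A minimal sketch of read_data (under the assumption that sftable and loadtime are plain Python strings) that also interpolates the query parameters and returns the DataFrame, which the function above never does:
def read_data(spark, source_name, snowflake_options, loadtime, sftable):
    # Substitute the table name and snapshot value; in the original code
    # they were left inside the string literal and never interpolated.
    query = f"SELECT * FROM {sftable} WHERE SNAPSHOT = '{loadtime}'"
    df = spark.read \
        .format(source_name) \
        .options(**snowflake_options) \
        .option("query", query) \
        .load()
    return df  # return the DataFrame so the caller can use it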

Spark Scala code not behaving the same as its PySpark version

I have a general question about Spark.
Should PySpark and Scala Spark always have the same behaviour when we use the exact same code?
If yes, how can you explain this example:
Scala version:
val inputDf = spark
  .readStream
  .format("csv")
  .schema(schema)
  .option("ignoreChanges", "true")
  .option("delimiter", ";").option("header", true)
  .load("/input/")

def processIsmedia(df: DataFrame, batchId: Long): Unit = {
  val ids = df
    .select("id").distinct().collect().toList
    .map(el => s"$el")
  ids.foreach { id =>
    val datedDf = df.filter(col("id") === id)
    datedDf
      .write
      .format("delta")
      .option("mergeSchema", "true")
      .partitionBy("id")
      .option("replaceWhere", s"id == '$id'")
      .mode("overwrite")
      .save("/res/")
  }
}

inputDf
  .writeStream
  .format("delta")
  .foreachBatch(processIsmedia _)
  .queryName("tgte")
  .option("checkpointLocation", "/check")
  .trigger(Trigger.Once)
  .start()
Python version:
inputDf = spark \
    .readStream \
    .format("csv") \
    .schema(schema) \
    .option("ignoreChanges", "true") \
    .option("delimiter", ";").option("header", True) \
    .load("/in/")

def processDf(df, epoch_id):
    PartitionKey = "id"
    df.cache()
    ids = [x.id for x in df.select("id").distinct().collect()]
    for idd in ids:
        idd = str(idd)
        tmp = df.filter(df.id == idd)
        tmp.write.format("delta").option("mergeSchema", "true").partitionBy(PartitionKey).option("replaceWhere", "id == '$i'".format(i=idd)).save("/res/")

inputDf.writeStream.format("delta").foreachBatch(processDf).queryName("aaaa").option("checkpointLocation", "/check").trigger(once=True).start()
Both snippets are supposed to be exactly equivalent. They are supposed to write data out (append new partitions and overwrite existing ones).
With Scala it works perfectly fine.
With Python I am getting an error:
Data written out does not match replaceWhere 'id == '$i''.
So my question is: isn't Spark the same thing whether it is used with Scala, Java, Python or even R? How can this error be possible then?
The Python code is not substituting the value of idd, so the resulting string is literally "id == '$i'", which is not what your Scala code produces. In other words,
.option("replaceWhere", "id == '$i'".format(i=idd))
should be
.option("replaceWhere", "id == '{i}'".format(i=idd))
Let me know if this change works for you.
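For reference, the reason is that Scala's s"..." interpolator substitutes $id directly, while Python's str.format only replaces {named} placeholders and leaves $i untouched. A tiny standalone illustration (with a made-up id value):
idd = "42"  # example id value
"id == '$i'".format(i=idd)   # -> "id == '$i'"  ($ is not a format placeholder)
"id == '{i}'".format(i=idd)  # -> "id == '42'"
f"id == '{idd}'"             # -> "id == '42'"  (f-string, Python 3.6+)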

How to use Spark Structured Streaming in PySpark to insert rows into MongoDB?

I am trying to integrate Kafka with Spark Structured Streaming in PySpark and write to a MongoDB sink. I need help correcting my code if I am going wrong somewhere.
I already have Kafka-to-PySpark and PySpark-to-Mongo working separately; now I am trying to wire up the full Kafka-PySpark-Mongo pipeline.
I'm using PySpark 2.4.5.
This is my code:
from kafka import KafkaProducer
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType

spark = SparkSession.builder \
    .appName("Spark Structured Streaming from Kafka") \
    .getOrCreate()

topic_name = "Be_"
kafka_broker = "localhost:9092"
producer = KafkaProducer(bootstrap_servers=kafka_broker)

jsonschema = StructType([
    StructField("id", StringType()), StructField("Date", StringType()),
    StructField("Name", StringType()), StructField("Hour", StringType()),
    StructField("Last_Price", FloatType()), StructField("Var%", FloatType()),
    StructField("Last_Value", FloatType()), StructField("TYpe", StringType())])

df = spark.readStream.format("kafka") \
    .option("kafka.bootstrap.servers", kafka_broker) \
    .option("startingOffsets", "latest") \
    .option("subscribe", topic_name) \
    .load() \
    .selectExpr("CAST(value AS STRING)")

def parse_data_from_kafka_message(sdf, schema):
    from pyspark.sql.functions import split
    assert sdf.isStreaming == True, "DataFrame doesn't receive streaming data"
    col = split(sdf['value'], ',')  # split attributes to a nested array in one column
    # now expand col to multiple top-level columns
    for idx, field in enumerate(schema):
        sdf = sdf.withColumn(field.name, col.getItem(idx).cast(field.dataType))
    return sdf.select([field.name for field in schema])

df = parse_data_from_kafka_message(df, jsonschema)

df \
    .writeStream \
    .format("mongo") \
    .option("com.mongodb.spark.sql.DefaultSource", "mongodb://localhost:27017/DataManagement.Data") \
    .outputMode("append") \
    .start() \
    .awaitTermination()
This is the error I get from the console:
Py4JJavaError: An error occurred while calling o263.start.
: java.lang.UnsupportedOperationException: Data source mongo does not support streamed writing
I also tried using the ForeachWriter:
from pymongo import MongoClient

class ForeachWriter:
    def open(self, partition_id, epoch_id):
        # Open connection. This method is optional in Python.
        self.connection = MongoClient('mongodb://localhost:27017')
        self.db = self.connection['DataManagement']
        self.coll = self.db['Data']
        pass

    def process(self, row):
        # Write row to connection. This method is NOT optional in Python.
        # self.coll = None
        self.coll.insert_one(row.asDict())
        pass

    def close(self, error):
        # Close the connection. This method is optional in Python.
        pass

df \
    .writeStream \
    .foreach(ForeachWriter()) \
    .trigger(processingTime='3 seconds') \
    .outputMode("Append") \
    .option("truncate", "false") \
    .start()
Unfortunately the MongoDB sink doesn't work either way, and I'd like to know if there is another way to send data to MongoDB using PySpark, or whether I'm doing something wrong in the code. Thank you very much.
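A commonly suggested workaround for connectors that lack streamed-write support (a sketch only, assuming Spark 2.4+ and the MongoDB Spark connector on the classpath, and reusing the URI/database/collection from the question) is foreachBatch, which performs a normal batch write for every micro-batch:
def write_to_mongo(batch_df, epoch_id):
    # Plain batch write of each micro-batch via the MongoDB Spark connector
    batch_df.write \
        .format("mongo") \
        .mode("append") \
        .option("uri", "mongodb://localhost:27017/DataManagement.Data") \
        .save()

df.writeStream \
    .foreachBatch(write_to_mongo) \
    .start() \
    .awaitTermination()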

PySpark Kafka streaming data handler

I'm using Spark 2.3.2 with PySpark and just figured out that foreach and foreachBatch are not available on the DataStreamWriter object in this version. The problem is that the company's Hadoop is 2.6, and Spark 2.4 (which provides what I need) doesn't work there (SparkSession crashes). Is there another alternative to send streaming data to a custom handler and process it?
This is my code so far:
from pyspark.sql.functions import col, from_json

def streamLoad(self, customHandler):
    options = self.options
    self.logger.info("Retrieving the schema based on the JSON structure")
    jsonStrings = ['{"sku":"9","ean":"4","name":"DVD","description":"foo description","categories":[{"code":"M02_BLURAY_E_DVD_PLAYER"}],"attributes":[{"name":"attrTeste","value":"Teste"}]}']
    myRDD = self.spark.sparkContext.parallelize(jsonStrings)
    jsonSchema = self.spark.read.json(myRDD).schema  # Maybe there is a way to serialize this
    self.logger.info("Starting the Kafka stream [options: {}]".format(str(options)))
    df = self.spark \
        .readStream \
        .format("kafka") \
        .option("maxFilesPerTrigger", 1) \
        .option("kafka.bootstrap.servers", options["kafka.bootstrap.servers"]) \
        .option("startingOffsets", options["startingOffsets"]) \
        .option("subscribe", options["subscribe"]) \
        .option("failOnDataLoss", options["failOnDataLoss"]) \
        .load() \
        .select(
            col('value').cast("string").alias('json'),
            col('key').cast("string").alias('kafka_key'),
            col("timestamp").cast("string").alias('kafka_timestamp')
        ) \
        .withColumn('pjson', from_json(col('json'), jsonSchema)).drop('json')

    query = df \
        .writeStream \
        .foreach(customHandler) \
        .start()  # .foreach() doesn't work in Spark 2.3.x for Python. Alternatives, please?
    query.awaitTermination()
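One alternative that does exist in PySpark 2.3 (a sketch only, assuming the spark-streaming-kafka-0-8 package is added via --packages and that the handler is a callable accepting an RDD, both of which are assumptions rather than details from the question) is to fall back to the older DStream API, where foreachRDD hands every micro-batch to a custom handler:
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

def stream_with_dstream(spark, options, handle_rdd, batch_interval=5):
    # handle_rdd is a hypothetical callable invoked with each micro-batch RDD
    ssc = StreamingContext(spark.sparkContext, batch_interval)
    stream = KafkaUtils.createDirectStream(
        ssc,
        [options["subscribe"]],
        {"metadata.broker.list": options["kafka.bootstrap.servers"]})
    # Each element of the stream is a (key, value) tuple decoded as strings
    stream.foreachRDD(handle_rdd)
    ssc.start()
    ssc.awaitTermination()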

Kafka to PySpark structured streaming, parsing JSON as a DataFrame

I am experimenting with Spark Structured Streaming (Spark v2.2.0) to consume JSON data from Kafka. However, I encountered the following error:
pyspark.sql.utils.StreamingQueryException: 'Missing required configuration "partition.assignment.strategy" which has no default value.'
Does anyone know why? The job was submitted using spark-submit below.
spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 sparksstream.py
This is the entire Python script:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession \
    .builder \
    .appName("test") \
    .getOrCreate()

# Define schema of json
schema = StructType() \
    .add("Session-Id", StringType()) \
    .add("TransactionTimestamp", IntegerType()) \
    .add("User-Name", StringType()) \
    .add("ID", StringType()) \
    .add("Timestamp", IntegerType())

# load data into spark-structured streaming
df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "xxxx:9092") \
    .option("subscribe", "topicName") \
    .load() \
    .select(from_json(col("value").cast("string"), schema).alias("parsed_value"))

# Print output
query = df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()
Use this to submit instead:
spark-submit \
--conf "spark.driver.extraClassPath=$SPARK_HOME/jars/kafka-clients-1.1.0.jar" \
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.2.0 \
sparksstream.py
This assumes that you have downloaded the kafka-clients*.jar into your $SPARK_HOME/jars folder.
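Once the job submits cleanly, a small follow-up sketch (reusing the df and schema defined in the question) for getting the parsed JSON fields as top-level DataFrame columns, which is what the title asks about:
# Expand the parsed_value struct into top-level columns before writing
flat = df.select("parsed_value.*")

query = flat.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()

query.awaitTermination()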