Is there a simple way to generate a schema from a StructType definition stored as a string?
For example, here is what I currently do:
from pyspark.sql.types import *
customSchema = StructType([StructField("Date",StringType(),True)])
And I'm looking to store the schema definition in a table and load it dynamically, like:
from pyspark.sql.types import *
stringSchema = 'StructType([StructField("Date",StringType(),True)])'
customSchema = SomeFunctionToConvertTextToStruct(stringSchema)
Any hint?
Regards,
Using eval?
from pyspark.sql.types import *
stringSchema = 'StructType([StructField("Date",StringType(),True)])'
customSchema = eval(stringSchema)
Olivier.
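As a quick usage check, here is a minimal sketch (the spark session and the CSV path are assumed for illustration) showing the rebuilt schema passed to a reader; since eval executes arbitrary code, the stored string should come from a trusted source:
customSchema = eval(stringSchema)  # rebuild the StructType from the stored text
df = spark.read.schema(customSchema).csv("/tmp/example.csv", header=True)  # hypothetical path
df.printSchema()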
Related
I have this CSV file, payments.csv, and for some particular rows the timestamp changes by itself; screenshots of the first three lines are attached for easier understanding.
import org.apache.spark.sql.functions.{col, when, to_date, row_number, date_add, expr}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().getOrCreate()
import spark.implicits._  // needs the spark value, so it comes after the session is created

// Read the csv
val df = spark.read.option("header", "true").option("inferSchema", "true").csv("payment.csv")

// Inspect the row whose timestamp looks wrong
val df2 = df.filter($"payment_id" === 21112)
df2.show()
val time_value = df2.collect()(0)(5)  // sixth column of the first matching row
println(time_value)
I am clueless about it as of now.
I have created a PySpark UDF by doing the following:
from urllib.parse import urljoin, urlparse
import unicodedata
from pyspark.sql.functions import col, udf, count, substring
from pyspark.sql.types import StringType
decode_udf = udf(lambda val: urljoin(unicodedata.normalize('NFKC',val), urlparse(unicodedata.normalize('NFKC',val)).path), StringType())
For reference, the code above takes a URL like this:
https://www.dagens.dk/udland/steve-irwins-soen-taet-paa-miste-livet-ny-video-viser-flugt-fra-kaempe-krokodille?utm_medium=Social&utm_source=Facebook#Echobox=1644308898
and transforms it into
https://www.dagens.dk/udland/steve-irwins-soen-taet-paa-miste-livet-ny-video-viser-flugt-fra-kaempe-krokodille
How can I convert this into Scala? I have tried many ways to replicate the code but have been unsuccessful. Thanks in advance.
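For reference, here is a minimal sketch of how the PySpark UDF above is applied before porting it; the DataFrame df and its url column are assumed for illustration:
# "df" and its "url" column are assumed here; decode_udf is the UDF defined above
df_clean = df.withColumn("clean_url", decode_udf(col("url")))
df_clean.select("url", "clean_url").show(truncate=False)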
I am new to AWS Glue and I am trying to run a transformation process using PySpark. I successfully ran my ETL, but I am looking for another way of converting a DataFrame to a DynamicFrame.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
# load data from crawler
students = glueContext.create_dynamic_frame.from_catalog(database="example_db", table_name="samp_csv")
# move data into a new variable for transformation
students_trans = students
# convert dynamicframe(students_trans) to dataframe
students_= students_trans.toDF()
# run transformation change column names/ drop columns
students_1= students_.withColumnRenamed("state","County").withColumnRenamed("capital","cap").drop("municipal",'metropolitan')
#students_1.printSchema()
#convert df back to dynamicframe
from awsglue.dynamicframe import DynamicFrame
students_trans = students_trans.fromDF(students_1, glueContext, "students_trans")
#load into s3 bucket
glueContext.write_dynamic_frame.from_options(
    frame = students_trans,
    connection_type = "s3",
    connection_options = {"path": "s3://kingb/target/"},
    format = "csv")
Another way is to call DynamicFrame.fromDF on the class itself rather than on an existing DynamicFrame:
from awsglue.dynamicframe import DynamicFrame
students_trans = DynamicFrame.fromDF(students_1, glueContext, "df")
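As a quick sanity check (a sketch using the variables above), you can confirm the converted DynamicFrame kept the renamed columns before writing it out:
students_trans.printSchema()   # should show "County" and "cap" instead of "state" and "capital"
print(students_trans.count())  # the row count is unchanged by the conversion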
I'm getting TypeError: StructType can not accept object 3000 in type <class 'int'>
Code is below. I can't see why 3000 can't be used as an integer. Am I doing something wrong?
import sys
from pyspark.sql.functions import *
from pyspark.sql.types import *
schema = StructType([StructField("pull_date",TimestampType(),True),StructField("Tot_Crdts",IntegerType(),True),StructField("Sum_Crdts",FloatType())])
data = [current_timestamp() , 3000, 1.00]
DFQuant = spark.createDataFrame(data, schema)
The data should be a list of 3-tuples. Also, the current_timestamp() function does not work outside of a DataFrame, so I have changed it to use the datetime package.
import sys
import datetime
from pyspark.sql.functions import *
from pyspark.sql.types import *
schema = StructType([
StructField("pull_date", TimestampType(), True),
StructField("Tot_Crdts", IntegerType(), True),
StructField("Sum_Crdts", FloatType())
])
data = [(datetime.datetime.now(), 3000, 1.00)]
DFQuant = spark.createDataFrame(data, schema)
DFQuant.show(truncate=False)
+--------------------------+---------+---------+
|pull_date |Tot_Crdts|Sum_Crdts|
+--------------------------+---------+---------+
|2021-06-23 13:57:10.363901|3000 |1.0 |
+--------------------------+---------+---------+
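If you want Spark's own clock rather than the driver's datetime, a minimal sketch (not part of the answer above) is to build the DataFrame without the timestamp and add the column with current_timestamp() afterwards:
from pyspark.sql.functions import current_timestamp

# build the frame without the timestamp column, then let Spark fill it in
data = [(3000, 1.00)]
df = spark.createDataFrame(data, ["Tot_Crdts", "Sum_Crdts"])
DFQuant = df.withColumn("pull_date", current_timestamp())
DFQuant.show(truncate=False)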
I am trying to create a DataFrame using a sample record. One of the fields is of DateType, and I am getting an error for the value provided in the DateType field. Please find the code below.
Error is
TypeError: field date: DateType can not accept object '2019-12-01' in type <class 'str'>
I tried to convert the StringType to DateType using to_date, plus some other ways, but was not able to do so. Please advise.
from pyspark.sql.functions import to_date,col,lit,expr
from pyspark.sql.types import StructType,StructField,IntegerType,DateType,StringType
from pyspark.sql import Row
MySchema = StructType([ StructField("CustomerID",IntegerType(),True),
StructField("Quantity",IntegerType(),True),
StructField("date",DateType(),True)
])
myRow=Row(10,100,"2019-12-01")
mydf=spark.createDataFrame([myRow],MySchema)
display(mydf)
You can use the datetime class to convert the string to a date:
from datetime import datetime
myRow=Row(10,100,datetime.strptime('2019-12-01','%Y-%m-%d'))
mydf=spark.createDataFrame([myRow],MySchema)
mydf.show()
It should work.
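Since the question mentions trying to_date, here is a sketch of another option (same column layout assumed): declare the date column as a StringType first and cast it afterwards:
from pyspark.sql.functions import to_date
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import Row

StringSchema = StructType([
    StructField("CustomerID", IntegerType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("date", StringType(), True)  # keep the raw string here
])
mydf = spark.createDataFrame([Row(10, 100, "2019-12-01")], StringSchema)
mydf = mydf.withColumn("date", to_date("date", "yyyy-MM-dd"))  # now a proper DateType column
mydf.show()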
What works for me (I'm on Python 3.8.12 and Spark version 3.0.1):
from datetime import datetime
from pyspark.sql.types import DateType, StructType, StructField, IntegerType, Row
from pyspark.sql import SparkSession
MySchema = StructType([ StructField("CustomerID",IntegerType(),True),
StructField("Quantity",IntegerType(),True),
StructField("date",DateType(),True)
])
spark = SparkSession.builder.appName("local").master("local").getOrCreate()
myRow=Row(10,100,datetime(2019, 12, 1))
mydf=spark.createDataFrame([myRow],MySchema)
mydf.show(truncate=False)  # I'm not on Databricks, so I use mydf.show(truncate=False) instead of display
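Because the column is DateType, a plain datetime.date works here too; a minor variant of the row above:
from datetime import date
myRow = Row(10, 100, date(2019, 12, 1))  # date object instead of datetime
mydf = spark.createDataFrame([myRow], MySchema)
mydf.show(truncate=False)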