I want to get a StructType from a JSON file that looks like this:
"$schema": "http://json-schema.org/schema#",
"self": {
"vendor": "",
"name": "",
"format": "",
"commentaireVersion": "",
"version": "1.0.1"
},
.....
I tried to import the JSON file as a Dataset by creating case classes for each field, but that approach doesn't work for me, because I have to create a generic application that can read any other JSON file and get its corresponding StructType.
Should I perhaps convert the JSON to Avro?
Since your JSON has non-ASCII characters, you must first remove them to be able to use this solution:
# Native text type: ``unicode`` on Python 2, ``str`` on Python 3.
_TEXT_TYPE = type(u'')


def _strip_non_ascii(text):
    """Return ``text`` as a native ``str`` with non-ASCII characters dropped.

    ``encode('ascii', 'ignore')`` silently discards every character that
    cannot be represented in ASCII. On Python 2 the encoded value is already
    a native ``str``; on Python 3 it is ``bytes`` and is decoded back so the
    caller always receives text.
    """
    encoded = text.encode('ascii', 'ignore')
    if isinstance(encoded, str):
        return encoded
    return encoded.decode('ascii')


def _decode_value(value):
    """Strip non-ASCII characters from ``value``, recursing into containers.

    Text is cleaned, lists and dicts are processed recursively, and any
    other value (numbers, booleans, ``None``) is returned unchanged.
    """
    if isinstance(value, _TEXT_TYPE):
        return _strip_non_ascii(value)
    if isinstance(value, list):
        return _decode_list(value)
    if isinstance(value, dict):
        return _decode_dict(value)
    return value


def _decode_list(data):
    """Return a new list with non-ASCII characters removed from all text.

    Args:
        data: A list as produced by the ``json`` module (may contain nested
            lists and dicts).

    Returns:
        A new list; ``data`` itself is not modified.
    """
    return [_decode_value(item) for item in data]


def _decode_dict(data):
    """Return a new dict with non-ASCII characters removed from keys and values.

    Intended for use as the ``object_hook`` of ``json.load`` / ``json.loads``.

    Args:
        data: A dict as produced by the ``json`` module (may contain nested
            lists and dicts).

    Returns:
        A new dict; ``data`` itself is not modified. Keys that differ only
        in non-ASCII characters collapse into a single key, and the last
        one iterated wins.
    """
    rv = {}
    # ``items()`` exists on both Python 2 and 3, unlike ``iteritems()``.
    for key, value in data.items():
        if isinstance(key, _TEXT_TYPE):
            key = _strip_non_ascii(key)
        rv[key] = _decode_value(value)
    return rv
import io

# Decode the file explicitly as UTF-8 so non-ASCII characters are read the
# same way on every platform, rather than with the locale's default encoding.
# io.open accepts the encoding argument on both Python 2 and Python 3.
with io.open('my_json.json', 'r', encoding='utf-8') as f:
    # _decode_dict is called for every JSON object and strips non-ASCII
    # characters from its keys and values.
    json_dict = json.load(f, object_hook=_decode_dict)
Now that you only have ASCII characters, you can extract the StructType like this:
import json

# spark.read.json takes an RDD of JSON *strings*. Serialise the dict with
# json.dumps so Spark receives valid JSON (double quotes, true/false/null);
# the plain string form of a Python dict is not valid JSON.
rdd_JSON = sc.parallelize([json.dumps(json_dict)])

# Let Spark infer the schema from the single JSON document.
df_JSON = spark.read.json(rdd_JSON)

# The inferred schema is the StructType we are after.
schema = df_JSON.schema
df_JSON.printSchema()
Your resulting schema:
StructType(List(StructField($metadata,StructType(List(StructField($dataVector,StringType,true),StructField($dataset,StringType,true),StructField($datasource,StringType,true),StructField($fileFormat,StringType,true),StructField($ingestionMode,StringType,true),StructField($nameFormat,StringType,true))),true),StructField($schema,StringType,true),StructField(description,StringType,true),StructField(id,StringType,true),StructField(properties,StructType(List(StructField(content,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(resource,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(accountEtat,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(anonymizationDate,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(anonymized,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(ccuId,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(comptePrepaye,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(creationDate,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(currentSolde,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(solde,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(soldeDate,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringTyp
e,true))),true),StructField(id,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(rechargeCPPEncours,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(dateCGV,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateCreation,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateDerniereModification,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(defaultAddresses,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(payment,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressDetail,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressL4ExtVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4LibVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4MotVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4NumVoie,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(addressName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(appartment,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField
(building,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(ceaL4,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(codeInseeCommune,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(country,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(isoCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(name,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(doorCode1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(doorCode2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(mascadiaError,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(poBox,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(postalCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(quartierLettre,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(service,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(soColissimoDeliveryMode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetNumber,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(t
ypeVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(contact,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(cellPhone,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(company,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(email,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(firstName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(gender,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(lastName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(title,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(shipping,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressDetail,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(addressL4ExtVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4LibVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4MotVoi
e,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(addressL4NumVoie,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(addressName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(appartment,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(building,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(ceaL4,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(codeInseeCommune,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(country,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(isoCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(name,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(doorCode1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(doorCode2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(mascadiaError,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(poBox,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(postalCode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(quartierLettre,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))
),true),StructField(service,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(soColissimoDeliveryMode,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(streetNumber,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(typeVoie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(contact,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(cellPhone,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(company,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(email,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(firstName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(gender,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(lastName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone1,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone2,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(title,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true)))
,true),StructField(type,StringType,true))),true),StructField(etat,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(fraude,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(dateFraudeNiv1,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateFraudeNiv2,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(dateStatutFraude,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(statusFraude,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(guestFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(id,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(idGuest,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(identity,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(civility,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(dateOfBirth,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(firstName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(lastName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),tr
ue),StructField(mail,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(middleName,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(phone,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(isComptePrepaye,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(langage,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(listOfInterests,StructType(List(StructField(description,StringType,true),StructField(items,StructType(List(StructField(properties,StructType(List(StructField(description,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(name,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(uid,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(type,StringType,true))),true),StructField(marketing,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(codePromoParrain,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(isFilleul,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(nbConso,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(optins,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(infosGroupe,StructType(List(StructField(de
scription,StringType,true),StructField(type,StringType,true))),true),StructField(infosPartners,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(infosPhilaposte,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(infosSmsGroupe,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(infosSmsPartners,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(organization,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(codeCoclico,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(codeTypePorteFeuille,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(dateEcheanceDocCertif,StructType(List(StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(enseigne,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(function,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(nomSociete,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(numCartePro,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(secteurActivite,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(siret,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(tvaIntraCommunautaire,StructType(List(StructField(description,StringType,true),StructFiel
d(type,StringType,true))),true),StructField(typeEntreprise,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(philatelist,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(abonnementCataloguePhilatelique,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(abonnementCatalogue,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(abonnementSortie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(appetencePhilatelie,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(emailingPhilatelistFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(philateListFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(termsOfUse,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(testSondeFlg,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(type,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(schema,StructType(List(StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(context,StructType(List(StructField(properties,StructType(List(StructField(dateSentEvent,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructFie
ld($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true))),true),StructField(required,ArrayType(StringType,true),true),StructField(type,StringType,true))),true),StructField(messageId,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(operation,StructType(List(StructField(description,StringType,true),StructField(properties,StructType(List(StructField(actionDate,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(format,StringType,true),StructField(type,StringType,true))),true),StructField(operationType,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(enum,ArrayType(StringType,true),true),StructField(type,StringType,true))),true),StructField(patch,StructType(List(StructField(description,StringType,true),StructField(items,StructType(List(StructField($metadata,StructType(List(StructField(PATCH_RESOURCE_ID,StringType,true),StructField(PATCH_TARGET_SCHEMA,StringType,true))),true),StructField(properties,StructType(List(StructField(_corrupt_recordvalue,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(op,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(enum,ArrayType(StringT
ype,true),true),StructField(type,StringType,true))),true),StructField(path,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(type,StringType,true))),true))),true),StructField(type,StringType,true))),true),StructField(resourceId,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true),StructField($tags,ArrayType(StructType(List(StructField(rdf:type,StringType,true),StructField(rdfs:domain,StringType,true))),true),true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true),StructField(resourceType,StructType(List(StructField($metadata,StructType(List(StructField($dataKey,StringType,true),StructField($privacyLevel,StringType,true))),true),StructField(description,StringType,true),StructField(type,StringType,true))),true))),true),StructField(self,StructType(List(StructField(commentaireVersion,StringType,true),StructField(format,StringType,true),StructField(name,StringType,true),StructField(vendor,StringType,true),StructField(version,StringType,true))),true),StructField(title,StringType,true),StructField(type,StringType,true)))