Spark SQL on Nested JSON array

Question

I have the below two JSON messages

[{
  "identifier": {
    "domain": "OFFICE ADDRESS",
    "id": "987654321",
    "version": 1
  },
  "payload": {
    
    "contactMethods": [
      {
        "faxDiallingNumber": "0000/11111",
        "objId": 8,
        "type": "Fax",
        "use": "Business Address"
      },
      {
        "objId": 9,
        "telephoneDiallingNumber": "0999/99999",
        "telephoneType": "Fixed",
        "type": "Telephone",
        "use": "Business Address"
      },
      {
        "addressLine1": "house no",
        "addressLine3": "street name",
        "addressLine4": "area name",
        "cityCode": "city name",
        "countryCode": "US",
        "objId": 10,
        "postalCode": "12345",
        "preferredContactMethodFlag": true,
        "type": "International Address",
        "use": "Registered"
      }
    ]
  }
},
{
  "identifier": {
    "domain": "HOME ADDRESS",
    "id": "123456789",
    "version": 1
  },
  "payload": {
    
    "contactMethods": [
      {
        "faxDiallingNumber": "0000/22222",
        "objId": 11,
        "type": "Fax",
        "use": "home Address"
      },
      {
        "addressLine1": "house no",
        "addressLine3": "street name",
        "addressLine4": "area name",
        "cityCode": null,
        "countryCode": "US",
        "objId": 12,
        "postalCode": "45678",
        "preferredContactMethodFlag": true,
        "type": "International Address",
        "use": "Registered"
      },
      {
        "objId": 13,
        "telephoneDiallingNumber": "0999/88888,
        "telephoneType": "Fixed",
        "type": "Telephone",
        "use": "home Address"
      }
    ]
  }
}
]

Using pyspark Spark SQL, I am trying something like below to find out the id where type is "International Address" and cityCode is null.

My output should be

Could you please let me know the correct and working syntax.

I tried explode, array_contains, flatten functions but haven't got it right.

select 
identified.id ,
payload.contactMethods.type,
payload.contactMethods.cityCode
from sample_json_df -- (will create a dataframe on the json file using pyspark)
where
payload.contactMethods.type = 'International Address'
and payload.contactMethods.cityCode is null

Matt Matt · Accepted Answer · 2021-03-25T20:55:28

explode should work like that:

SELECT
  id,
  contactMethods.type as type,
  contactMethods.cityCode
FROM
  (
    SELECT
      identifier.id,
      explode(payload.contactMethods) as contactMethods
    FROM
      sample_json_df
  )
WHERE
  contactMethods.type = 'International Address'
  AND contactMethods.cityCode is null

Spark SQL on Nested JSON array

1 Answers