This is for scala spark.
val totalMainArrayBuffer=collection.mutable.ArrayBuffer[String]()
def flatten_df_Struct(dfTemp:org.apache.spark.sql.DataFrame,dfTotalOuter:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame=
{
//dfTemp.printSchema
val totalStructCols=dfTemp.dtypes.map(x => x.toString.substring(1,x.toString.size-1)).filter(_.split(",",2)(1).contains("Struct")) // in case i the column names come with the word Struct embedded in it
val mainArrayBuffer=collection.mutable.ArrayBuffer[String]()
for(totalStructCol <- totalStructCols)
{
val tempArrayBuffer=collection.mutable.ArrayBuffer[String]()
tempArrayBuffer+=s"${totalStructCol.split(",")(0)}.*"
//tempArrayBuffer.toSeq.toDF.show(false)
val columnsInside=dfTemp.selectExpr(tempArrayBuffer:_*).columns
for(column <- columnsInside)
mainArrayBuffer+=s"${totalStructCol.split(",")(0)}.${column} as ${totalStructCol.split(",")(0)}_${column}"
//mainArrayBuffer.toSeq.toDF.show(false)
}
//dfTemp.selectExpr(mainArrayBuffer:_*).printSchema
val nonStructCols=dfTemp.selectExpr(mainArrayBuffer:_*).dtypes.map(x => x.toString.substring(1,x.toString.size-1)).filter(!_.split(",",2)(1).contains("Struct")) // in case i the column names come with the word Struct embedded in it
for (nonStructCol <- nonStructCols)
totalMainArrayBuffer+=s"${nonStructCol.split(",")(0).replace("_",".")} as ${nonStructCol.split(",")(0)}" // replacing _ by . in origial select clause if it's an already nested column
dfTemp.selectExpr(mainArrayBuffer:_*).dtypes.map(x => x.toString.substring(1,x.toString.size-1)).filter(_.split(",",2)(1).contains("Struct")).size
match {
case value if value ==0 => dfTotalOuter.selectExpr(totalMainArrayBuffer:_*)
case _ => flatten_df_Struct(dfTemp.selectExpr(mainArrayBuffer:_*),dfTotalOuter)
}
}
def flatten_df(dfTemp:org.apache.spark.sql.DataFrame):org.apache.spark.sql.DataFrame=
{
var totalArrayBuffer=collection.mutable.ArrayBuffer[String]()
val totalNonStructCols=dfTemp.dtypes.map(x => x.toString.substring(1,x.toString.size-1)).filter(!_.split(",",2)(1).contains("Struct")) // in case i the column names come with the word Struct embedded in it
for (totalNonStructCol <- totalNonStructCols)
totalArrayBuffer+=s"${totalNonStructCol.split(",")(0)}"
totalMainArrayBuffer.clear
flatten_df_Struct(dfTemp,dfTemp) // flattened schema is now in totalMainArrayBuffer
totalArrayBuffer=totalArrayBuffer++totalMainArrayBuffer
dfTemp.selectExpr(totalArrayBuffer:_*)
}
flatten_df(dfTotal.withColumn("tempStruct",lit(5))).printSchema
File
{"num1":1,"num2":2,"bool1":true,"bool2":false,"double1":4.5,"double2":5.6,"str1":"a","str2":"b","arr1":[3,4,5],"map1":{"cool":1,"okay":2,"normal":3},"carInfo":{"Engine":{"Make":"sa","Power":{"IC":"900","battery":"165"},"Redline":"11500"} ,"Tyres":{"Make":"Pirelli","Compound":"c1","Life":"120"}}}
{"num1":3,"num2":4,"bool1":false,"bool2":false,"double1":4.2,"double2":5.5,"str1":"u","str2":"n","arr1":[6,7,9],"map1":{"fast":1,"medium":2,"agressive":3},"carInfo":{"Engine":{"Make":"na","Power":{"IC":"800","battery":"150"},"Redline":"10000"} ,"Tyres":{"Make":"Pirelli","Compound":"c2","Life":"100"}}}
{"num1":8,"num2":4,"bool1":true,"bool2":true,"double1":5.7,"double2":7.5,"str1":"t","str2":"k","arr1":[11,12,23],"map1":{"preserve":1,"medium":2,"fast":3},"carInfo":{"Engine":{"Make":"ta","Power":{"IC":"950","battery":"170"},"Redline":"12500"} ,"Tyres":{"Make":"Pirelli","Compound":"c3","Life":"80"}}}
{"num1":7,"num2":9,"bool1":false,"bool2":true,"double1":33.2,"double2":7.5,"str1":"b","str2":"u","arr1":[12,14,5],"map1":{"normal":1,"preserve":2,"agressive":3},"carInfo":{"Engine":{"Make":"pa","Power":{"IC":"920","battery":"160"},"Redline":"11800"} ,"Tyres":{"Make":"Pirelli","Compound":"c4","Life":"70"}}}
Before:
root
|-- arr1: array (nullable = true)
| |-- element: long (containsNull = true)
|-- bool1: boolean (nullable = true)
|-- bool2: boolean (nullable = true)
|-- carInfo: struct (nullable = true)
| |-- Engine: struct (nullable = true)
| | |-- Make: string (nullable = true)
| | |-- Power: struct (nullable = true)
| | | |-- IC: string (nullable = true)
| | | |-- battery: string (nullable = true)
| | |-- Redline: string (nullable = true)
| |-- Tyres: struct (nullable = true)
| | |-- Compound: string (nullable = true)
| | |-- Life: string (nullable = true)
| | |-- Make: string (nullable = true)
|-- double1: double (nullable = true)
|-- double2: double (nullable = true)
|-- map1: struct (nullable = true)
| |-- agressive: long (nullable = true)
| |-- cool: long (nullable = true)
| |-- fast: long (nullable = true)
| |-- medium: long (nullable = true)
| |-- normal: long (nullable = true)
| |-- okay: long (nullable = true)
| |-- preserve: long (nullable = true)
|-- num1: long (nullable = true)
|-- num2: long (nullable = true)
|-- str1: string (nullable = true)
|-- str2: string (nullable = true
After:
root
|-- arr1: array (nullable = true)
| |-- element: long (containsNull = true)
|-- bool1: boolean (nullable = true)
|-- bool2: boolean (nullable = true)
|-- double1: double (nullable = true)
|-- double2: double (nullable = true)
|-- num1: long (nullable = true)
|-- num2: long (nullable = true)
|-- str1: string (nullable = true)
|-- str2: string (nullable = true)
|-- map1_agressive: long (nullable = true)
|-- map1_cool: long (nullable = true)
|-- map1_fast: long (nullable = true)
|-- map1_medium: long (nullable = true)
|-- map1_normal: long (nullable = true)
|-- map1_okay: long (nullable = true)
|-- map1_preserve: long (nullable = true)
|-- carInfo_Engine_Make: string (nullable = true)
|-- carInfo_Engine_Redline: string (nullable = true)
|-- carInfo_Tyres_Compound: string (nullable = true)
|-- carInfo_Tyres_Life: string (nullable = true)
|-- carInfo_Tyres_Make: string (nullable = true)
|-- carInfo_Engine_Power_IC: string (nullable = true)
|-- carInfo_Engine_Power_battery: string (nullable = true)
Tried for 2 Levels, it worked