I have the code below, and almost all of its transformations use the withColumn function, which returns a DataFrame. I convert the DataFrame returned from preProcessing to a Dataset using .as[Recipe], but since all of the functions return a DataFrame, calling .as over and over doesn't make sense.
So my question is: what is the use case for Dataset[U] over Dataset[Row]/DataFrame? And is it worth using a Dataset in my case, given that the schema changes with each transformation (withColumn)?
/** A single recipe record, used as the element type of `Dataset[Recipe]`.
  *
  * `datePublished` is a `java.sql.Date` rather than Spark's `DateType`:
  * `DateType` describes a column in a schema and is not a value type, so Spark
  * cannot derive an encoder for a field declared with it. A column whose SQL
  * type is `DateType` maps to `java.sql.Date` on the JVM side.
  *
  * @param name          recipe name
  * @param ingredients   ingredients, kept as a single string
  * @param url           recipe URL
  * @param image         image reference
  * @param cookTime      cooking time, as a string
  * @param recipeYield   yield of the recipe, as a string
  * @param datePublished publication date
  * @param prepTime      preparation time, as a string
  * @param description   free-text description
  */
final case class Recipe(
  name: String,
  ingredients: String,
  url: String,
  image: String,
  cookTime: String,
  recipeYield: String,
  datePublished: java.sql.Date,
  prepTime: String,
  description: String
)
/** Runs the pre-processing steps over the raw recipe data, in order:
  * `lowerCaseColumn` on "ingredients", `lowerCaseColumn` on "name", then
  * `covertStringToDate` on "datePublished".
  *
  * NOTE(review): the type parameter `T` and the `spark` argument are not used
  * by this method; they are kept so existing call sites stay valid.
  *
  * @param spark active Spark session (unused here)
  * @param data  input DataFrame; presumably has "ingredients", "name" and
  *              "datePublished" columns -- confirm against the helpers
  * @return the DataFrame produced by the last step
  */
private def preProcessing[T](spark: SparkSession, data: DataFrame): DataFrame = {
  val ingredientsLowered = data.transform(lowerCaseColumn("ingredients"))
  val nameLowered = ingredientsLowered.transform(lowerCaseColumn("name"))
  nameLowered.transform(covertStringToDate("datePublished"))
}
/** Applies the recipe transformation steps in a fixed order: filter via
  * `filterRecipesWithBeef`, persist the filtered data, convert "cookTime" and
  * "prepTime" via `covertRecipeTimeColToMinutes`, then apply
  * `calculateTotalCookingTime`, `calculateRecipeDifficulty` and
  * `calculateAvgCookingtimeByDifficulty`.
  *
  * NOTE(review): the filtered data is persisted with MEMORY_AND_DISK_SER but
  * is never unpersisted here and appears to be consumed only once in this
  * method -- confirm the cache is actually needed.
  *
  * @param spark active Spark session (unused here)
  * @param data  typed recipe dataset to transform
  * @return the DataFrame produced by the final step
  */
private def transform[T](
  spark: SparkSession,
  data: Dataset[Recipe]
): DataFrame = {
  val beefRecipes = data.transform(filterRecipesWithBeef())
  val cachedBeefRecipes = beefRecipes.persist(StorageLevel.MEMORY_AND_DISK_SER)

  val cookTimeConverted = cachedBeefRecipes.transform(covertRecipeTimeColToMinutes("cookTime"))
  val prepTimeConverted = cookTimeConverted.transform(covertRecipeTimeColToMinutes("prepTime"))

  val withTotalTime = prepTimeConverted.transform(calculateTotalCookingTime())
  val withDifficulty = withTotalTime.transform(calculateRecipeDifficulty())
  withDifficulty.transform(calculateAvgCookingtimeByDifficulty())
}