Enviornment - spark-3.0.1-bin-hadoop2.7, ScalaLibraryContainer 2.12.3, Scala, SparkSQL, eclipse-jee-oxygen-2-linux-gtk-x86_64
I have a csv file having 3 columns with data-type :String,Long,Date. I want to group by first column which is string and retrieve the maximum date value.
To do this I have created RDD of Person objects from text file and converted it into dataframe 'peopleDF'. Registered the dataframe as a temporary view. I run the following sql statements using the sql methods provided by spark.
val maxDateDF = spark.sql("SELECT name, max(birthDate) maxDate FROM people group by name")
But it is not giving the correct max date for a name.
My sample data is as follows
Michael, 29,01/03/1992
Justin, 19,01/05/2002
Ben Stokes, 29,01/07/1992
Justin, 18,01/08/2003
Ben Stokes, 29,01/07/1993
Ben Stokes, 29,30/06/1993
How can I retrieve maximum date value group by other field name?
package org.apache.spark.examples.sql
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import java.sql.Date
import breeze.linalg.max
object SparkSQLExample1 {
case class Person(name: String, age: Long, birthDate: String)
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName("Spark SQL basic example")
.config("spark.master", "local").getOrCreate();
import spark.implicits._
runInferSchemaExample(spark);
spark.stop()
}
private def runInferSchemaExample(spark: SparkSession): Unit = {
import spark.implicits._
val peopleDF = spark.sparkContext
.textFile("examples/src/main/resources/people.txt")
.map(_.split(","))
.map(attributes => Person(attributes(0), attributes(1).trim.toInt,attributes(2)))
.toDF()
peopleDF.groupBy("age").count().show();
peopleDF.groupBy("name").avg("age").show();
peopleDF.createOrReplaceTempView("people")
implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
val maxDateDF = spark.sql("SELECT name, max(birthDate) maxDate FROM people group by name")
maxDateDF.map(teenager => teenager.getValuesMap[Any](List("name", "maxDate"))).collect().foreach(println)
}
}