My objectif is to read Data from a csv file and convert my rdd to dataframe in scala/spark. This is my code :
package xxx.DataScience.CompensationStudy
import org.apache.spark._
import org.apache.log4j._
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types._
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
object CompensationAnalysis {
case class GetDF(profil_date:String, profil_pays:String, param_tarif2:String, param_tarif3:String, dt_titre:String, dt_langues:String,
dt_diplomes:String, dt_experience:String, dt_formation:String, dt_outils:String, comp_applications:String,
comp_interventions:String, comp_competence:String)
def main(args: Array[String]) {
Logger.getLogger("org").setLevel(Level.ERROR)
val conf = new SparkConf().setAppName("CompensationAnalysis ")
val sc = new SparkContext(conf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import sqlContext.implicits._
val lines = sc.textFile("C:/Users/../Downloads/CompensationStudy.csv").flatMap { l =>
l.split(",") match {
case field: Array[String] if field.size > 13 => Some(field(0), field(1), field(2), field(3), field(4), field(5), field(6), field(7), field(8), field(9), field(10), field(11), field(12))
case field: Array[String] if field.size == 1 => Some((field(0), "default value"))
case _ => None
}
}
At this stade, I had the error : Product with Serializable does not take parameters
val summary = lines.collect().map(x => GetDF(x("profil_date"), x("profil_pays"), x("param_tarif2"), x("param_tarif3"), x("dt_titre"), x("dt_langues"), x("dt_diplomes"), x("dt_experience"), x("dt_formation"), x("dt_outils"), x("comp_applications"), x("comp_interventions"), x("comp_competence")))
val sum_df = summary.toDF()
df.printSchema
}
}
This is a screenshot :
Help please ?
flatMap
defininglines
. The only type the compiler is able to infer isRDD[Product with Serializable]
, because you have different types in yourOption
s. - Cyrille Corpet