u'DecisionTreeClassifier was given input with invalid label column label, without the number of classes specified. See StringIndexer.'

Question

    # Load the CSV file into an RDD of text lines.
    irisData = sc.textFile("/home/infademo/surya/iris.csv")
    irisData.cache()
    # The count is not stored anywhere; it is only visible when run interactively.
    irisData.count()

    # Remove the header line. The filter drops every line containing "Sepal",
    # so it relies on that text appearing only in the header row.
    dataLines = irisData.filter(lambda x: "Sepal" not in x)
    dataLines.count()

    from pyspark.sql import Row
    # Create a DataFrame from the data: split each line on commas, then build
    # one Row per line with the first four fields converted to float and the
    # fifth field (the species name) kept as a string.
    parts = dataLines.map(lambda l: l.split(","))
    irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
                                    SEPAL_WIDTH=float(p[1]), \
                                    PETAL_LENGTH=float(p[2]), \
                                    PETAL_WIDTH=float(p[3]), \
                                    SPECIES=p[4] ))

    # Build the DataFrame from the Rows (the schema comes from the Row fields).
    # Nothing here registers the DataFrame as a table.
    irisDf = sqlContext.createDataFrame(irisMap)
    irisDf.cache()

    # Add a numeric index column (IND_SPECIES) for the string label column (SPECIES).
    from pyspark.ml.feature import StringIndexer
    stringIndexer = StringIndexer(inputCol="SPECIES", outputCol="IND_SPECIES")
    si_model = stringIndexer.fit(irisDf)
    irisNormDf = si_model.transform(irisDf)

    # Inspect the species-name -> index mapping; the collected result is not
    # stored, so it is only visible when run interactively.
    irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()
    irisNormDf.cache()

    """--------------------------------------------------------------------------
    Perform Data Analytics
    -------------------------------------------------------------------------"""

    # Print the describe() summary of the DataFrame.
    irisNormDf.describe().show()

    # Find correlation between predictors and target: for every column whose
    # first-row value is not a string, print its correlation with IND_SPECIES.
    # IND_SPECIES itself is one of the columns, so it is compared with itself too.
    # NOTE: `basestring` exists only on Python 2.
    for i in irisNormDf.columns:
        if not( isinstance(irisNormDf.select(i).take(1)[0][0], basestring)) :
            print( "Correlation to Species for ", i, \
                        irisNormDf.stat.corr('IND_SPECIES',i))



    #Transform to a Data Frame for input to Machine Learning
    #Drop columns that are not required (low correlation)
    # NOTE(review): no column is actually dropped in this script; all four
    # measurements are used as features.

    from pyspark.mllib.linalg import Vectors
    from pyspark.mllib.linalg import SparseVector
    from pyspark.mllib.regression import LabeledPoint
    from pyspark.mllib.util import MLUtils
    # NOTE(review): the next line is Scala import syntax, not valid Python.
    import org.apache.spark.mllib.linalg.{Matrix, Matrices}
    from pyspark.mllib.linalg.distributed import RowMatrix

    # NOTE: this rebinds `Vectors`, replacing the pyspark.mllib.linalg import above.
    from pyspark.ml.linalg import Vectors
    # NOTE(review): bare expression with no assignment; it has no effect on the
    # rest of the script and needs the name `pyspark` to be bound -- TODO confirm intent.
    pyspark.mllib.linalg.Vector
    def transformToLabeledPoint(row) :
        """Turn one iris row into a (species, label, features) tuple.

        Despite its name, this returns a plain 3-tuple rather than a
        LabeledPoint object:

        * the species name, read from ``row["SPECIES"]``
        * the numeric label, read from ``row["IND_SPECIES"]``
        * a dense vector of the four measurements, in the order sepal length,
          sepal width, petal length, petal width
        """
        species = row["SPECIES"]
        label = row["IND_SPECIES"]
        # Collect the measurements in a fixed order before building the vector.
        feature_columns = ("SEPAL_LENGTH", "SEPAL_WIDTH",
                           "PETAL_LENGTH", "PETAL_WIDTH")
        measurements = [row[column] for column in feature_columns]
        return (species, label, Vectors.dense(measurements))




    # Apply transformToLabeledPoint to every row of the indexed DataFrame's RDD.
    irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
    # Build a brand-new DataFrame from the mapped values, naming its columns
    # explicitly.
    # NOTE(review): "label" here is a plain value taken from IND_SPECIES; any
    # column metadata StringIndexer attached to IND_SPECIES is presumably not
    # carried into this new DataFrame -- likely related to the error raised by
    # fit() further down. TODO confirm.
    irisLpDf = sqlContext.createDataFrame(irisLp,["species","label", "features"])
    # Print the first 10 rows.
    irisLpDf.select("species","label","features").show(10)
    irisLpDf.cache()

    """--------------------------------------------------------------------------
    Perform Machine Learning
    -------------------------------------------------------------------------"""
    # Split into training and testing data using weights 0.9 / 0.1.
    # No seed is passed to randomSplit.
    (trainingData, testData) = irisLpDf.randomSplit([0.9, 0.1])
    # The results of the next three calls are not stored; they are only
    # visible when run interactively.
    trainingData.count()
    testData.count()
    testData.collect()

    from pyspark.ml.classification import DecisionTreeClassifier
    # MulticlassClassificationEvaluator is imported but not used in this snippet.
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # Create the model: a decision tree limited to depth 2 that reads its
    # target from the "label" column and its inputs from the "features" column.
    dtClassifer = DecisionTreeClassifier(maxDepth=2, labelCol="label",
                                         featuresCol="features")

    # Train on the training split. This is the call that raises the
    # IllegalArgumentException shown in the traceback below.
    dtModel = dtClassifer.fit(trainingData)

    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/pipeline.py", line 69, in fit
        return self._fit(dataset)
      File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", line 133, in _fit
        java_model = self._fit_java(dataset)
      File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/ml/wrapper.py", line 130, in _fit_java
        return self._java_obj.fit(dataset._jdf)
      File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
      File "/opt/mapr/spark/spark-1.6.1-bin-hadoop2.6/python/pyspark/sql/utils.py", line 53, in deco
        raise IllegalArgumentException(s.split(': ', 1)[1], stackTrace)
    pyspark.sql.utils.IllegalArgumentException: u'DecisionTreeClassifier was given input with invalid label column label, without the number of classes specified. See StringIndexer.'

pltc pltc · Accepted Answer · 2017-08-22T17:11:01

According to the Spark 1.6.1 documentation:

We use two feature transformers to prepare the data; these help index categories for the label and categorical features, adding metadata to the DataFrame which the Decision Tree algorithm can recognize.

According to the Spark 1.6.1 source code:

// Obtain the number of classes from the label column's entry in the dataset
// schema; if it is not available there, reject the input.
val numClasses: Int = MetadataUtils.getNumClasses(dataset.schema($(labelCol))) match {
  case Some(n: Int) => n
  // No class count found for the label column: fail with a pointer to StringIndexer.
  case None => throw new IllegalArgumentException("DecisionTreeClassifier was given input" +
    s" with invalid label column ${$(labelCol)}, without the number of classes" +
    " specified. See StringIndexer.")
    // TODO: Automatically index labels: SPARK-7126
}

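So you need to apply StringIndexer to the label column and VectorIndexer to the features column, and pass the resulting DataFrame directly to DecisionTreeClassifier.fit. In your code the indexed column goes through `rdd.map` and a fresh `createDataFrame` call, which builds a new schema without the metadata that StringIndexer attached, so the classifier cannot determine the number of classes.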

u'DecisionTreeClassifier was given input with invalid label column label, without the number of classes specified. See StringIndexer.'

1 Answer