
I want to experiment with Spark ML in Eclipse, and first I need to do some data manipulation. The code below performs that preprocessing.

My code:

package org.test.spark


import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.ml.feature.VectorAssembler

import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
import org.apache.spark.ml.{ Pipeline, PipelineStage }


import org.apache.spark.rdd.RDD

import org.apache.spark.sql._

object DataTest{

  import scala.reflect.runtime.universe.TypeTag

  case class Credit(
    creditability: Double,
    balance: Double, duration: Double, history: Double, purpose: Double, amount: Double,
    savings: Double, employment: Double, instPercent: Double, sexMarried: Double, guarantors: Double,
    residenceDuration: Double, assets: Double, age: Double, concCredit: Double, apartment: Double,
    credits: Double, occupation: Double, dependents: Double, hasPhone: Double, foreign: Double
  )

  def main(args: Array[String]): Unit = {

    //Start the Spark context
    val conf = new SparkConf()
      .setAppName("DataTest")
      .setMaster("local")
    val sc = new SparkContext(conf)

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    import sqlContext.implicits._


    // build a Credit case class instance from an array of Doubles
    def parseCredit(line: Array[Double]): Credit = {
      Credit(
        line(0),
        line(1) - 1, line(2), line(3), line(4), line(5),
        line(6) - 1, line(7) - 1, line(8), line(9) - 1, line(10) - 1,
        line(11) - 1, line(12) - 1, line(13), line(14) - 1, line(15) - 1,
        line(16) - 1, line(17) - 1, line(18) - 1, line(19) - 1, line(20) - 1
      )
    }

    // transform an RDD of comma-separated Strings into an RDD of Array[Double]
    def parseRDD(rdd: RDD[String]): RDD[Array[Double]] = {
      rdd.map(_.split(",")).map(_.map(_.toDouble))
    }

    val creditDF = parseRDD(sc.textFile("germancredit.csv")).map(parseCredit).toDF().cache()

    creditDF.registerTempTable("credit")

    creditDF.printSchema

    creditDF.show

    creditDF.groupBy("creditability").avg("balance").show

    sqlContext.sql("SELECT creditability, avg(balance) as avgbalance, avg(amount) as avgamt, avg(duration) as avgdur FROM credit GROUP BY creditability").show

    // define the feature columns to put in the feature vector
    val featureCols = Array("balance", "duration", "history", "purpose", "amount",
      "savings", "employment", "instPercent", "sexMarried", "guarantors",
      "residenceDuration", "assets", "age", "concCredit", "apartment",
      "credits", "occupation", "dependents", "hasPhone", "foreign")

    // set the input and output column names
    val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")

    // return a dataframe with all of the feature columns in a vector column
    val df2 = assembler.transform(creditDF)

    // the transform method produced a new column: features
    df2.show

    sc.stop
  }

}

When I run mvn clean install, I get the following error:

error: bad symbolic reference. A signature in PipelineStage.class refers to term internal in package org.apache.spark which is not available. It may be completely missing from the current classpath, or the version on the classpath might be incompatible with the version used when compiling PipelineStage.class. val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")

The problem seems to occur at the line val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")

When I run mvn clean package I get:

Failed to execute goal org.scala-tools:maven-scala-plugin:2.15.2:compile (default) on project spark: wrap: org.apache.commons.exec.ExecuteException: Process exited with an error: 1(Exit value: 1) -> [Help 1]

My pom.xml file:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.test</groupId>
    <artifactId>spark</artifactId>
    <version>0.0.1-SNAPSHOT</version>

    <pluginRepositories>
        <pluginRepository>
            <id>scala-tools.org</id>
            <name>Scala-tools Maven2 Repository</name>
            <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
    </pluginRepositories>


    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>1.6.0</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
            <version>1.6.0</version>
        </dependency>   


        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.10</artifactId>
            <version>2.0.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib-local_2.10</artifactId>
            <version>2.0.1</version>
        </dependency>

        <dependency>
            <groupId>com.databricks</groupId>
            <artifactId>spark-csv_2.10</artifactId>
            <version>1.2.0</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <!-- mixed scala/java compile -->
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>

                    <execution>
                        <id>compile</id>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                        <phase>compile</phase>
                    </execution>

                    <execution>
                        <id>test-compile</id>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                        <phase>test-compile</phase>
                    </execution>


                    <execution>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>

                </executions>
            </plugin>


            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
            <!-- for fatjar -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>assemble-all</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.0.2</version>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <mainClass>fully.qualified.MainClass</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
        <pluginManagement>
            <plugins>
                <!--This plugin's configuration is used to store Eclipse m2e settings 
                    only. It has no influence on the Maven build itself. -->
                <plugin>
                    <groupId>org.eclipse.m2e</groupId>
                    <artifactId>lifecycle-mapping</artifactId>
                    <version>1.0.0</version>
                    <configuration>
                        <lifecycleMappingMetadata>
                            <pluginExecutions>
                                <pluginExecution>
                                    <pluginExecutionFilter>
                                        <groupId>org.scala-tools</groupId>
                                        <artifactId>
                                            maven-scala-plugin
                                        </artifactId>
                                        <versionRange>
                                            [2.15.2,)
                                        </versionRange>
                                        <goals>
                                            <goal>compile</goal>
                                            <goal>testCompile</goal>
                                        </goals>
                                    </pluginExecutionFilter>
                                    <action>
                                        <execute></execute>
                                    </action>
                                </pluginExecution>
                            </pluginExecutions>
                        </lifecycleMappingMetadata>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>


</project>

Any suggestion on how I could resolve this error would be very helpful. Thank you.

1 Answer


Always use the same version for every artifact from the org.apache.spark group. In your pom.xml the versions are mixed:

    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.10</artifactId>
        <version>1.6.0</version>
    </dependency>   
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.10</artifactId>
        <version>2.0.1</version>
    </dependency>

Here spark-core and spark-sql use version 1.6.0, while spark-mllib and spark-mllib-local use 2.0.1. All Spark artifacts need to be on the same version. That is exactly what the error is telling you: spark-mllib 2.0.1 was compiled against Spark 2.x, where the package org.apache.spark.internal exists, but the spark-core 1.6.0 on your classpath has no such package, hence the "bad symbolic reference ... term internal in package org.apache.spark" when the compiler loads PipelineStage.class.
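
A minimal sketch of the fix, assuming you stay on Spark 1.6.0 (which matches your _2.10 Scala artifacts): declare the Spark version once as a Maven property and reference it from every Spark dependency, so the versions cannot drift apart. Note that spark-mllib-local only exists from Spark 2.0 onward, and spark-mllib already provides the ML classes used here, so I would drop it entirely:

    <properties>
        <!-- single source of truth for the Spark version -->
        <spark.version>1.6.0</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- spark-csv is versioned independently of Spark itself -->
        <dependency>
            <groupId>com.databricks</groupId>
            <artifactId>spark-csv_2.10</artifactId>
            <version>1.2.0</version>
        </dependency>
    </dependencies>

If you would rather move everything to 2.0.1, set spark.version to 2.0.1 for all of them, but expect small code changes as well (for example, Spark 2.x deprecates SQLContext in favour of SparkSession). Either way, you can verify what actually ends up on the classpath with mvn dependency:tree.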