1
votes

I have a Java Spark process which I launch with: spark-submit --class MyClass target/MyJar.jar

The last step of this process writes the output locally and then copies it to S3, because the output file needs to have a specific name (it could also be written to S3 directly and moved there; for the sake of the question, the error is the same either way).

The code compiles and runs, but when it gets to the code snippet below I get the following error:

java.lang.NoClassDefFoundError: com/amazonaws/services/s3/AmazonS3 at java.lang.Class.forName0(Native Method) at java.lang.Class.forName(Class.java:348) at org.apache.spark.util.Utils$.classForName(Utils.scala:229) at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:700) at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:187) at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:212) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:126) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) Caused by: java.lang.ClassNotFoundException: com.amazonaws.services.s3.AmazonS3 at java.net.URLClassLoader.findClass(URLClassLoader.java:381) at java.lang.ClassLoader.loadClass(ClassLoader.java:424) at java.lang.ClassLoader.loadClass(ClassLoader.java:357)

/**
 * Writes {@code df} as a single CSV file (with a header row) into a uniquely named temporary
 * local directory, uploads the resulting data file to S3, and then removes the temporary
 * directory.
 *
 * <p>The object key is obtained from {@code getS3path(outputPath)} (defined elsewhere). The same
 * key is requested for every matching file, so if more than one file matched, later uploads
 * would presumably overwrite earlier ones; {@code coalesce(1)} is what keeps the output to a
 * single part file.
 *
 * @param df the dataset to export
 * @param outputBucket name of the destination S3 bucket
 * @param outputPath value passed to {@code getS3path} to derive the destination object key
 * @throws IOException if the temporary output directory cannot be listed after the write
 */
public static void saveToS3(Dataset<Row> df, String outputBucket, String outputPath) throws IOException {
    String tmpFile = "temp" + Long.toString(System.nanoTime());
    File directory = new File(tmpFile);
    try {
        df.coalesce(1).write().option("header", true).csv(tmpFile);

        // listFiles() returns null (not an empty array) when the path is not a readable directory.
        File[] outputFiles = directory.listFiles();
        if (outputFiles == null) {
            throw new IOException(
                    "Cannot list temporary output directory: " + directory.getAbsolutePath());
        }

        AmazonS3 s3client = new AmazonS3Client();
        for (File file : outputFiles) {
            String name = file.getName();
            // Spark names its data files "part-NNNNN..." (hyphen); checksum and marker files
            // (".part-*.crc", "_SUCCESS") match neither condition and are skipped.
            if (name.startsWith("part-") || name.endsWith("csv")) {
                s3client.putObject(new PutObjectRequest(outputBucket, getS3path(outputPath), file));
            }
        }
    } finally {
        // Best-effort cleanup that also runs when the write or the upload fails, so temporary
        // directories do not accumulate. Delete results are intentionally not checked.
        File[] leftovers = directory.listFiles();
        if (leftovers != null) {
            for (File leftover : leftovers) {
                leftover.delete();
            }
        }
        directory.delete();
    }
}

I use the following dependencies which might be relevant

        <!-- AWS SDK for Java, pinned to version 1.7.4. -->
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk</artifactId>
            <version>1.7.4</version>
        </dependency>
        <!-- Hadoop AWS module, version 2.7.2. The exclusions below drop the log4j logging
             artifacts and the servlet API artifacts from its transitive dependencies
             (presumably to avoid clashes with versions supplied elsewhere; confirm). -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-aws</artifactId>
            <version>2.7.2</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>log4j</groupId>
                    <artifactId>log4j</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>javax.servlet</groupId>
                    <artifactId>servlet-api</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.mortbay.jetty</groupId>
                    <artifactId>servlet-api-2.5</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

Any idea?

Dependency tree -

[INFO] +- org.apache.spark:spark-core_2.11:jar:2.1.0:compile
[INFO] |  +- org.apache.avro:avro-mapred:jar:hadoop2:1.7.7:compile
[INFO] |  |  +- org.apache.avro:avro-ipc:jar:1.7.7:compile
[INFO] |  |  +- org.apache.avro:avro-ipc:jar:tests:1.7.7:compile
[INFO] |  |  +- org.codehaus.jackson:jackson-core-asl:jar:1.9.13:compile
[INFO] |  |  \- org.codehaus.jackson:jackson-mapper-asl:jar:1.9.13:compile
[INFO] |  +- com.twitter:chill_2.11:jar:0.8.0:compile
[INFO] |  |  \- com.esotericsoftware:kryo-shaded:jar:3.0.3:compile
[INFO] |  |     +- com.esotericsoftware:minlog:jar:1.3.0:compile
[INFO] |  |     \- org.objenesis:objenesis:jar:2.1:compile
[INFO] |  +- com.twitter:chill-java:jar:0.8.0:compile
[INFO] |  +- org.apache.xbean:xbean-asm5-shaded:jar:4.4:compile
[INFO] |  +- org.apache.hadoop:hadoop-client:jar:2.2.0:compile
[INFO] |  |  +- org.apache.hadoop:hadoop-hdfs:jar:2.2.0:compile
[INFO] |  |  +- org.apache.hadoop:hadoop-mapreduce-client-app:jar:2.2.0:compile
[INFO] |  |  |  +- org.apache.hadoop:hadoop-mapreduce-client-common:jar:2.2.0:compile
[INFO] |  |  |  |  +- org.apache.hadoop:hadoop-yarn-client:jar:2.2.0:compile
[INFO] |  |  |  |  |  \- com.google.inject:guice:jar:3.0:compile
[INFO] |  |  |  |  |     +- javax.inject:javax.inject:jar:1:compile
[INFO] |  |  |  |  |     \- aopalliance:aopalliance:jar:1.0:compile
[INFO] |  |  |  |  \- org.apache.hadoop:hadoop-yarn-server-common:jar:2.2.0:compile
[INFO] |  |  |  \- org.apache.hadoop:hadoop-mapreduce-client-shuffle:jar:2.2.0:compile
[INFO] |  |  +- org.apache.hadoop:hadoop-yarn-api:jar:2.2.0:compile
[INFO] |  |  +- org.apache.hadoop:hadoop-mapreduce-client-core:jar:2.2.0:compile
[INFO] |  |  |  \- org.apache.hadoop:hadoop-yarn-common:jar:2.2.0:compile
[INFO] |  |  +- org.apache.hadoop:hadoop-mapreduce-client-jobclient:jar:2.2.0:compile
[INFO] |  |  \- org.apache.hadoop:hadoop-annotations:jar:2.2.0:compile
[INFO] |  +- org.apache.spark:spark-launcher_2.11:jar:2.1.0:compile
[INFO] |  +- org.apache.spark:spark-network-common_2.11:jar:2.1.0:compile
[INFO] |  |  \- org.fusesource.leveldbjni:leveldbjni-all:jar:1.8:compile
[INFO] |  +- org.apache.spark:spark-network-shuffle_2.11:jar:2.1.0:compile
[INFO] |  +- org.apache.spark:spark-unsafe_2.11:jar:2.1.0:compile
[INFO] |  +- net.java.dev.jets3t:jets3t:jar:0.7.1:compile
[INFO] |  |  \- commons-httpclient:commons-httpclient:jar:3.1:compile
[INFO] |  +- org.apache.curator:curator-recipes:jar:2.4.0:compile
[INFO] |  |  +- org.apache.curator:curator-framework:jar:2.4.0:compile
[INFO] |  |  +- org.apache.zookeeper:zookeeper:jar:3.4.5:compile
[INFO] |  |  \- com.google.guava:guava:jar:14.0.1:compile
[INFO] |  +- javax.servlet:javax.servlet-api:jar:3.1.0:compile
[INFO] |  +- org.apache.commons:commons-lang3:jar:3.5:compile
[INFO] |  +- org.apache.commons:commons-math3:jar:3.4.1:compile
[INFO] |  +- com.google.code.findbugs:jsr305:jar:1.3.9:compile
[INFO] |  +- org.slf4j:slf4j-api:jar:1.7.16:compile
[INFO] |  +- org.slf4j:jul-to-slf4j:jar:1.7.16:compile
[INFO] |  +- org.slf4j:jcl-over-slf4j:jar:1.7.16:compile
[INFO] |  +- com.ning:compress-lzf:jar:1.0.3:compile
[INFO] |  +- org.xerial.snappy:snappy-java:jar:1.1.2.6:compile
[INFO] |  +- net.jpountz.lz4:lz4:jar:1.3.0:compile
[INFO] |  +- org.roaringbitmap:RoaringBitmap:jar:0.5.11:compile
[INFO] |  +- commons-net:commons-net:jar:2.2:compile
[INFO] |  +- org.scala-lang:scala-library:jar:2.11.8:compile
[INFO] |  +- org.json4s:json4s-jackson_2.11:jar:3.2.11:compile
[INFO] |  |  \- org.json4s:json4s-core_2.11:jar:3.2.11:compile
[INFO] |  |     +- org.json4s:json4s-ast_2.11:jar:3.2.11:compile
[INFO] |  |     +- com.thoughtworks.paranamer:paranamer:jar:2.6:compile
[INFO] |  |     \- org.scala-lang:scalap:jar:2.11.0:compile
[INFO] |  |        \- org.scala-lang:scala-compiler:jar:2.11.0:compile
[INFO] |  |           \- org.scala-lang.modules:scala-parser-combinators_2.11:jar:1.0.1:compile
[INFO] |  +- org.glassfish.jersey.core:jersey-client:jar:2.22.2:compile
[INFO] |  |  +- javax.ws.rs:javax.ws.rs-api:jar:2.0.1:compile
[INFO] |  |  +- org.glassfish.hk2:hk2-api:jar:2.4.0-b34:compile
[INFO] |  |  |  +- org.glassfish.hk2:hk2-utils:jar:2.4.0-b34:compile
[INFO] |  |  |  \- org.glassfish.hk2.external:aopalliance-repackaged:jar:2.4.0-b34:compile
[INFO] |  |  +- org.glassfish.hk2.external:javax.inject:jar:2.4.0-b34:compile
[INFO] |  |  \- org.glassfish.hk2:hk2-locator:jar:2.4.0-b34:compile
[INFO] |  |     \- org.javassist:javassist:jar:3.18.1-GA:compile
[INFO] |  +- org.glassfish.jersey.core:jersey-common:jar:2.22.2:compile
[INFO] |  |  +- javax.annotation:javax.annotation-api:jar:1.2:compile
[INFO] |  |  +- org.glassfish.jersey.bundles.repackaged:jersey-guava:jar:2.22.2:compile
[INFO] |  |  \- org.glassfish.hk2:osgi-resource-locator:jar:1.0.1:compile
[INFO] |  +- org.glassfish.jersey.core:jersey-server:jar:2.22.2:compile
[INFO] |  |  +- org.glassfish.jersey.media:jersey-media-jaxb:jar:2.22.2:compile
[INFO] |  |  \- javax.validation:validation-api:jar:1.1.0.Final:compile
[INFO] |  +- org.glassfish.jersey.containers:jersey-container-servlet:jar:2.22.2:compile
[INFO] |  +- org.glassfish.jersey.containers:jersey-container-servlet-core:jar:2.22.2:compile
[INFO] |  +- io.netty:netty-all:jar:4.0.42.Final:compile
[INFO] |  +- io.netty:netty:jar:3.8.0.Final:compile
[INFO] |  +- com.clearspring.analytics:stream:jar:2.7.0:compile
[INFO] |  +- io.dropwizard.metrics:metrics-core:jar:3.1.2:compile
[INFO] |  +- io.dropwizard.metrics:metrics-jvm:jar:3.1.2:compile
[INFO] |  +- io.dropwizard.metrics:metrics-json:jar:3.1.2:compile
[INFO] |  +- io.dropwizard.metrics:metrics-graphite:jar:3.1.2:compile
[INFO] |  +- com.fasterxml.jackson.module:jackson-module-scala_2.11:jar:2.6.5:compile
[INFO] |  |  +- org.scala-lang:scala-reflect:jar:2.11.7:compile
[INFO] |  |  \- com.fasterxml.jackson.module:jackson-module-paranamer:jar:2.6.5:compile
[INFO] |  +- org.apache.ivy:ivy:jar:2.4.0:compile
[INFO] |  +- oro:oro:jar:2.0.8:compile
[INFO] |  +- net.razorvine:pyrolite:jar:4.13:compile
[INFO] |  +- net.sf.py4j:py4j:jar:0.10.4:compile
[INFO] |  +- org.apache.spark:spark-tags_2.11:jar:2.1.0:compile
[INFO] |  |  \- org.scalatest:scalatest_2.11:jar:2.2.6:compile
[INFO] |  |     \- org.scala-lang.modules:scala-xml_2.11:jar:1.0.2:compile
[INFO] |  +- org.apache.commons:commons-crypto:jar:1.0.0:compile
[INFO] |  \- org.spark-project.spark:unused:jar:1.0.0:compile
[INFO] +- org.apache.hadoop:hadoop-aws:jar:2.7.2:compile
[INFO] |  \- org.apache.hadoop:hadoop-common:jar:2.7.2:compile
[INFO] |     +- commons-cli:commons-cli:jar:1.2:compile
[INFO] |     +- xmlenc:xmlenc:jar:0.52:compile
[INFO] |     +- commons-io:commons-io:jar:2.4:compile
[INFO] |     +- commons-collections:commons-collections:jar:3.2.2:compile
[INFO] |     +- org.mortbay.jetty:jetty:jar:6.1.26:compile
[INFO] |     +- org.mortbay.jetty:jetty-util:jar:6.1.26:compile
[INFO] |     +- javax.servlet.jsp:jsp-api:jar:2.1:runtime
[INFO] |     +- com.sun.jersey:jersey-core:jar:1.9:compile
[INFO] |     +- com.sun.jersey:jersey-json:jar:1.9:compile
[INFO] |     |  +- org.codehaus.jettison:jettison:jar:1.1:compile
[INFO] |     |  +- com.sun.xml.bind:jaxb-impl:jar:2.2.3-1:compile
[INFO] |     |  |  \- javax.xml.bind:jaxb-api:jar:2.2.2:compile
[INFO] |     |  |     +- javax.xml.stream:stax-api:jar:1.0-2:compile
[INFO] |     |  |     \- javax.activation:activation:jar:1.1:compile
[INFO] |     |  +- org.codehaus.jackson:jackson-jaxrs:jar:1.8.3:compile
[INFO] |     |  \- org.codehaus.jackson:jackson-xc:jar:1.8.3:compile
[INFO] |     +- com.sun.jersey:jersey-server:jar:1.9:compile
[INFO] |     |  \- asm:asm:jar:3.1:compile
[INFO] |     +- commons-lang:commons-lang:jar:2.6:compile
[INFO] |     +- commons-configuration:commons-configuration:jar:1.6:compile
[INFO] |     |  +- commons-digester:commons-digester:jar:1.8:compile
[INFO] |     |  |  \- commons-beanutils:commons-beanutils:jar:1.7.0:compile
[INFO] |     |  \- commons-beanutils:commons-beanutils-core:jar:1.8.0:compile
[INFO] |     +- org.apache.avro:avro:jar:1.7.4:compile
[INFO] |     +- com.google.protobuf:protobuf-java:jar:2.5.0:compile
[INFO] |     +- com.google.code.gson:gson:jar:2.2.4:compile
[INFO] |     +- org.apache.hadoop:hadoop-auth:jar:2.7.2:compile
[INFO] |     |  \- org.apache.directory.server:apacheds-kerberos-codec:jar:2.0.0-M15:compile
[INFO] |     |     +- org.apache.directory.server:apacheds-i18n:jar:2.0.0-M15:compile
[INFO] |     |     +- org.apache.directory.api:api-asn1-api:jar:1.0.0-M20:compile
[INFO] |     |     \- org.apache.directory.api:api-util:jar:1.0.0-M20:compile
[INFO] |     +- com.jcraft:jsch:jar:0.1.42:compile
[INFO] |     +- org.apache.curator:curator-client:jar:2.7.1:compile
[INFO] |     +- org.apache.htrace:htrace-core:jar:3.1.0-incubating:compile
[INFO] |     \- org.apache.commons:commons-compress:jar:1.4.1:compile
[INFO] |        \- org.tukaani:xz:jar:1.0:compile
[INFO] +- org.apache.spark:spark-sql_2.10:jar:2.1.0:compile
[INFO] |  +- com.univocity:univocity-parsers:jar:2.2.1:compile
[INFO] |  +- org.apache.spark:spark-sketch_2.10:jar:2.1.0:compile
[INFO] |  +- org.apache.spark:spark-core_2.10:jar:2.1.0:compile
[INFO] |  |  +- com.twitter:chill_2.10:jar:0.8.0:compile
[INFO] |  |  +- org.apache.spark:spark-launcher_2.10:jar:2.1.0:compile
[INFO] |  |  +- org.apache.spark:spark-network-common_2.10:jar:2.1.0:compile
[INFO] |  |  +- org.apache.spark:spark-network-shuffle_2.10:jar:2.1.0:compile
[INFO] |  |  +- org.apache.spark:spark-unsafe_2.10:jar:2.1.0:compile
[INFO] |  |  +- org.json4s:json4s-jackson_2.10:jar:3.2.11:compile
[INFO] |  |  |  \- org.json4s:json4s-core_2.10:jar:3.2.11:compile
[INFO] |  |  |     \- org.json4s:json4s-ast_2.10:jar:3.2.11:compile
[INFO] |  |  \- com.fasterxml.jackson.module:jackson-module-scala_2.10:jar:2.6.5:compile
[INFO] |  +- org.apache.spark:spark-catalyst_2.10:jar:2.1.0:compile
[INFO] |  |  +- org.codehaus.janino:janino:jar:3.0.0:compile
[INFO] |  |  +- org.codehaus.janino:commons-compiler:jar:3.0.0:compile
[INFO] |  |  \- org.antlr:antlr4-runtime:jar:4.5.3:compile
[INFO] |  +- org.apache.spark:spark-tags_2.10:jar:2.1.0:compile
[INFO] |  |  \- org.scalatest:scalatest_2.10:jar:2.2.6:compile
[INFO] |  +- org.apache.parquet:parquet-column:jar:1.8.1:compile
[INFO] |  |  +- org.apache.parquet:parquet-common:jar:1.8.1:compile
[INFO] |  |  \- org.apache.parquet:parquet-encoding:jar:1.8.1:compile
[INFO] |  \- org.apache.parquet:parquet-hadoop:jar:1.8.1:compile
[INFO] |     +- org.apache.parquet:parquet-format:jar:2.3.0-incubating:compile
[INFO] |     \- org.apache.parquet:parquet-jackson:jar:1.8.1:compile
[INFO] +- com.amazonaws:aws-java-sdk:jar:1.7.4:compile
[INFO] |  +- commons-logging:commons-logging:jar:1.1.1:compile
[INFO] |  +- org.apache.httpcomponents:httpclient:jar:4.2:compile
[INFO] |  |  \- org.apache.httpcomponents:httpcore:jar:4.2:compile
[INFO] |  +- commons-codec:commons-codec:jar:1.3:compile
[INFO] |  \- joda-time:joda-time:jar:2.9.9:compile (version selected from constraint [2.2,))
[INFO] +- com.fasterxml.jackson.core:jackson-core:jar:2.7.3:compile
[INFO] +- com.fasterxml.jackson.core:jackson-databind:jar:2.7.3:compile
[INFO] \- com.fasterxml.jackson.core:jackson-annotations:jar:2.7.3:compile
2
Run mvn dependency:tree and post the output; it looks like you're missing a dependency of some sort. – Compass
@Compass added, thanks. I would expect it not to compile if a dependency is missing. – Tom Ron

2 Answers

2
votes

You appear to be using a really old version of the AWS SDK (1.7.4), from 2014. Your Apache Spark is 2.1, which is from 2016.

Commit history for the AmazonS3Client only goes back to 1.9, so it very well may not exist in your AWS SDK.

The reason your code compiles is that your code itself isn't the problem; rather, one of your dependencies is trying to use a more recent version of the S3 client that your old SDK doesn't have. In other words, this isn't a code problem, it's a dependency management problem. If you're not directly using the SDK at all, it's safe to switch to a more appropriate AWS SDK 1.11.x; if you are, you'll have to migrate your code to that version.

Typically, the spark-core module would declare its own dependency on a version of the AWS SDK, but I assume it was deliberately left out so that you can provide your own version of the SDK without any sort of conflict, with the caveat that if you use too old a version, it just breaks.

0
votes

After massaging my pom file I was able to get to a stable build with the following pom file, packaging it with:

mvn clean compile assembly:single;

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Spark job: declares the Hadoop AWS, Spark and AWS S3 SDK dependencies
     and configures the assembly plugin to produce a single jar-with-dependencies archive. -->
<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xmlns="http://maven.apache.org/POM/4.0.0"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- NOTE(review): artifactId is spelled "arifact" while the mainClass package below is
         "com.group.artifact"; confirm which spelling is intended. -->
    <groupId>com.group</groupId>
    <artifactId>arifact</artifactId>
    <packaging>jar</packaging>
    <version>1.0-SNAPSHOT</version>

    <!-- Java 8 source/target level. jdk.version is referenced by the compiler plugin
         configuration below; java.version is not referenced anywhere in this file. -->
    <properties>
        <java.version>1.8</java.version>
        <jdk.version>1.8</jdk.version>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-aws</artifactId>
            <version>2.8.2</version>
        </dependency>
        <dependency> <!-- Spark dependency -->
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <!-- Spark SQL, same Scala suffix (_2.11) and version as spark-core above. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.1.1</version>
        </dependency>
        <!-- S3 module of the AWS SDK for Java. -->
        <dependency>
            <groupId>com.amazonaws</groupId>
            <artifactId>aws-java-sdk-s3</artifactId>
            <version>1.11.229</version>
        </dependency>
    </dependencies>
    <build>
      <plugins>
              <!-- Compiles with the source/target level taken from the jdk.version property. -->
              <plugin>
                  <groupId>org.apache.maven.plugins</groupId>
                  <artifactId>maven-compiler-plugin</artifactId>
                  <version>2.3.2</version>
                  <configuration>
                      <source>${jdk.version}</source>
                      <target>${jdk.version}</target>
                  </configuration>
              </plugin>
          <!-- Packages the project and its dependencies into one jar whose manifest names the
               main class. No execution is bound to a lifecycle phase here, so the goal has to
               be invoked explicitly (e.g. assembly:single). -->
          <plugin>
              <artifactId>maven-assembly-plugin</artifactId>
              <configuration>
                  <archive>
                      <manifest>
                          <mainClass>com.group.artifact.MainClass</mainClass>
                      </manifest>
                  </archive>
                  <descriptorRefs>
                      <descriptorRef>jar-with-dependencies</descriptorRef>
                  </descriptorRefs>
              </configuration>
          </plugin>
      </plugins>
    </build>
</project>