Please help, I'm a complete noob when it comes to Spark and Hadoop in general. My code looks like this:
import java.io.IOException;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public static void main(String[] args) throws IOException {
    // Extra jars to ship to the cluster
    String[] jars = {"D:\\customJars\\sparky.jar", "D:\\customJars\\guava-19.0.jar"};
    // winutils binaries, needed to run Hadoop client code on Windows
    System.setProperty("hadoop.home.dir", "D:\\hadoop-common-2.2.0-bin-master");

    SparkConf sparkConf = new SparkConf()
            .setAppName("com.nucleus.spark.MlibPOC")
            .setMaster("spark://10.1.50.165:7077")
            .setJars(jars);
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    SQLContext sqlContext = new SQLContext(jsc);

    DataFrame df = sqlContext.read().json("src/com/nucleus/spark/generated.json");
}
My Spark cluster is a single-node deployment at 10.1.50.165 running RHEL 6. When I run this simple code, the attempt to read the JSON file fails with exceptions like:
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 4 times, most recent failure: Lost task 1.3 in stage 0.0 (TID 5, 10.1.50.165): java.io.FileNotFoundException: File file:/D:/WorkSpace2/SparkHadoopProject/src/com/nucleus/spark/generated.json does not exist
    at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:534)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:747)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:524)
    at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:409)
    at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:140)
    at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:341)
    at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
    at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:108)
    at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
    at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:237)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:208)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1832)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1952)
    at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:1025)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.reduce(RDD.scala:1007)
    at org.apache.spark.rdd.RDD$$anonfun$treeAggregate$1.apply(RDD.scala:1150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
    at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1127)
    at org.apache.spark.sql.execution.datasources.json.InferSchema$.infer(InferSchema.scala:65)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:114)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation$$anonfun$4.apply(JSONRelation.scala:109)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema$lzycompute(JSONRelation.scala:109)
    at org.apache.spark.sql.execution.datasources.json.JSONRelation.dataSchema(JSONRelation.scala:108)
    at org.apache.spark.sql.sources.HadoopFsRelation.schema$lzycompute(interfaces.scala:636)
    at org.apache.spark.sql.sources.HadoopFsRelation.schema(interfaces.scala:635)
    at org.apache.spark.sql.execution.datasources.LogicalRelation.<init>(LogicalRelation.scala:37)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:125)
    at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:109)
    at org.apache.spark.sql.DataFrameReader.json(DataFrameReader.scala:244)
    at com.nucleus.spark.MlibPOC.main(MlibPOC.java:44)
Caused by: java.io.FileNotFoundException: File file:/D:/WorkSpace2/SparkHadoopProject/src/com/nucleus/spark/generated.json does not exist
    at org.apache.hadoop.fs.RawLocalFileSystem.deprecatedGetFileStatus(RawLocalFileSystem.java:534)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileLinkStatusInternal(RawLocalFileSystem.java:747)
    at org.apache.hadoop.fs.RawLocalFileSystem.getFileStatus(RawLocalFileSystem.java:524)
    at org.apache.hadoop.fs.FilterFileSystem.getFileStatus(FilterFileSystem.java:409)
    at org.apache.hadoop.fs.ChecksumFileSystem$ChecksumFSInputChecker.<init>(ChecksumFileSystem.java:140)
    at org.apache.hadoop.fs.ChecksumFileSystem.open(ChecksumFileSystem.java:341)
    at org.apache.hadoop.fs.FileSystem.open(FileSystem.java:766)
    at org.apache.hadoop.mapred.LineRecordReader.<init>(LineRecordReader.java:108)
    at org.apache.hadoop.mapred.TextInputFormat.getRecordReader(TextInputFormat.java:67)
    at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:237)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:208)
    at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:101)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:66)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
FileNotFoundException is the clue here... it can't find D:/WorkSpace2/SparkHadoopProject/src/com/nucleus/spark/generated.json. – Binary Nerd
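In other words, the path passed to json() is resolved by the executors on the Linux cluster, not by the driver on the Windows machine, so a relative project path like src/com/nucleus/spark/generated.json can never be found there. A minimal sketch of one way around this, assuming the file is first copied to HDFS on the cluster (the namenode host/port and target path below are placeholders, not from the original post):

    // Sketch only: read the JSON from a location every executor can reach.
    // Assumes generated.json was uploaded to HDFS beforehand; the
    // hdfs://10.1.50.165:9000/data/ URI is a placeholder, not confirmed.
    DataFrame df = sqlContext.read()
            .json("hdfs://10.1.50.165:9000/data/generated.json");

Any fully qualified URI on a filesystem shared by all workers (HDFS, NFS, etc.) would work the same way; what matters is that the path resolves on the worker nodes, not only on the driver.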