
I am trying to find out why the Java code below does not work when I run it on Hadoop.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class PageStat implements Tool {
private Configuration conf;

@Override
public int run(String[] args) throws Exception {
    Job job = new Job(getConf());
    String jobName = "Page visit statistics MR";
    job.setJobName(jobName);

    job.setJarByClass(PageStat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(PageStat.PageStatMapper.class);
    job.setReducerClass(PageStat.PageStatReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));

    int status =  job.waitForCompletion(true) ? 0 : 1;
    return status;
}

public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new PageStat(), args);
    System.exit(exitCode);
}

public void setConf(Configuration conf) {
   this.conf = conf;
}

public Configuration getConf() {
    return conf;
}

public static class PageStatMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private Text keyHolder = new Text();
    private IntWritable valueHolder = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
        String[] items  =  value.toString().split(",");
        if (items.length == 3) {
            String url = items[1];
            keyHolder.set(url);
            Integer duration = Integer.parseInt(items[2]);
            valueHolder.set(duration);
            context.write(keyHolder, valueHolder);
        } else {
            context.getCounter("Error", "invalidData").increment(1);
        }
    }        
}   

public static class PageStatReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private Text keyHolder = new Text();
    private IntWritable valueHolder = new IntWritable();
    private String statType;
    private int count;
    private int totalTime;
    private int avTime;

    protected void setup(Context context) throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        statType = conf.get("page.stat");
    }

    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
        count = 0;
        totalTime = 0;
        for (IntWritable value : values){
            ++count;
            totalTime += value.get();
        } 
        avTime = totalTime / count;

        keyHolder.set(key);
        if (statType.equals("average")){
            valueHolder.set(avTime);
        } else {
            valueHolder.set(totalTime);
        }
        context.write(keyHolder, valueHolder);
    }
}    

}

The error is:

c:\hadoop-training\tutorial02-jobtracker>hadoop jar PageStat.jar PageStat jobtracker/input/visit_5000000.txt jobtracker/output
13/07/29 11:24:50 INFO input.FileInputFormat: Total input paths to process : 1
log4j:ERROR Failed to rename [c:\Hadoop\hadoop-1.1.0-SNAPSHOT\logs/hadoop.log] to [c:\Hadoop\hadoop-1.1.0-SNAPSHOT\logs/hadoop.log.2013-07-26].
13/07/29 11:24:51 INFO util.NativeCodeLoader: Loaded the native-hadoop library
13/07/29 11:24:51 WARN snappy.LoadSnappy: Snappy native library not loaded
13/07/29 11:24:54 INFO mapred.JobClient: Running job: job_201307261340_0001
13/07/29 11:24:55 INFO mapred.JobClient:  map 0% reduce 0%
13/07/29 11:25:24 INFO mapred.JobClient:  map 1% reduce 0%
13/07/29 11:25:27 INFO mapred.JobClient:  map 6% reduce 0%
13/07/29 11:25:30 INFO mapred.JobClient:  map 14% reduce 0%
13/07/29 11:25:35 INFO mapred.JobClient:  map 22% reduce 0%
13/07/29 11:25:38 INFO mapred.JobClient:  map 31% reduce 0%
13/07/29 11:25:41 INFO mapred.JobClient:  map 35% reduce 0%
13/07/29 11:25:44 INFO mapred.JobClient:  map 44% reduce 0%
13/07/29 11:25:47 INFO mapred.JobClient:  map 50% reduce 0%
13/07/29 11:26:03 INFO mapred.JobClient:  map 60% reduce 0%
13/07/29 11:26:06 INFO mapred.JobClient:  map 64% reduce 0%
13/07/29 11:26:09 INFO mapred.JobClient:  map 69% reduce 0%
13/07/29 11:26:12 INFO mapred.JobClient:  map 76% reduce 0%
13/07/29 11:26:15 INFO mapred.JobClient:  map 81% reduce 0%
13/07/29 11:26:18 INFO mapred.JobClient:  map 85% reduce 0%
13/07/29 11:26:21 INFO mapred.JobClient:  map 87% reduce 0%
13/07/29 11:26:24 INFO mapred.JobClient:  map 92% reduce 0%
13/07/29 11:26:27 INFO mapred.JobClient:  map 94% reduce 0%
13/07/29 11:26:30 INFO mapred.JobClient:  map 96% reduce 0%
13/07/29 11:26:33 INFO mapred.JobClient:  map 97% reduce 0%
13/07/29 11:26:37 INFO mapred.JobClient:  map 99% reduce 8%
13/07/29 11:26:40 INFO mapred.JobClient:  map 100% reduce 8%
13/07/29 11:26:46 INFO mapred.JobClient:  map 100% reduce 25%
13/07/29 11:26:54 INFO mapred.JobClient: Task Id : attempt_201307261340_0001_r_000000_0, Status : FAILED
java.lang.NullPointerException
        at PageStat$PageStatReducer.reduce(PageStat.java:120)
        at PageStat$PageStatReducer.reduce(PageStat.java:96)
        at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177)
        at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:651)
        at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)
        at org.apache.hadoop.mapred.Child$4.run(Child.java:271)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:396)
        at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1135)
        at org.apache.hadoop.mapred.Child.main(Child.java:265)

13/07/29 11:26:56 INFO mapred.JobClient:  map 100% reduce 0%
13/07/29 11:27:05 INFO mapred.JobClient:  map 100% reduce 8%
13/07/29 11:27:08 INFO mapred.JobClient:  map 100% reduce 33%
13/07/29 11:27:10 INFO mapred.JobClient: Task Id : attempt_201307261340_0001_r_000000_1, Status : FAILED
java.lang.NullPointerException (same stack trace as above)

13/07/29 11:27:11 INFO mapred.JobClient:  map 100% reduce 0%
13/07/29 11:27:20 INFO mapred.JobClient:  map 100% reduce 8%
13/07/29 11:27:23 INFO mapred.JobClient:  map 100% reduce 25%
13/07/29 11:27:25 INFO mapred.JobClient: Task Id : attempt_201307261340_0001_r_000000_2, Status : FAILED
java.lang.NullPointerException (same stack trace as above)

13/07/29 11:27:26 INFO mapred.JobClient:  map 100% reduce 0%
13/07/29 11:27:38 INFO mapred.JobClient:  map 100% reduce 25%
13/07/29 11:27:41 INFO mapred.JobClient:  map 100% reduce 0%
13/07/29 11:27:43 INFO mapred.JobClient: Job complete: job_201307261340_0001
13/07/29 11:27:43 INFO mapred.JobClient: Counters: 24
13/07/29 11:27:43 INFO mapred.JobClient:   Job Counters
13/07/29 11:27:43 INFO mapred.JobClient:     Launched reduce tasks=4
13/07/29 11:27:43 INFO mapred.JobClient:     SLOTS_MILLIS_MAPS=179086
13/07/29 11:27:43 INFO mapred.JobClient:     Total time spent by all reduces waiting after reserving slots (ms)=0
13/07/29 11:27:43 INFO mapred.JobClient:     Total time spent by all maps waiting after reserving slots (ms)=0
13/07/29 11:27:43 INFO mapred.JobClient:     Launched map tasks=4
13/07/29 11:27:43 INFO mapred.JobClient:     Data-local map tasks=4
13/07/29 11:27:43 INFO mapred.JobClient:     Failed reduce tasks=1
13/07/29 11:27:43 INFO mapred.JobClient:     SLOTS_MILLIS_REDUCES=106513
13/07/29 11:27:43 INFO mapred.JobClient:   FileSystemCounters
13/07/29 11:27:43 INFO mapred.JobClient:     FILE_BYTES_READ=179504086
13/07/29 11:27:43 INFO mapred.JobClient:     HDFS_BYTES_READ=254931072
13/07/29 11:27:43 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=359099432
13/07/29 11:27:43 INFO mapred.JobClient:   File Input Format Counters
13/07/29 11:27:43 INFO mapred.JobClient:     Bytes Read=254930544
13/07/29 11:27:43 INFO mapred.JobClient:   Map-Reduce Framework
13/07/29 11:27:43 INFO mapred.JobClient:     Map output materialized bytes=179499502
13/07/29 11:27:43 INFO mapred.JobClient:     Combine output records=0
13/07/29 11:27:43 INFO mapred.JobClient:     Map input records=5000000
13/07/29 11:27:43 INFO mapred.JobClient:     Physical memory (bytes) snapshot=851607552
13/07/29 11:27:43 INFO mapred.JobClient:     Spilled Records=10000000
13/07/29 11:27:43 INFO mapred.JobClient:     Map output bytes=169499478
13/07/29 11:27:43 INFO mapred.JobClient:     CPU time spent (ms)=81308
13/07/29 11:27:43 INFO mapred.JobClient:     Total committed heap usage (bytes)=746323968
13/07/29 11:27:43 INFO mapred.JobClient:     Virtual memory (bytes) snapshot=988401664
13/07/29 11:27:43 INFO mapred.JobClient:     Combine input records=0
13/07/29 11:27:43 INFO mapred.JobClient:     Map output records=5000000
13/07/29 11:27:43 INFO mapred.JobClient:     SPLIT_RAW_BYTES=528

Thanks!!!


2 Answers


I had a similar problem; you need to pass the property on the command line with the -D flag when you run the job:

-Dpage.stat=total
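
Because PageStat is launched through ToolRunner, the GenericOptionsParser picks up -D options placed before the positional arguments and puts them in the job configuration. Using the paths from the question, the full command would look something like:

c:\hadoop-training\tutorial02-jobtracker>hadoop jar PageStat.jar PageStat -Dpage.stat=total jobtracker/input/visit_5000000.txt jobtracker/output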

You may also see log4j warnings like these:

log4j:WARN No appenders could be found for logger (org.apache.hadoop.hdfs.DFSClient).
log4j:WARN Please initialize the log4j system properly.
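
Those warnings usually just mean log4j couldn't find a configuration file on the client classpath, and they are unrelated to the NullPointerException. A minimal log4j.properties (a sketch, assuming console output is enough for you) would look something like:

log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n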

That's not the full answer; I'm still getting to the bottom of it myself.


The line numbers in the stack trace don't seem to line up with the source code posted. Has the code changed since this run?

The NullPointerException is almost certainly thrown on the if (statType.equals("average")) line. Nothing sets "page.stat" in the configuration: it is neither hard-coded in the run method nor passed as an argument at job submission, so conf.get("page.stat") returns null in setup() and statType is still null when reduce() dereferences it.
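
If you want the reducer to work even when the property is missing, one option (a sketch, assuming "total" is an acceptable default) is to use the two-argument Configuration.get() in setup():

protected void setup(Context context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    // Fall back to "total" so statType is never null when "page.stat" is unset
    statType = conf.get("page.stat", "total");
}

Alternatively, set the property in run() before submitting, e.g. job.getConfiguration().set("page.stat", "total");, or pass it on the command line with -Dpage.stat=... as the other answer suggests.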