How to use only one map for many input files? Because Hadoop creates one mapper for one file. I need only one mapper for all files.
I tried to use CombineFileInputFormat. It had one mapper, but map input contained only one file. I need that input map value to contain data from all files (Text format) like this :
Input map value :
data from file1.txt
data from file2.txt
data from file3.txt
public class WholeFileInputFormat extends CombineFileInputFormat<NullWritable, Text> {
public WholeFileInputFormat() {
super();
setMaxSplitSize(67108864);
}
@Override
protected boolean isSplitable(JobContext context, Path file) {
return false;
}
@Override
public RecordReader<NullWritable, Text> createRecordReader(
InputSplit split, TaskAttemptContext context) throws IOException {
if (!(split instanceof CombineFileSplit)) {
throw new IllegalArgumentException("split must be a CombineFileSplit");
}
RecordReader<NullWritable, Text> r = new CombineFileRecordReader<NullWritable, Text>((CombineFileSplit) split, context, WholeFileRecordReader.class);
return r;
//return null;
}
}
public class WholeFileRecordReader extends RecordReader<NullWritable, Text> {
private final Text mFileText;
public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context,
Integer pathToProcess) throws IOException {
mProcessed = false;
mFileToRead = fileSplit.getPath(pathToProcess);
mFileLength = fileSplit.getLength(pathToProcess);
mConf = context.getConfiguration();
assert 0 == fileSplit.getOffset(pathToProcess);
FileSystem fs = FileSystem.get(mConf);
assert fs.getFileStatus(mFileToRead).getLen() == mFileLength;
// mFileName = new Text();
mFileText = new Text();
}
@Override
public void close() throws IOException {
mFileText.clear();
}
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
return NullWritable.get();
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return mFileText;
}
@Override
public float getProgress() throws IOException, InterruptedException {
return (mProcessed) ? (float) 1.0 : (float) 0.0;
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// no-op.
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!mProcessed) {
if (mFileLength > (long) Integer.MAX_VALUE) {
throw new IOException("File is longer than Integer.MAX_VALUE.");
}
byte[] contents = new byte[(int) mFileLength];
FileSystem fs = mFileToRead.getFileSystem(mConf);
FSDataInputStream in = null;
try {
// Set the contents of this file.
in = fs.open(mFileToRead);
IOUtils.readFully(in, contents, 0, contents.length);
mFileText.set(contents, 0, contents.length);
} finally {
IOUtils.closeStream(in);
}
mProcessed = true;
return true;
}
return false;
}
}
Could you help me?