I found the solution to my problem here. Mr. Sujit gives a very detailed example, although his code is written for an older version of Lucene, so many things have to be changed. I'll update the details when I finish my code.
Here is my solution, which works with Lucene 4.4:
/**
 * Builds a TF-IDF weighted term-document matrix from a Lucene index and the
 * directory of files that was indexed.
 *
 * <p>Rows correspond to the distinct terms of the {@code contents} field and
 * columns to the readable, non-hidden files of the corpus directory, in the
 * order returned by {@link File#listFiles()}.
 *
 * <p>The helpers {@code countDocs(File)} and
 * {@code computeTfIdfWeight(long, long, int)} are called by this class but are
 * not part of this snippet; they must be supplied alongside it.
 */
public class BuildTermDocumentMatrix {

    /** Indexed field holding the document text; read via its term vectors. */
    private static final String CONTENTS_FIELD = "contents";

    /** Indexed field holding the file path, used to locate a file's document. */
    private static final String PATH_FIELD = "path";

    private final IndexReader reader;
    private final IndexSearcher searcher;
    private final File corpus;
    private final Map<String, Integer> termIdMap;

    /**
     * Opens the index and assigns a row id to every term of the contents field.
     *
     * @param index  directory containing the Lucene index
     * @param corpus directory containing the files that were indexed
     * @throws IOException if the index cannot be opened or read
     */
    public BuildTermDocumentMatrix(File index, File corpus) throws IOException {
        this.reader = DirectoryReader.open(FSDirectory.open(index));
        this.searcher = new IndexSearcher(reader);
        this.corpus = corpus;
        this.termIdMap = computeTermIdMap(reader);
    }

    /**
     * Maps each distinct term of the contents field to a consecutive id
     * starting at 0, in the order the terms are enumerated.
     *
     * @param reader reader over the index whose terms are enumerated
     * @return term text to row id; empty when the index has no such field
     * @throws IOException if the terms cannot be read
     */
    private Map<String, Integer> computeTermIdMap(IndexReader reader) throws IOException {
        Map<String, Integer> ids = new HashMap<String, Integer>();
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            // An index without postings exposes no fields at all.
            return ids;
        }
        Terms terms = fields.terms(CONTENTS_FIELD);
        if (terms == null) {
            return ids;
        }
        TermsEnum itr = terms.iterator(null);
        int nextId = 0;
        BytesRef term;
        while ((term = itr.next()) != null) {
            String termText = term.utf8ToString();
            if (!ids.containsKey(termText)) {
                ids.put(termText, nextId++);
            }
        }
        return ids;
    }

    /**
     * Builds the term-document matrix for the corpus directory.
     *
     * <p>Each readable, non-hidden file is looked up in the index by its path;
     * the term vector of its contents field supplies the terms and their
     * in-document frequencies. A document without a term vector (for example
     * one with no terms) keeps an all-zero column.
     *
     * @return matrix with one row per term and one column per document
     * @throws IOException           if the corpus directory cannot be listed or
     *                               the index cannot be read
     * @throws IllegalStateException if a corpus file has no document in the
     *                               index, or a term vector contains a term
     *                               that is missing from the term id map
     */
    public RealMatrix buildTermDocumentMatrix() throws IOException {
        File[] files = corpus.listFiles();
        if (files == null) {
            // listFiles() returns null for a non-directory or on an I/O error.
            throw new IOException("Cannot list corpus directory: " + corpus);
        }

        int numDocs = countDocs(corpus);
        int numTerms = termIdMap.size();
        RealMatrix tdMatrix = new Array2DRowRealMatrix(numTerms, numDocs);

        int col = 0;
        for (File f : files) {
            if (f.isHidden() || !f.canRead()) {
                continue;
            }
            String path = f.getPath();
            TopDocs hits = searcher.search(new TermQuery(new Term(PATH_FIELD, path)), 1);
            if (hits.scoreDocs.length == 0) {
                throw new IllegalStateException("No indexed document found for path: " + path);
            }

            Terms termVector = reader.getTermVector(hits.scoreDocs[0].doc, CONTENTS_FIELD);
            if (termVector != null) {
                TermsEnum itr = termVector.iterator(null);
                BytesRef term;
                while ((term = itr.next()) != null) {
                    String termText = term.utf8ToString();
                    Integer row = termIdMap.get(termText);
                    if (row == null) {
                        throw new IllegalStateException(
                                "Term '" + termText + "' from " + path + " is not in the term id map");
                    }
                    // Within a term vector this is the term's frequency in this document.
                    long termFreq = itr.totalTermFreq();
                    // A term vector behaves like a single-document index, so the
                    // enumerator's own docFreq() does not reflect the corpus; ask
                    // the main index for the number of documents with the term.
                    long docCount = reader.docFreq(new Term(CONTENTS_FIELD, termText));
                    double weight = computeTfIdfWeight(termFreq, docCount, numDocs);
                    tdMatrix.setEntry(row, col, weight);
                }
            }
            col++;
        }
        return tdMatrix;
    }

    /**
     * Closes the underlying index reader. Call once the matrix has been built.
     *
     * @throws IOException if the reader cannot be closed
     */
    public void close() throws IOException {
        reader.close();
    }
}