0
votes

How can I restrict Lucene.Net to index only those terms whose length is greater than x? I am indexing the document as follows:

        // Location of the index on disk and of the data that is being indexed.
        string indexDirectory = @"C:\Users\user\Desktop\Index";
        string dataDirectory = @"C:\Users\user\Desktop\Data";

        // The writer runs every tokenized field through the standard analyzer.
        var analyzer = new StandardAnalyzer();
        var writer = new IndexWriter(indexDirectory, analyzer);

        var doc = new Document();

        // "path": tokenized, not stored, no term vector.
        var pathField = new Field("path", dataDirectory, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO);
        // "content": tokenized, not stored, with a term vector so the indexed
        // terms can be read back later via GetTermFreqVectors.
        var contentField = new Field("content", ReadTextFile(dataDirectory), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES);

        doc.Add(pathField);
        doc.Add(contentField);

I am using the following code to read the indexed terms back from the Lucene index:

        // Keep a reference to the reader so it can be closed; the original
        // one-liner opened a reader and dropped it, leaking the open index files.
        IndexReader reader = IndexReader.Open(indexDirectory);
        try
        {
            // Term vectors of the first document (doc id 0). Only fields indexed
            // with a term vector contribute an entry.
            TermFreqVector[] vectors = reader.GetTermFreqVectors(0);

            // GetTermFreqVectors returns null when the document has no
            // vectorized fields, so guard before enumerating.
            if (vectors != null)
            {
                foreach (Lucene.Net.Index.TermFreqVector vector in vectors)
                {
                    String[] terms = vector.GetTerms();

                    foreach (String term in terms)
                    {
                        // loop through indexed terms
                    }
                }
            }
        }
        finally
        {
            reader.Close();
        }
1

1 Answer

2
votes

You could implement your own Analyzer, or extend the StandardAnalyzer.

Example:

TokenFilter + Analyzer

/// <summary>
/// Token filter that passes through only those tokens whose term text is at
/// least a given number of characters long; shorter tokens are skipped.
/// </summary>
public class MinTermLengthTokenFilter : TokenFilter
{
    // Minimum term length (inclusive) a token needs in order to be emitted.
    private readonly int _minTermLength;

    // Term attribute used to inspect the current token's length.
    private readonly TermAttribute _termAttribute;

    /// <summary>
    /// Creates a filter over <paramref name="input"/>.
    /// </summary>
    /// <param name="maxTermLength">
    /// The MINIMUM term length to keep (inclusive). Despite its name, this
    /// value is used as the lower bound; the name is kept as-is so existing
    /// callers are unaffected.
    /// </param>
    /// <param name="input">The token stream to filter.</param>
    public MinTermLengthTokenFilter(int maxTermLength, TokenStream input)
        : base(input)
    {
        _minTermLength = maxTermLength;
        _termAttribute = (TermAttribute)AddAttribute(typeof(TermAttribute));
    }

    /// <summary>
    /// Advances the wrapped stream to the next token that is long enough.
    /// </summary>
    /// <returns>
    /// <c>true</c> if such a token was found; <c>false</c> once the wrapped
    /// stream is exhausted.
    /// </returns>
    public override bool IncrementToken()
    {
        do
        {
            if (!input.IncrementToken())
            {
                // Wrapped stream has no more tokens.
                return false;
            }
        }
        while (_termAttribute.TermLength() < _minTermLength);

        return true;
    }
}


/// <summary>
/// A <see cref="StandardAnalyzer"/> whose token streams are additionally
/// passed through a <see cref="MinTermLengthTokenFilter"/>, so that terms
/// shorter than the configured minimum length are dropped.
/// </summary>
public class MinTermLengthAnalyzer : StandardAnalyzer
{
    // Minimum term length (inclusive) handed to every filter this analyzer creates.
    private readonly int _minTermLength;

    /// <summary>
    /// Creates the analyzer.
    /// </summary>
    /// <param name="minTermLength">Minimum length (inclusive) of the terms to keep.</param>
    public MinTermLengthAnalyzer(int minTermLength)
    {
        _minTermLength = minTermLength;
    }

    /// <summary>
    /// Returns the standard analyzer's token stream wrapped in the length filter.
    /// </summary>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        return DropShortTerms(base.TokenStream(fieldName, reader));
    }

    /// <summary>
    /// Returns the standard analyzer's reusable token stream wrapped in the
    /// length filter. A new filter instance is created on every call.
    /// </summary>
    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        return DropShortTerms(base.ReusableTokenStream(fieldName, reader));
    }

    // Wraps the given stream in a filter that applies the configured minimum length.
    private TokenStream DropShortTerms(TokenStream source)
    {
        return new MinTermLengthTokenFilter(_minTermLength, source);
    }
}

Indexing:

 // Build the document up front; this touches no index resources.
 Document document = new Document();

 document.Add(new Field(
     "text",
     "some sample text for demonstration",
     Field.Store.YES,
     Field.Index.ANALYZED,
     Field.TermVector.WITH_POSITIONS_OFFSETS));

 // Index it with an analyzer that drops terms shorter than 5 characters.
 FSDirectory dir = FSDirectory.GetDirectory("C:\\temp\\CFSTEST");
 IndexWriter writer = new IndexWriter(dir, new MinTermLengthAnalyzer(5));
 try
 {
     writer.AddDocument(document);
 }
 finally
 {
     // Close the writer even if AddDocument throws, so it is never left
     // open on the index directory.
     writer.Close();
 }

Searching :

        // Keep the reader in a variable: a searcher constructed from an existing
        // reader does not close that reader in Close(), so it must be closed here.
        IndexReader indexReader = IndexReader.Open("C:\\temp\\CFSTEST");
        var indexSearcher = new IndexSearcher(indexReader);
        try
        {
            // Top 25 documents whose "text" field contains the term "demonstration".
            var results = indexSearcher.Search(new TermQuery(new Term("text", "demonstration")), null, 25);

            foreach (var result in results.ScoreDocs)
            {
                TermFreqVector[] vectors = indexReader.GetTermFreqVectors(result.doc);

                // Null when the document has no fields indexed with term vectors.
                if (vectors == null)
                {
                    continue;
                }

                foreach (Lucene.Net.Index.TermFreqVector vector in vectors)
                {
                    // Print every term that was actually indexed for this field.
                    foreach (String term in vector.GetTerms())
                    {
                        Console.WriteLine(term);
                    }
                }
            }
        }
        finally
        {
            // Release the searcher and the underlying reader even if the search fails.
            indexSearcher.Close();
            indexReader.Close();
        }
        // outputs:
        // demonstration
        // sample