
I've got a lot of documents from different time periods, and I need to calculate TF-IDF for the documents within one period of time. For example, here is what I am going to do:

I have one million text documents spanning one year, and I'll take the documents from one month as a corpus. I want to calculate TF-IDF for each month, i.e., compute the term frequency within that month and multiply it by the inverse document frequency computed over the remaining months.

The data (text, time, etc.) is indexed using Lucene, and I'm wondering whether Lucene can facilitate this kind of calculation. I know Lucene can give me the term frequency and the document frequency, but is there an API for restricting the time range of the calculation?
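For illustration, here is roughly the calculation I have in mind, sketched against the raw IndexReader API (the NOT_ANALYZED "Month" field and the MonthlyTfIdf helper are made up for the example, there is no smoothing or edge-case handling, and I don't know whether this is the right way to do it, hence the question):

using System;
using System.Collections.Generic;
using Lucene.Net.Index;

public static class MonthlyTfIdf {
    // Prints tf(term) inside "month" multiplied by an idf computed over the remaining months.
    // Assumes every document was indexed with a NOT_ANALYZED "Month" field such as "2012-05".
    public static void Print(IndexReader reader, Term term, String month) {
        // Collect the doc ids that belong to the month of interest.
        var inMonth = new HashSet<Int32>();
        var monthDocs = reader.TermDocs(new Term("Month", month));
        while (monthDocs.Next()) {
            inMonth.Add(monthDocs.Doc);
        }

        // Walk every document containing the term and split the counts by month.
        var termDocs = reader.TermDocs(term);
        var tfInMonth = 0;      // occurrences of the term inside the month
        var dfOutside = 0;      // documents containing the term outside the month
        while (termDocs.Next()) {
            if (inMonth.Contains(termDocs.Doc)) {
                tfInMonth += termDocs.Freq;
            } else {
                dfOutside++;
            }
        }

        var docsOutside = reader.NumDocs() - inMonth.Count;
        var idf = Math.Log((Double)docsOutside / (1 + dfOutside));
        Console.WriteLine("tf={0}  idf={1:0.0000}  tf*idf={2:0.0000}",
                          tfInMonth, idf, tfInMonth * idf);
    }
}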

Many thanks.


1 Answer


I've got a solution for this using Lucene.Net 3.0.3 and payloads. I'm not sure whether this is the best way to accomplish it with the Java version, which is currently somewhat ahead of the .NET port.

It works by assigning a payload, a custom byte array, to the terms that should be custom-scored, and by using a custom Similarity that overrides ScorePayload to parse the byte array and perform the custom filtering. (This requires a query that actually calls ScorePayload, such as a PayloadTermQuery.)

This highly contrived example scores each match based on (id % 3), so multiples of three are scored zero. You could combine this with a PositiveScoresOnlyCollector (see the sketch after the code) to ignore matches that receive a zero score.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Search.Payloads;
using Lucene.Net.Store;

public static class Program {
    public static void Main() {
        var directory = new RAMDirectory();

        // Initialization; create 50 documents with payload
        var writer = new IndexWriter(directory, new KeywordAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
        for (var i = 0; i < 50; ++i) {
            AddDocument(writer, i, "lorem ipsum etc blah blah");
        }
        writer.Commit();

        var searcher = new IndexSearcher(directory, readOnly: true);
        searcher.Similarity = new ShazaamPayloadSimilarity();

        // The term we'll be looking for. This should match all documents.
        var term = new Term("Data", "lorem");
        var query = new PayloadTermQuery(term, new MaxPayloadFunction());
        var topDocs = searcher.Search(query, 40);

        // This is a bad example of FieldCache usage; it's only here to print the ids.
        var iValues = FieldCache_Fields.DEFAULT.GetStrings(searcher.IndexReader, "Id");
        foreach (var scoreDoc in topDocs.ScoreDocs) {
            Console.WriteLine("Score: {0:0.0000}  i={1}", scoreDoc.Score, iValues[scoreDoc.Doc]);
        }

        Console.ReadLine();
    }

    public static void AddDocument(IndexWriter writer, Int32 id, String data) {
        var payload = BitConverter.GetBytes(id);
        var analyzer = new ShazaamPayloadAnalyzer(payload);
        var textReader = new StringReader(data);

        var document = new Document();
        document.Add(new Field("Id", id.ToString(), Field.Store.NO, Field.Index.NOT_ANALYZED));
        document.Add(new Field("Data", analyzer.TokenStream(null, textReader)));

        writer.AddDocument(document);
    }
}

// Analyzer that attaches the supplied byte array as a payload to every token.
public class ShazaamPayloadAnalyzer : Analyzer {
    private readonly Byte[] _value;

    public ShazaamPayloadAnalyzer(Byte[] value) {
        _value = value;
    }

    public override TokenStream TokenStream(String fieldName, TextReader reader) {
        TokenStream result = new WhitespaceTokenizer(reader);
        result = new ShazaamPayloadFilter(result, _value);
        return result;
    }
}

// TokenFilter that writes the payload onto each token via its IPayloadAttribute.
public class ShazaamPayloadFilter : TokenFilter {
    private readonly byte[] _payload;
    private readonly IPayloadAttribute _payloadAttr;

    public ShazaamPayloadFilter(TokenStream input, Byte[] payload)
        : base(input) {
        _payload = payload;
        _payloadAttr = AddAttribute<IPayloadAttribute>();
    }

    public override Boolean IncrementToken() {
        if (input.IncrementToken()) {
            _payloadAttr.Payload = new Payload(_payload);
            return true;
        }

        return false;
    }
}

// Custom Similarity that decodes the payload back into the document id and scores by it.
public class ShazaamPayloadSimilarity : DefaultSimilarity {
    public override Single ScorePayload(Int32 docId, String fieldName, Int32 start, Int32 end, Byte[] payload, Int32 offset, Int32 length) {
        // Respect the offset into the payload buffer instead of assuming it starts at zero.
        var originalValue = BitConverter.ToInt32(payload, offset);

        // Advanced logic ahead!
        return (originalValue % 3);
    }
}
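
For completeness, an untested sketch of the PositiveScoresOnlyCollector variant mentioned above; it would replace the plain searcher.Search(query, 40) call so that zero-scored matches never reach the results:

// Sketch only: wrap a TopScoreDocCollector so zero-scored matches are dropped.
var collector = TopScoreDocCollector.Create(40, true); // 40 hits, docs scored in order
searcher.Search(query, new PositiveScoresOnlyCollector(collector));
var topDocs = collector.TopDocs();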