I'm very confused by some Lucene.NET behavior I'm observing. I assume the same is true in Java's Lucene, but have not verified. Here's a test to demonstrate:
// Repro: index one document, search for it twice, replace it (same id) with a
// slightly different document, search a third time, then compare the three scores.
[Fact]
public void repro()
{
    var directory = new RAMDirectory();
    var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
    float firstScore, secondScore, thirdScore;

    // Builds a document with the fixed id "abc" and the given text in the analyzed "field".
    System.Func<string, Document> buildDocument = text =>
    {
        var built = new Document();
        built.Add(new Field("id", "abc", Field.Store.YES, Field.Index.NOT_ANALYZED));
        built.Add(new Field("field", text, Field.Store.NO, Field.Index.ANALYZED));
        return built;
    };

    using (var writer = new IndexWriter(directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
    {
        var originalDocument = buildDocument("some text in the field");
        writer.UpdateDocument(new Term("id", "abc"), originalDocument, analyzer);

        // the more times I call UpdateDocument here, the higher the score is for the subsequent hit
        // writer.UpdateDocument(new Term("id", "abc"), originalDocument, analyzer);

        writer.Commit();

        var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, "field", analyzer);
        var query = parser.Parse("some text in the field");

        // Opens a fresh read-only searcher, checks that exactly one document matches,
        // and returns the score of that single hit.
        System.Func<float> scoreOfOnlyHit = () =>
        {
            using (var searcher = new IndexSearcher(directory, readOnly: true))
            {
                var results = searcher.Search(query, 10);
                Assert.Equal(1, results.TotalHits);
                return results.ScoreDocs[0].Score;
            }
        };

        firstScore = scoreOfOnlyHit();
        secondScore = scoreOfOnlyHit();

        var changedDocument = buildDocument("some changed text in the field");

        // if I call DeleteAll here, then score three is the same as score one and two (which is probably fine, though not quite what I expected either)
        // writer.DeleteAll();

        writer.UpdateDocument(new Term("id", "abc"), changedDocument, analyzer);
        writer.Commit();

        thirdScore = scoreOfOnlyHit();
    }

    // this is fine
    Assert.Equal(firstScore, secondScore);

    // this is not
    Assert.True(thirdScore < secondScore);
}
The steps are:
- Add a document to the index with "some text in the field" as its indexed text.
- Search for "some text in the field" twice, recording the scores as `firstScore` and `secondScore`
- Update the document so that the indexed text is now "some changed text in the field"
- Search for "some text in the field" again, recording the score as
thirdScore
- Assert that the first and second scores are equal, and that the third score is less than the first and second
The really weird thing is that `thirdScore` is greater than `firstScore` and `secondScore`. Here's what I've found:
- the more times I call `UpdateDocument` on the index with the same document, the higher the score will become
- completely deleting the index before performing the third search yields a score equal to the first and second scores. I was expecting a little bit less because of the extra word in the indexed text ("changed"), but even having the scores equal would suffice
- avoiding `UpdateDocument` and instead manually deleting and re-adding the document makes no difference
- calling `WaitForMerges` on the index after committing makes no difference
Can anyone explain this behavior to me? Why would the scores change over subsequent updates to the document when neither the document content nor the query is changing?