1
votes

I am trying to understand whether a legacy app that generates a Compass 2.2 index stores the contents of fields or not. I can open the index with Luke.Net, and from my understanding it is not storing fields; it just returns an id, presumably to be used elsewhere to select from a database.

See this for Lucene: Lucene Field.Store.YES versus Field.Store.NO

How can I tell whether this Compass application indexes with the equivalent of Lucene.Net's Field.Store.NO? This is the compass.cfg.xml:

<compass-core-config
    xmlns="http://www.opensymphony.com/compass/schema/core-config"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://www.opensymphony.com/compass/schema/core-config
           http://www.opensymphony.com/compass/schema/compass-core-config.xsd">

    <compass name="default">
        <connection>
            <!--  index path from a file dataUpdate.properties -->
            <file path="/" />
        </connection>

        <searchEngine>
            <!--
                The settings of this analyzer follow the key convention parsed by the
                PerFieldAnalyzer class:
                  PerField-FIELD                  fully qualified analyzer class used for FIELD
                  PerFieldConfig-stopwords-FIELD  "no:"      = analyze FIELD with an empty stop word list
                                                  "yes:a,b"  = analyze FIELD with stop words a and b
                                                  (absent)   = analyzer is built with its no-argument constructor
                NOTE(review): analyzerClass names package myclass.beans.search; confirm it is the
                same PerFieldAnalyzer as the one in myclass.tserver.beans.search.
                NOTE(review): none of these settings appears to control whether a field value is
                stored in the index; that is presumably decided by the SearchIndex mapping. Confirm.
            -->
            <analyzer name="default" type="CustomAnalyzer" analyzerClass="myclass.beans.search.PerFieldAnalyzer" >
                <!--  example :
                <setting name="PerField-fieldname" value="org.apache.lucene.analysis.standard.StandardAnalyzer" />
                <setting name="PerFieldConfig-stopwords-fieldname" value="no:" />
                <setting name="PerFieldConfig-stopwords-fieldname" value="yes:aa,bb" />
                -->
                <!-- Fields analyzed with StandardAnalyzer and an empty stop word list ("no:"). -->
                <setting name="PerField-symbol" value="org.apache.lucene.analysis.standard.StandardAnalyzer" />
                <setting name="PerFieldConfig-stopwords-symbol" value="no:" />
                <setting name="PerField-isin" value="org.apache.lucene.analysis.standard.StandardAnalyzer" />
                <setting name="PerFieldConfig-stopwords-isin" value="no:" />
                <!-- Fields analyzed with KeywordAnalyzer; no stop word setting is given for them. -->
                <setting name="PerField-tipo_opzione" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-settore_cod" value="org.apache.lucene.analysis.KeywordAnalyzer" />
                <setting name="PerField-trend_medio" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-trend_breve" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-trend_lungo" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-tipo_sts_cod" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-valuta" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-sottotipo_tit" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-tabella_rt" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-market" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-cod_segmento" value="org.apache.lucene.analysis.KeywordAnalyzer"/>
                <setting name="PerField-tipo_tit" value="org.apache.lucene.analysis.KeywordAnalyzer"/>      
                <setting name="PerField-radiocor" value="org.apache.lucene.analysis.standard.StandardAnalyzer" />
                <setting name="PerFieldConfig-stopwords-radiocor" value="no:" />
            </analyzer>
        </searchEngine>
        
        <!-- The only mapped class; its mapping definition is not part of this file. -->
        <mappings>
            <class name="myclass.tserver.beans.search.SearchIndex" />
        </mappings>
        <settings>
            <!-- NOTE(review): the unit of this timeout is not stated here; check the Compass settings reference. -->
            <setting name="compass.transaction.lockTimeout" value="180" />
        </settings>
        
    </compass>
</compass-core-config>

Does value="no:" mean that the value is not stored, or that it is not to be treated as a "stopword"? And does, for example, value="org.apache.lucene.analysis.standard.StandardAnalyzer" mean that the field is stored?

this is the analyzer it seems to use :

package myclass.tserver.beans.search;

import myclass.tserver.ejb.StubWrapper;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.Arrays;
import java.util.Collections;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.compass.core.CompassException;
import org.compass.core.config.CompassConfigurable;
import org.compass.core.config.CompassSettings;

/**
 * A {@link PerFieldAnalyzerWrapper} whose per-field analyzers are configured from Compass settings.
 *
 * <p>Two kinds of settings are read:
 * <ul>
 *   <li>{@code PerField-<field>}: fully qualified class name of the analyzer to register for
 *       {@code <field>};</li>
 *   <li>{@code PerFieldConfig-stopwords-<field>}: optional stop word configuration for that analyzer.
 *       A value starting with {@code no:} means an empty stop word list, a value starting with
 *       {@code yes:} is followed by a comma-separated stop word list. When the setting is absent (or
 *       has neither prefix) the analyzer is created with its no-argument constructor.</li>
 * </ul>
 *
 * <p>These settings only choose how a field is analyzed; nothing in this class decides whether a
 * field value is stored in the index.
 */
public class PerFieldAnalyzer extends PerFieldAnalyzerWrapper implements CompassConfigurable {

    /** Prefix of the setting keys that bind a field name to an analyzer class. */
    private static final String FIELD_PREFIX = "PerField-";

    /** Prefix of the setting keys that carry extra per-field configuration. */
    private static final String FIELD_CONFIG_PREFIX = "PerFieldConfig-";
    /** Sub-prefix (after {@link #FIELD_CONFIG_PREFIX}) of the stop word configuration keys. */
    private static final String STOP_WORDS_PREFIX = "stopwords-";
    /** Not referenced anywhere in this class; kept as declared. */
    private static final String NO_STOP_WORDS_PREFIX = "no-stopwords-";

    /** Value prefix requesting an empty stop word list. */
    private static final String NO_STOP_WORDS_MARKER = "no:";
    /** Value prefix introducing an explicit, comma-separated stop word list. */
    private static final String STOP_WORDS_MARKER = "yes:";

    /**
     * Creates the wrapper with a {@link StandardAnalyzer} passed to the superclass as the default
     * analyzer.
     */
    public PerFieldAnalyzer() {
        super(new StandardAnalyzer());
    }

    /**
     * Registers one analyzer for every {@code PerField-<field>} setting.
     *
     * @param settings the Compass settings of this analyzer
     * @throws CompassException if an analyzer class cannot be loaded or instantiated; the original
     *         failure is attached as the cause
     */
    public void configure(CompassSettings settings) throws CompassException {
        for (Object key : settings.getProperties().keySet()) {
            if (!(key instanceof String)) {
                continue;
            }
            String settingName = (String) key;
            if (!settingName.startsWith(FIELD_PREFIX)) {
                continue;
            }
            String field = settingName.substring(FIELD_PREFIX.length());
            String analyzerClassName = settings.getSetting(settingName);
            if (analyzerClassName == null) {
                continue;
            }

            String[] stopwords = parseStopwords(
                    settings.getSetting(FIELD_CONFIG_PREFIX + STOP_WORDS_PREFIX + field));
            try {
                addAnalyzer(field, getAnalyzer(analyzerClassName, stopwords));
            } catch (Exception e) {
                // The exception used to be constructed but never thrown, which silently left the
                // field on the default analyzer whenever its configuration was wrong.
                throw new CompassException("Unable to set analyzer for field " + field + " : ", e);
            }
        }
    }

    /**
     * Translates a stop word setting value into the array handed to the analyzer constructor.
     *
     * @param stopwordsParameter raw setting value, may be {@code null}
     * @return an empty array for {@code no:}, the listed words for {@code yes:...}, or {@code null}
     *         when the analyzer should be built with its no-argument constructor (and therefore its
     *         own default stop words)
     */
    private static String[] parseStopwords(String stopwordsParameter) {
        if (stopwordsParameter == null) {
            // No configuration: use the analyzer's default stop words.
            return null;
        }
        String trimmed = stopwordsParameter.trim();
        // Case-insensitive prefix checks that do not depend on the default locale.
        if (trimmed.regionMatches(true, 0, NO_STOP_WORDS_MARKER, 0, NO_STOP_WORDS_MARKER.length())) {
            return new String[] {};
        }
        if (trimmed.regionMatches(true, 0, STOP_WORDS_MARKER, 0, STOP_WORDS_MARKER.length())) {
            return trimmed.substring(STOP_WORDS_MARKER.length()).split(",");
        }
        return null;
    }

    /**
     * Instantiates an analyzer reflectively.
     *
     * @param classname fully qualified name of a class extending {@link Analyzer}
     * @param stopwords stop words passed to the {@code (String[])} constructor, or {@code null} to
     *        use the no-argument constructor
     * @return the new analyzer instance
     * @throws ClassCastException if {@code classname} does not name an {@link Analyzer} subclass
     */
    private Analyzer getAnalyzer(String classname, String[] stopwords) throws ClassNotFoundException, SecurityException,
            NoSuchMethodException, IllegalArgumentException, InstantiationException, IllegalAccessException,
            InvocationTargetException {
        // asSubclass performs a checked narrowing instead of an unchecked generic cast.
        Class<? extends Analyzer> analyzerClass = Class.forName(classname).asSubclass(Analyzer.class);
        if (stopwords == null) {
            return analyzerClass.getConstructor().newInstance();
        }
        Constructor<? extends Analyzer> constructor = analyzerClass.getConstructor(String[].class);
        // Cast to Object so the array is passed as a single argument rather than spread as varargs.
        return constructor.newInstance((Object) stopwords);
    }

}
1

1 Answers

1
votes

The easiest way to know which fields are stored for a lucene document is to open the index via lucene and to read in a document and then look at the list of fields for the document. Fields that are indexed but not stored will not show up in the list of the fields for the document.

Here is an example in Lucene.Net 4.8 that I wrote for you that hopefully can give you a good idea of how to check which fields are stored for a document. The syntax for you will of course be a bit different if you are using Java rather than C# and you will be using an older version of Lucene. But this chunk of code should hopefully get you a long way there.

In this example there are two documents added each with three fields. But only two of the three fields are stored, even though all three fields are indexed. I have placed a comment in the code where you can see which fields are stored for each document. In this example only two fields for each document will be in the d.Fields list because only two fields are stored.

 [Fact]
        public void StoreFieldsList() {
            // Demonstrates that only fields added with Field.Store.YES come back when a document
            // is loaded from the index; fields added with Field.Store.NO are searchable but are
            // not part of the loaded document's field list.

            Directory indexDir = new RAMDirectory();

            Analyzer standardAnalyzer = new StandardAnalyzer(LuceneVersion.LUCENE_48);

            IndexWriterConfig indexConfig = new IndexWriterConfig(LuceneVersion.LUCENE_48, standardAnalyzer);

            // Dispose the writer and the reader deterministically instead of leaking them.
            using (IndexWriter writer = new IndexWriter(indexDir, indexConfig)) {

                Document doc = new Document();
                doc.Add(new StringField("examplePrimaryKey", "001", Field.Store.YES));
                doc.Add(new TextField("exampleField", "Unique gifts are great gifts.", Field.Store.YES));
                doc.Add(new TextField("notStoredField", "Some text to index only.", Field.Store.NO));
                writer.AddDocument(doc);

                // Each document is added exactly once (the second one used to be added twice).
                doc = new Document();
                doc.Add(new StringField("examplePrimaryKey", "002", Field.Store.YES));
                doc.Add(new TextField("exampleField", "Everyone is gifted.", Field.Store.YES));
                doc.Add(new TextField("notStoredField", "Some text to index only. Two.", Field.Store.NO));
                writer.AddDocument(doc);

                writer.Commit();

                using (DirectoryReader reader = writer.GetReader(applyAllDeletes: true)) {

                    // Nothing is deleted in this test, so ids 0..NumDocs-1 cover every document.
                    for (int i = 0; i < reader.NumDocs; i++) {
                        Document d = reader.Document(i);

                        for (int j = 0; j < d.Fields.Count; j++) {
                            IIndexableField field = d.Fields[j];

                            string fieldName = field.Name;                  //<--This field is a stored field for this document.

                        }

                    }
                }
            }

        }