1
votes

I'm trying to implement an auto-suggest control powered by an ES index. The index has multiple fields (Multi-language - Arabic and English) and I want to be able to search in all languages.

The easiest way to do that is NGram with the "_all" field, as long as some care is taken in the mapping definition. The issue we have now is how to accomplish this for multiple languages.

PS: We are looking to use a separate field for each of the possible languages (using one index).

I tried to use the nGram tokenizer and filter, and it works well for one language (English).

{
  "template": "index_com",
  "settings": {
    "number_of_shards": 5,
    "number_of_replicas": 1,
    "analysis": {
      "filter": {
        "edgeNGram_filter": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 20
        }
      },
      "analyzer": {
        "edgeNGram_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "asciifolding",
            "edgeNGram_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "product": {
      "_all": {
        "enabled": true,
        "index_analyzer": "edgeNGram_analyzer",
        "search_analyzer": "standard"
      },
      "properties": {
        "id": {
          "type": "string",
          "index": "no",
          "include_in_all": false
        },
        "uuid": {
          "type": "string",
          "index": "no",
          "include_in_all": false
        },
        "name": {
          "type": "string",
          "include_in_all": true
        },
        "description": {
          "type": "string",
          "include_in_all": true
        },
        "brand": {
          "type": "string",
          "include_in_all": true
        },
        "made_id": {
          "type": "string",
          "include_in_all": true
        },
        "category": {
          "type": "string",
          "include_in_all": true
        },
        "category_id": {
          "type": "integer",
          "include_in_all": false
        },
        "keywords": {
          "type": "string",
          "include_in_all": true
        },
        "colors": {
          "type": "string",
          "index": "not_analyzed"
        },
        "colors_name": {
          "type": "string",
          "include_in_all": true
        },
        "quality": {
          "type": "string",
          "index": "not_analyzed"
        },
        "vendor_name": {
          "type": "string",
          "include_in_all": false
        },
        "vendor_location": {
          "type": "geo_point",
          "include_in_all": false
        },
        "price": {
          "type": "double",
          "include_in_all": false
        },
        "price_before_discount": {
          "type": "double",
          "include_in_all": false
        },
        "is_deal": {
          "type": "integer",
          "include_in_all": false
        },
        "is_best_seller": {
          "type": "integer",
          "include_in_all": false
        },
        "views": {
          "type": "integer",
          "include_in_all": false
        },
        "rating": {
          "type": "integer",
          "include_in_all": false
        },
        "updated_at": {
          "type": "date",
          "format": "dateOptionalTime"
        },
        "created_at": {
          "type": "date",
          "format": "dateOptionalTime"
        },
        "image_link": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}

Arabic analyzer:

{
  "settings": {
    "analysis": {
      "filter": {
        "arabic_stop": {
          "type": "stop",
          "stopwords": "_arabic_"
        },
        "arabic_keywords": {
          "type": "keyword_marker",
          "keywords": []
        },
        "arabic_stemmer": {
          "type": "stemmer",
          "language": "arabic"
        }
      },
      "analyzer": {
        "arabic": {
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "arabic_stop",
            "arabic_normalization",
            "arabic_keywords",
            "arabic_stemmer"
          ]
        }
      }
    }
  }
}

Can someone suggest a solution? Thanks!

1
Did my solution work for you? – Mario Trucco

1 Answer

0
votes

Your second snippet defines the Arabic analyzer, which is already available, so you shouldn't need to add it.

What you are missing is telling Elasticsearch to also use the Arabic analyzer. You want to analyze each field twice: once in English and once in Arabic. To do that, add

"fields": {
  "ar": {
    "type": "string",
    "analyzer": "arabic"
  },
  "en": {
    "type": "string",
    "analyzer": "english"
  }
}

to all your fields that have "include_in_all": true. That makes your mappings look like this:

{
  "template": "index_com",
  "settings": {
    "number_of_shards": 5,
    "number_of_replicas": 1,
    "analysis": {
      "filter": {
        "edgeNGram_filter": {
          "type": "edgeNGram",
          "min_gram": 2,
          "max_gram": 20
        }
      },
      "analyzer": {
        "edgeNGram_analyzer": {
          "type": "custom",
          "tokenizer": "whitespace",
          "filter": [
            "lowercase",
            "asciifolding",
            "edgeNGram_filter"
          ]
        }
      }
    }
  },
  "mappings": {
    "product": {
      "_all": {
        "enabled": true,
        "index_analyzer": "edgeNGram_analyzer",
        "search_analyzer": "standard"
      },
      "properties": {
        "id": {
          "type": "string",
          "index": "no",
          "include_in_all": false
        },
        "uuid": {
          "type": "string",
          "index": "no",
          "include_in_all": false
        },
        "name": {
          "type": "string",
          "include_in_all": true,
          "fields": {
            "ar": {
              "type": "string",
              "analyzer": "arabic"
            },
            "en": {
              "type": "string",
              "analyzer": "english"
            }
          }
        },
        "description": {
          "type": "string",
          "include_in_all": true,
          "fields": {
            "ar": {
              "type": "string",
              "analyzer": "arabic"
            },
            "en": {
              "type": "string",
              "analyzer": "english"
            }
          }
        },
        "brand": {
          "type": "string",
          "include_in_all": true,
          "fields": {
            "ar": {
              "type": "string",
              "analyzer": "arabic"
            },
            "en": {
              "type": "string",
              "analyzer": "english"
            }
          }
        },
        "made_id": {
          "type": "string",
          "include_in_all": true,
          "fields": {
            "ar": {
              "type": "string",
              "analyzer": "arabic"
            },
            "en": {
              "type": "string",
              "analyzer": "english"
            }
          }
        },
        "category": {
          "type": "string",
          "include_in_all": true,
          "fields": {
            "ar": {
              "type": "string",
              "analyzer": "arabic"
            },
            "en": {
              "type": "string",
              "analyzer": "english"
            }
          }
        },
        "category_id": {
          "type": "integer",
          "include_in_all": false
        },
        "keywords": {
          "type": "string",
          "include_in_all": true,
          "fields": {
            "ar": {
              "type": "string",
              "analyzer": "arabic"
            },
            "en": {
              "type": "string",
              "analyzer": "english"
            }
          }
        },
        "colors": {
          "type": "string",
          "index": "not_analyzed"
        },
        "colors_name": {
          "type": "string",
          "include_in_all": true,
          "fields": {
            "ar": {
              "type": "string",
              "analyzer": "arabic"
            },
            "en": {
              "type": "string",
              "analyzer": "english"
            }
          }
        },
        "quality": {
          "type": "string",
          "index": "not_analyzed"
        },
        "vendor_name": {
          "type": "string",
          "include_in_all": false
        },
        "vendor_location": {
          "type": "geo_point",
          "include_in_all": false
        },
        "price": {
          "type": "double",
          "include_in_all": false
        },
        "price_before_discount": {
          "type": "double",
          "include_in_all": false
        },
        "is_deal": {
          "type": "integer",
          "include_in_all": false
        },
        "is_best_seller": {
          "type": "integer",
          "include_in_all": false
        },
        "views": {
          "type": "integer",
          "include_in_all": false
        },
        "rating": {
          "type": "integer",
          "include_in_all": false
        },
        "updated_at": {
          "type": "date",
          "format": "dateOptionalTime"
        },
        "created_at": {
          "type": "date",
          "format": "dateOptionalTime"
        },
        "image_link": {
          "type": "string",
          "index": "not_analyzed"
        }
      }
    }
  }
}