I am working on a full-text search engine in Elasticsearch and am indexing multilingual data. I use Elasticsearch for text analysis, and I would like to be able to retrieve the tokens (the indexed terms) that each document produces after preprocessing. I know about the Analyze API, but calling it for more than 200,000 documents is very time-consuming. I found the "terms aggregation", but I am not sure how it works. Any ideas?
I used language analyzers in the mapping. Is there any out-of-the-box language detection when using language analyzers, or is every document passed through every language analyzer? If so, does it make sense to use language detection and create multi-fields for each language? What is the difference between using language analyzers in the settings and in the mappings?
PUT /index_sample
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "my_asciifolding",
            "my_apostrophe",
            "cjk_bigram"
          ]
        }
      },
      "filter": {
        "my_asciifolding": {
          "type": "asciifolding",
          "preserve_original": true
        },
        "my_apostrophe": {
          "type": "apostrophe"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "category_number": {
        "type": "integer",
        "fields": {
          "raw": {
            "type": "keyword"
          }
        }
      },
      "product": {
        "type": "text",
        "index": true,
        "store": true,
        "analyzer": "my_analyzer",
        "fields": {
          "german_field": {
            "type": "text",
            "analyzer": "german"
          },
          "english_field": {
            "type": "text",
            "analyzer": "english"
          },
          "chinese_field": {
            "type": "text",
            "analyzer": "smartcn"
          },
          "spanish_field": {
            "type": "text",
            "analyzer": "spanish"
          },
          "czech_analyer": {
            "type": "text",
            "analyzer": "czech"
          },
          "french_field": {
            "type": "text",
            "analyzer": "french"
          },
          "italian_field": {
            "type": "text",
            "analyzer": "italian"
          },
          "dutch_field": {
            "type": "text",
            "analyzer": "dutch"
          },
          "portuguese_field": {
            "type": "text",
            "analyzer": "portuguese"
          }
        }
      }
    }
  }
}