0
votes

I am totally new to Elasticsearch, so please forgive me if this is a stupid question; it may already have been answered somewhere else, but I couldn't find it. I want to use Elasticsearch as a search engine for the PDF and DOCX files in my network. I used FSCrawler to ingest the PDFs into Elasticsearch. Since the documents I want to ingest are in several languages, I wanted to use n-grams for stemming. To do so, I tried to update my mapping like this:

PUT test/_mappings/_all
{
"mappings": {
    "title": {
      "properties": {
        "title": {
          "type": "text",
          "fields": {
            "de": {
              "type":     "string",
              "analyzer": "german"
            },
            "en": {
              "type":     "string",
              "analyzer": "english"
            },
             "general": { 
              "type":     "string",
              "analyzer": "trigrams"
          }
        }
      }
    }
  }
}
}

And now I get this error message:

{ "error": { "root_cause": [ { "type": "mapper_parsing_exception", "reason": "Root mapping definition has unsupported parameters: [mappings : {title={properties={title={type=text, fields={de={type=string, analyzer=german}, en={type=string, analyzer=english}, general={type=string, analyzer=trigrams}}}}}}]" } ], "type": "mapper_parsing_exception", "reason": "Root mapping definition has unsupported parameters: [mappings : {title={properties={title={type=text, fields={de={type=string, analyzer=german}, en={type=string, analyzer=english}, general={type=string, analyzer=trigrams}}}}}}]"
}, "status": 400 }

Do you have any idea how I can fix this? Or do you have an idea how I can ingest the files with the right mapping without using FSCrawler?

2
Looks like trigrams is not a built-in analyzer in ES; follow this guide, elastic.co/guide/en/elasticsearch/guide/current/…, to define trigrams and then create the mapping. – user156327
I actually followed exactly this guide. – nu11ahnung
Can you paste the output of the _mapping API? Refer to elastic.co/guide/en/elasticsearch/reference/current/… on how to use it. – user156327
Do you think there are better options than FSCrawler to ingest the documents? – nu11ahnung
I think it has nothing to do with FSCrawler. – user156327

2 Answers

0
votes

These are my settings:

    {
  "test": {
    "settings": {
      "index": {
        "mapping": {
          "total_fields": {
            "limit": "2000"
          }
        },
        "number_of_shards": "5",
        "provided_name": "test",
        "creation_date": "1542031632596",
        "analysis": {
          "filter": {
            "trigrams_filter": {
              "type": "ngram",
              "min_gram": "3",
              "max_gram": "3"
            }
          },
          "analyzer": {
            "fscrawler_path": {
              "tokenizer": "fscrawler_path"
            },
            "trigrams": {
              "filter": [
                "lowercase",
                "trigrams_filter"
              ],
              "type": "custom",
              "tokenizer": "standard"
            }
          },
          "tokenizer": {
            "fscrawler_path": {
              "type": "path_hierarchy"
            }
          }
        },
        "number_of_replicas": "1",
        "uuid": "7L3QE5_xRACECVbTFlFY-Q",
        "version": {
          "created": "6040399"
        }
      }
    }
  }
}
0
votes

My mapping

{
      "test": {
        "mappings": {
          "_doc": {
            "dynamic_templates": [
              {
                "raw_as_text": {
                  "path_match": "meta.raw.*",
                  "mapping": {
                    "fields": {
                      "keyword": {
                        "ignore_above": 256,
                        "type": "keyword"
                      }
                    },
                    "type": "text"
                  }
                }
              }
            ],
            "properties": {
              "attachment": {
                "type": "binary"
              },
              "attributes": {
                "properties": {
                  "group": {
                    "type": "keyword"
                  },
                  "owner": {
                    "type": "keyword"
                  }
                }
              },
              "content": {
                "type": "text"
              },
              "file": {
                "properties": {
                  "checksum": {
                    "type": "keyword"
                  },
                  "content_type": {
                    "type": "keyword"
                  },
                  "created": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "extension": {
                    "type": "keyword"
                  },
                  "filename": {
                    "type": "keyword",
                    "store": true
                  },
                  "filesize": {
                    "type": "long"
                  },
                  "indexed_chars": {
                    "type": "long"
                  },
                  "indexing_date": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "last_accessed": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "last_modified": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "url": {
                    "type": "keyword",
                    "index": false
                  }
                }
              },
              "meta": {
                "properties": {
                  "altitude": {
                    "type": "text"
                  },
                  "author": {
                    "type": "text"
                  },
                  "comments": {
                    "type": "text"
                  },
                  "contributor": {
                    "type": "text"
                  },
                  "coverage": {
                    "type": "text"
                  },
                  "created": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "creator_tool": {
                    "type": "keyword"
                  },
                  "date": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "description": {
                    "type": "text"
                  },
                  "format": {
                    "type": "text"
                  },
                  "identifier": {
                    "type": "text"
                  },
                  "keywords": {
                    "type": "text"
                  },
                  "language": {
                    "type": "keyword"
                  },
                  "latitude": {
                    "type": "text"
                  },
                  "longitude": {
                    "type": "text"
                  },
                  "metadata_date": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "modifier": {
                    "type": "text"
                  },
                  "print_date": {
                    "type": "date",
                    "format": "dateOptionalTime"
                  },
                  "publisher": {
                    "type": "text"
                  },
                  "rating": {
                    "type": "byte"
                  },
                  "relation": {
                    "type": "text"
                  },
                  "rights": {
                    "type": "text"
                  },
                  "source": {
                    "type": "text"
                  },
                  "title": {
                    "type": "text"
                  },
                  "type": {
                    "type": "text"
                  }
                }
              },
              "path": {
                "properties": {
                  "real": {
                    "type": "keyword",
                    "fields": {
                      "fulltext": {
                        "type": "text"
                      },
                      "tree": {
                        "type": "text",
                        "analyzer": "fscrawler_path",
                        "fielddata": true
                      }
                    }
                  },
                  "root": {
                    "type": "keyword"
                  },
                  "virtual": {
                    "type": "keyword",
                    "fields": {
                      "fulltext": {
                        "type": "text"
                      },
                      "tree": {
                        "type": "text",
                        "analyzer": "fscrawler_path",
                        "fielddata": true
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }