I want to index pdf attachment using Tire gem as client for ElasticSearch. In my mapping, I exclude the attachment field from _source, so that the attachment is not stored in the index and not returned in the search results:
mapping :_source => { :excludes => ['attachment_original'] } do
indexes :id, :type => 'integer'
indexes :folder_id, :type => 'integer'
indexes :attachment_file_name
indexes :attachment_updated_at, :type => 'date'
indexes :attachment_original, :type => 'attachment'
end
I can still see the attachment content included in the search results when I run the following curl command:
curl -X POST "http://localhost:9200/user_files/user_file/_search?pretty=true" -d '{
"query": {
"query_string": {
"query": "rspec"
}
}
}'
I have posted my question in this thread:
But I have just noticed that not only the attachment is included in the search results, but all other fields, including the ones that are not mapped, are also included as you can see here:
{
"took": 20,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.025427073,
"hits": [
{
"_index": "user_files",
"_type": "user_file",
"_id": "5",
"_score": 0.025427073,
"_source": {
"user_file": {
"id": 5,
"folder_id": 1,
"updated_at": "2012-08-16T11:32:41Z",
"attachment_file_size": 179895,
"attachment_updated_at": "2012-08-16T11:32:41Z",
"attachment_file_name": "hw4.pdf",
"attachment_content_type": "application/pdf",
"created_at": "2012-08-16T11:32:41Z",
"attachment_original": "JVBERi0xLjQKJeLjz9MKNyA"
}
}
}
]
}
}
attachment_file_size
and attachment_content_type
are not defined in the mapping, but are returned in the search results:
{
"id": 5,
"folder_id": 1,
"updated_at": "2012-08-16T11:32:41Z",
"attachment_file_size": 179895, <---------------------
"attachment_updated_at": "2012-08-16T11:32:41Z",
"attachment_file_name": "hw4.pdf", <------------------
"attachment_content_type": "application/pdf",
"created_at": "2012-08-16T11:32:41Z",
"attachment_original": "JVBERi0xLjQKJeLjz9MKNyA"
}
Here's my full implementation:
include Tire::Model::Search
include Tire::Model::Callbacks
def self.search(folder, params)
tire.search() do
query { string params[:query], default_operator: "AND"} if params[:query].present?
#filter :term, folder_id: folder.id
#highlight :attachment_original, :options => {:tag => "<em>"}
raise to_curl
end
end
mapping :_source => { :excludes => ['attachment_original'] } do
indexes :id, :type => 'integer'
indexes :folder_id, :type => 'integer'
indexes :attachment_file_name
indexes :attachment_updated_at, :type => 'date'
indexes :attachment_original, :type => 'attachment'
end
def to_indexed_json
to_json(:methods => [:attachment_original])
end
def attachment_original
if attachment_file_name.present?
path_to_original = attachment.path
Base64.encode64(open(path_to_original) { |f| f.read })
end
end
Could somebody help me figure out why all the fields are included in the _source
?
Edit: This is the output of running localhost:9200/user_files/_mapping
{
"user_files": {
"user_file": {
"_source": {
"excludes": [
"attachment_original"
]
},
"properties": {
"attachment_content_type": {
"type": "string"
},
"attachment_file_name": {
"type": "string"
},
"attachment_file_size": {
"type": "long"
},
"attachment_original": {
"type": "attachment",
"path": "full",
"fields": {
"attachment_original": {
"type": "string"
},
"author": {
"type": "string"
},
"title": {
"type": "string"
},
"name": {
"type": "string"
},
"date": {
"type": "date",
"format": "dateOptionalTime"
},
"keywords": {
"type": "string"
},
"content_type": {
"type": "string"
}
}
},
"attachment_updated_at": {
"type": "date",
"format": "dateOptionalTime"
},
"created_at": {
"type": "date",
"format": "dateOptionalTime"
},
"folder_id": {
"type": "integer"
},
"id": {
"type": "integer"
},
"updated_at": {
"type": "date",
"format": "dateOptionalTime"
}
}
}
}
}
As you can see, for some reason all the fields are included in the mapping!