
We ran into an issue where one of our Elasticsearch data nodes died completely in AWS. Even with a second node, we had trouble getting the cluster back online. It initially came back up in a yellow state, and we could retrieve data from it but not write to it.
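
For reference, the yellow state and any unallocated shards can be confirmed with the standard cluster APIs; a minimal sketch, assuming a node answering HTTP on localhost:9200:

curl -s 'http://localhost:9200/_cluster/health?pretty'              # "status" : "yellow" = all primaries allocated, some replicas not
curl -s 'http://localhost:9200/_cat/shards?v' | grep -i unassigned  # list any shards with no assigned node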

Elasticsearch Error

[2016-09-12 13:16:42,312][WARN ][discovery.zen.ping.unicast] [witness.domain.net] failed to send ping to [{#cloud-i-cb397257-0}{1.1.1.1}{1.1.1.1:9300}] RemoteTransportException[[datanode.domain.net][1.1.1.1:9300][internal:discovery/zen/unicast]]; nested: IllegalStateException[received ping request while not started]; Caused by: java.lang.IllegalStateException: received ping request while not started
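
The RemoteTransportException wrapping means the IllegalStateException was raised by the node being pinged (datanode.domain.net), i.e. it received a discovery ping while its own discovery service was not started (typically mid-startup or mid-shutdown). Comparing the nodes that have actually joined against the discovery settings in effect can narrow this down; a sketch, assuming default ports and the usual package config path:

curl -s 'http://localhost:9200/_cat/nodes?v'                       # nodes currently in the cluster
grep -E '^(discovery|cloud)' /etc/elasticsearch/elasticsearch.yml  # discovery/cloud settings in effect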

Logstash Error

{:timestamp=>"2016-09-12T15:48:15.898000-0400", :message=>#900592725, :events_consumed=>900732955, :worker_count=>4, :inflight_count=>0,
 :worker_states=>[{:status=>"sleep", :alive=>true, :index=>0, :inflight_count=>0}, {:status=>"sleep", :alive=>true, :index=>1, :inflight_count=>0}, {:status=>"sleep", :alive=>true, :index=>2, :inflight_count=>0}, {:status=>"sleep", :alive=>true, :index=>3, :inflight_count=>0}],
 :output_info=>[{:type=>"gelf", :config=>{"host"=>"127.0.0.1", "ALLOW_ENV"=>false}, :is_multi_worker=>false, :events_received=>900592725, :workers=>"127.0.0.1", codec=>"UTF-8">, workers=>1, port=>12201, chunksize=>1420, sender=>"%{host}", level=>["%{severity}", "INFO"], ship_metadata=>true, ship_tags=>true, ignore_metadata=>["@timestamp", "@version", "severity", "host", "source_host", "source_path", "short_message"], full_message=>"%{message}", short_message=>"short_message">]>, :busy_workers=>0}],
 :thread_info=>[
  {"thread_id"=>17, "name"=>"[main]<tcp", "plugin"=>nil, "backtrace"=>["[...]/vendor/bundle/jruby/1.9/gems/jruby-openssl-0.9.13-java/lib/jopenssl19/openssl/ssl-internal.rb:106:in `accept'", "[...]/vendor/bundle/jruby/1.9/gems/jruby-openssl-0.9.13-java/lib/jopenssl19/openssl/ssl-internal.rb:106:in `accept'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-input-tcp-3.0.6/lib/logstash/inputs/tcp.rb:112:in `run_server'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-input-tcp-3.0.6/lib/logstash/inputs/tcp.rb:84:in `run'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:342:in `inputworker'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:336:in `start_input'"], "blocked_on"=>nil, "status"=>"sleep", "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/jruby-openssl-0.9.13-java/lib/jopenssl19/openssl/ssl-internal.rb:106:in `accept'"},
  {"thread_id"=>18, "name"=>"[main]>worker0", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "backtrace"=>["[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `inflight_batches_synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:234:in `worker_loop'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:201:in `start_workers'"], "blocked_on"=>nil, "status"=>"sleep", "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"},
  {"thread_id"=>19, "name"=>"[main]>worker1", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "backtrace"=>["[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `inflight_batches_synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:234:in `worker_loop'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:201:in `start_workers'"], "blocked_on"=>nil, "status"=>"sleep", "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"},
  {"thread_id"=>20, "name"=>"[main]>worker2", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "backtrace"=>["[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `inflight_batches_synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:234:in `worker_loop'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:201:in `start_workers'"], "blocked_on"=>nil, "status"=>"sleep", "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"},
  {"thread_id"=>21, "name"=>"[main]>worker3", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "backtrace"=>["[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `inflight_batches_synchronize'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:234:in `worker_loop'", "[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:201:in `start_workers'"], "blocked_on"=>nil, "status"=>"sleep", "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"}],
 :stalling_threads_info=>[
  {"thread_id"=>17, "name"=>"[main]<tcp", "plugin"=>nil, "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/jruby-openssl-0.9.13-java/lib/jopenssl19/openssl/ssl-internal.rb:106:in `accept'"},
  {"thread_id"=>18, "name"=>"[main]>worker0", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"},
  {"thread_id"=>19, "name"=>"[main]>worker1", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"},
  {"thread_id"=>20, "name"=>"[main]>worker2", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"},
  {"thread_id"=>21, "name"=>"[main]>worker3", "plugin"=>["LogStash::Filters::Grok", {"patterns_dir"=>"/opt/logstash/vendor/bundle/jruby/1.9/gems/logstash-patterns-core-2.0.5/patterns/", "match"=>{"message"=>"%{SYSLOGBASE2}"}}], "current_call"=>"[...]/vendor/bundle/jruby/1.9/gems/logstash-core-2.3.3-java/lib/logstash/pipeline.rb:309:in `synchronize'"}]}>, :level=>:warn}


1 Answer


The problem ended up being that we were using AWS (EC2) discovery for the ES cluster. Even though we removed the dead instance (still visible in AWS, just not accessible) from the security group we were using for discovery, we had not restarted the remaining ES nodes. Once we restarted them, everything started working again.
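
For anyone hitting the same thing: with the cloud-aws plugin on ES 2.x, EC2 discovery is driven by elasticsearch.yml settings along these lines (the security group name here is illustrative, not from the original setup):

discovery.type: ec2
discovery.ec2.groups: my-es-nodes   # only instances in this security group are considered discovery candidates

And, per the answer above, each surviving node then needs a restart to pick up the change (command assumes a package-based install):

sudo service elasticsearch restart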