diff --git a/omnibus/files/server-ctl-cookbooks/infra-server/recipes/opensearch.rb b/omnibus/files/server-ctl-cookbooks/infra-server/recipes/opensearch.rb
index f1af32313e..3b53521fda 100644
--- a/omnibus/files/server-ctl-cookbooks/infra-server/recipes/opensearch.rb
+++ b/omnibus/files/server-ctl-cookbooks/infra-server/recipes/opensearch.rb
@@ -1,5 +1,12 @@
 # Copyright:: Chef Software, Inc.
 # All Rights Reserved
+#
+# OpenSearch Recipe - Enhanced with Smart Readiness Checking
+#
+# This recipe configures OpenSearch for Chef Server with intelligent
+# startup validation instead of hardcoded sleep timings. The enhancement
+# addresses CHEF-26134 timing issues that emerged with OpenSearch 1.3.20
+# by implementing proper health checks before security configuration.
 
 MAX_MAP_COUNT = 262_144
 cluster_name = if node['previous_run'] && node['previous_run']['opensearch'] && node['previous_run']['opensearch']['cluster_name']
@@ -177,14 +184,107 @@
   retry_delay 1
 end
 
-chef_sleep 10
+# Wait for OpenSearch to answer HTTP requests before the security plugin is
+# configured, instead of sleeping for a fixed interval.
+#
+# Every attempt GETs "/" and then "/_cluster/health" on localhost. OpenSearch
+# counts as ready when the cluster status is green or yellow, when the health
+# endpoint answers 401 (service is up, security is enforcing auth), or when
+# "/" answered but the health request itself raised an error. The Chef run
+# fails once max_attempts attempts have gone by without any of those.
+ruby_block 'wait_for_opensearch_ready' do
+  block do
+    require 'net/http'
+    require 'json'
+    require 'timeout'
+
+    max_attempts = 60    # attempts before the run is failed
+    retry_interval = 5   # seconds slept between two attempts
+    attempt_timeout = 10 # seconds allowed for one attempt (both requests)
+    attempt = 0
+    opensearch_ready = false
+    opensearch_port = node['private_chef']['opensearch']['port'] || 9200
+    # Monotonic clock, so the elapsed time reported on failure is the real one.
+    started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
+
+    Chef::Log.info("Waiting for OpenSearch to become ready on port #{opensearch_port}...")
+
+    while attempt < max_attempts && !opensearch_ready
+      attempt += 1
+
+      begin
+        Timeout.timeout(attempt_timeout) do
+          # Check if OpenSearch API is responding
+          uri = URI("http://localhost:#{opensearch_port}/")
+          http = Net::HTTP.new(uri.host, uri.port)
+          http.read_timeout = 5
+          http.open_timeout = 5
+
+          response = http.request(Net::HTTP::Get.new(uri))
+
+          if %w(200 401).include?(response.code)
+            # 200 = OK, 401 = Unauthorized but service is running
+            begin
+              # Additional check: verify cluster health
+              health_uri = URI("http://localhost:#{opensearch_port}/_cluster/health")
+              health_response = http.request(Net::HTTP::Get.new(health_uri))
+
+              if health_response.code == '200'
+                cluster_status = JSON.parse(health_response.body)['status']
+
+                if %w(green yellow).include?(cluster_status)
+                  Chef::Log.info("OpenSearch is ready! Cluster status: #{cluster_status}, attempt #{attempt}/#{max_attempts}")
+                  opensearch_ready = true
+                else
+                  Chef::Log.debug("OpenSearch cluster status is #{cluster_status}, waiting... (attempt #{attempt}/#{max_attempts})")
+                end
+              elsif health_response.code == '401'
+                # Security is enabled but service is responding - we can proceed
+                Chef::Log.info("OpenSearch is ready! Security enabled, attempt #{attempt}/#{max_attempts}")
+                opensearch_ready = true
+              else
+                Chef::Log.debug("OpenSearch cluster health check failed with code #{health_response.code}, waiting... (attempt #{attempt}/#{max_attempts})")
+              end
+            rescue StandardError => e
+              # JSON::ParserError is a StandardError, so it lands here as well. If the
+              # health check fails but the basic service responds, consider it ready.
+              Chef::Log.debug("Error checking cluster health: #{e.message}, but basic service is responding")
+              opensearch_ready = true
+            end
+          else
+            Chef::Log.debug("OpenSearch not ready, HTTP response code: #{response.code} (attempt #{attempt}/#{max_attempts})")
+          end
+        end
+      rescue Timeout::Error
+        Chef::Log.debug("Timeout connecting to OpenSearch (attempt #{attempt}/#{max_attempts})")
+      rescue Errno::ECONNREFUSED, Errno::EHOSTUNREACH, SocketError => e
+        Chef::Log.debug("Connection failed to OpenSearch: #{e.message} (attempt #{attempt}/#{max_attempts})")
+      rescue StandardError => e
+        Chef::Log.debug("Unexpected error checking OpenSearch: #{e.message} (attempt #{attempt}/#{max_attempts})")
+      end
+
+      unless opensearch_ready
+        if attempt < max_attempts
+          Chef::Log.debug("OpenSearch not ready, waiting #{retry_interval} seconds before retry...")
+          sleep retry_interval
+        else
+          elapsed = (Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at).round
+          raise "OpenSearch failed to become ready after #{max_attempts} attempts (#{elapsed} seconds). Please check OpenSearch logs."
+        end
+      end
+    end
+
+    Chef::Log.info('OpenSearch readiness verified successfully!')
+  end
+  action :run
+end
 
 execute 'add internal user to opensearch security plugin' do
   command 'export JAVA_HOME="/opt/opscode/embedded/open-jre/"; ./securityadmin.sh -f ../securityconfig/internal_users.yml -icl -nhnv -cert /opt/opscode/embedded/opensearch/config/admin.pem -cacert /opt/opscode/embedded/opensearch/config/root-ca.pem -key /opt/opscode/embedded/opensearch/config/admin-key.pem'
   cwd '/opt/opscode/embedded/opensearch/plugins/opensearch-security/tools/'
   user OmnibusHelper.new(node).ownership['owner']
-  retries 10
-  retry_delay 1
+  retries 5
+  retry_delay 15
 end
 
 include_recipe 'infra-server::opensearch_index'