Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions ocs_ci/deployment/deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,6 @@
get_acm_mce_build_tag,
apply_oadp_workaround,
mute_mon_netsplit,
ceph_health_resolve_devicehealth,
)
from ocs_ci.utility.vsphere_nodes import update_ntp_compute_nodes
from ocs_ci.helpers import helpers
Expand Down Expand Up @@ -2272,14 +2271,6 @@ def deploy_ocs(self):
# https://issues.redhat.com/browse/DFBUGS-4521
if config.DEPLOYMENT.get("arbiter_deployment"):
mute_mon_netsplit(namespace=self.namespace)

# Workaround for DFBUGS-6749: devicehealth module fails when its
# pool cannot be created due to a missing default CRUSH rule.
try:
ceph_health_resolve_devicehealth()
except Exception as ex:
logger.warning(f"devicehealth workaround failed (may not be needed): {ex}")

# Verify health of ceph cluster
logger.info("Done creating rook resources, waiting for HEALTH_OK")
try:
Expand Down
7 changes: 0 additions & 7 deletions ocs_ci/ocs/resources/storage_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@
TimeoutSampler,
convert_device_size,
extract_image_urls,
ceph_health_resolve_devicehealth,
)
from ocs_ci.utility.decorators import switch_to_orig_index_at_last
from ocs_ci.helpers.helpers import storagecluster_independent_check
Expand Down Expand Up @@ -786,12 +785,6 @@ def ocs_install_verification(

# TODO: Enable the check when a solution is identified for tools pod on FaaS consumer
if not (fusion_aas_consumer or hci_cluster):
# Workaround for DFBUGS-6749: devicehealth module fails when its
# pool cannot be created due to a missing default CRUSH rule.
try:
ceph_health_resolve_devicehealth()
except Exception as ex:
log.warning(f"devicehealth workaround failed (may not be needed): {ex}")
# Temporarily disable health check for hci until we have enough healthy clusters
assert utils.ceph_health_check(
namespace,
Expand Down
63 changes: 0 additions & 63 deletions ocs_ci/utility/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3128,58 +3128,9 @@ def check_ceph_health_not_ok():
sampler.wait_for_func_status(True)


def ceph_health_resolve_devicehealth():
    """
    Work around a failed Ceph mgr ``devicehealth`` module whose backing pool
    could not be created because no default CRUSH rule exists.

    The fix applied, in order:
        1. Point ``osd_pool_default_crush_rule`` at rule 0 (the block pool rule)
        2. Bounce the devicehealth module so it retries pool creation
        3. Display and archive any crash reports produced along the way

    """
    # local import to avoid a circular dependency
    from ocs_ci.ocs.resources.pod import get_ceph_tools_pod

    log.warning(
        "Trying to fix devicehealth module failure by setting "
        "default CRUSH rule and restarting the module"
    )
    tools_pod = get_ceph_tools_pod()

    # (command, message logged after the command succeeds), executed in order
    steps = (
        (
            "ceph config set mon osd_pool_default_crush_rule 0",
            "Set osd_pool_default_crush_rule to 0",
        ),
        (
            "ceph mgr module force disable devicehealth --yes-i-really-mean-it",
            "Force disabled devicehealth module",
        ),
        (
            "ceph mgr module enable devicehealth",
            "Re-enabled devicehealth module",
        ),
    )
    for ceph_cmd, success_msg in steps:
        tools_pod.exec_ceph_cmd(
            ceph_cmd=ceph_cmd,
            format=None,
            out_yaml_format=False,
        )
        log.info(success_msg)

    # allow time for any crash triggered by the module restart to be reported
    time.sleep(180)

    ceph_crash_info_display(tools_pod)
    archive_ceph_crashes(tools_pod)


def ceph_health_resolve_crash():
"""
Fix ceph health issue with daemon crash

"""
log.warning("Trying to fix the issue with crash by archiving crashes")
from ocs_ci.ocs.resources.pod import get_ceph_tools_pod
Expand Down Expand Up @@ -3298,20 +3249,6 @@ def ceph_health_recover(

"""
ceph_health_fixes = [
{
"pattern": r"Module 'devicehealth' has failed",
"func": ceph_health_resolve_devicehealth,
"func_args": [],
"func_kwargs": {},
"ceph_health_tries": 10,
"ceph_health_delay": 30,
"known_issues": [
{
"issue": "DFBUGS-6749",
"pattern": r"Module 'devicehealth' has failed",
},
],
},
{
"pattern": r"daemons have recently crashed",
"func": ceph_health_resolve_crash,
Expand Down
Loading