Skip to content

Commit bd80781

Browse files
authored
[change/fix] Recovery detection must trigger only critical checks #642
Closes #642
1 parent 67a5cd9 commit bd80781

File tree

11 files changed

+130
-23
lines changed

11 files changed

+130
-23
lines changed

docs/user/settings.rst

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -619,21 +619,32 @@ Automatically created charts.
619619
``OPENWISP_MONITORING_CRITICAL_DEVICE_METRICS``
620620
-----------------------------------------------
621621

622-
============ ==================================================================
622+
============ ===================================================================================
623623
**type**: ``list`` of ``dict`` objects
624-
**default**: .. code-block::
624+
**default**: .. code-block:: python
625625

626626
[
627-
{"key": "ping", "field_name": "reachable"},
628-
{"key": "data_collected", "field_name": "data_collected"},
627+
{
628+
"key": "ping",
629+
"field_name": "reachable",
630+
"check": "openwisp_monitoring.check.classes.Ping", # optional
631+
},
632+
{
633+
"key": "data_collected",
634+
"field_name": "data_collected",
635+
"check": "openwisp_monitoring.check.classes.DataCollected", # optional
636+
},
629637
]
630-
============ ==================================================================
638+
============ ===================================================================================
631639

632640
Device metrics that are considered critical.
633641

634642
When all critical metrics have values outside their expected range, the
635643
device's health status changes to ``CRITICAL``.
636644

645+
The ``check`` field is optional for each metric. These checks are executed
646+
when attempting to detect the recovery of a device.
647+
637648
.. _openwisp_monitoring_health_status_labels:
638649

639650
``OPENWISP_MONITORING_HEALTH_STATUS_LABELS``

openwisp_monitoring/check/base/models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ class AbstractCheck(TimeStampedEditableModel):
4949
class Meta:
5050
abstract = True
5151
unique_together = ('name', 'object_id', 'content_type')
52+
indexes = [
53+
models.Index(
54+
fields=['content_type', 'object_id', 'is_active'],
55+
name='active_object_checks_idx',
56+
)
57+
]
5258

5359
permissions = (
5460
('add_check_inline', 'Can add check inline'),
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Generated by Django 4.2.17 on 2025-03-25 12:00
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
7+
dependencies = [
8+
('check', '0010_create_data_collected_check'),
9+
]
10+
11+
operations = [
12+
migrations.AddIndex(
13+
model_name='check',
14+
index=models.Index(
15+
fields=['content_type', 'object_id', 'is_active'],
16+
name='active_object_checks_idx',
17+
),
18+
),
19+
]

openwisp_monitoring/device/apps.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -224,11 +224,13 @@ def manage_device_recovery_cache_key(cls, instance, status, **kwargs):
224224

225225
@classmethod
226226
def trigger_device_recovery_checks(cls, instance, **kwargs):
227-
from .tasks import trigger_device_checks
227+
from .tasks import trigger_device_critical_checks
228228

229229
# Cache is managed by "manage_device_recovery_cache_key".
230230
if cache.get(get_device_cache_key(device=instance), False):
231-
transaction_on_commit(lambda: trigger_device_checks.delay(pk=instance.pk))
231+
transaction_on_commit(
232+
lambda: trigger_device_critical_checks.delay(pk=instance.pk)
233+
)
232234

233235
@classmethod
234236
def connect_is_working_changed(cls):
@@ -242,7 +244,7 @@ def connect_is_working_changed(cls):
242244
def is_working_changed_receiver(
243245
cls, instance, is_working, old_is_working, failure_reason, **kwargs
244246
):
245-
from .tasks import trigger_device_checks
247+
from .tasks import trigger_device_critical_checks
246248

247249
Check = load_model('check', 'Check')
248250
device = instance.device
@@ -266,11 +268,15 @@ def is_working_changed_receiver(
266268
return
267269
if not is_working:
268270
if initial_status == 'ok':
269-
transaction_on_commit(lambda: trigger_device_checks.delay(pk=device.pk))
271+
transaction_on_commit(
272+
lambda: trigger_device_critical_checks.delay(pk=device.pk)
273+
)
270274
else:
271275
# if checks exist trigger them else, set status as 'ok'
272276
if Check.objects.filter(object_id=instance.device.pk).exists():
273-
transaction_on_commit(lambda: trigger_device_checks.delay(pk=device.pk))
277+
transaction_on_commit(
278+
lambda: trigger_device_critical_checks.delay(pk=device.pk)
279+
)
274280
else:
275281
device_monitoring.update_status(status)
276282

openwisp_monitoring/device/base/models.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,19 @@ def get_active_metrics(cls):
398398
cls._active_metrics = active_metrics
399399
return cls._active_metrics
400400

401+
@classmethod
402+
def get_critical_checks(cls):
403+
"""
404+
Returns list of critical checks.
405+
"""
406+
if not hasattr(cls, '_critical_checks'):
407+
critical_checks = []
408+
for metric in app_settings.CRITICAL_DEVICE_METRICS:
409+
if metric.get('check'):
410+
critical_checks.append(metric['check'])
411+
cls._critical_checks = critical_checks
412+
return cls._critical_checks
413+
401414
@staticmethod
402415
@receiver(threshold_crossed, dispatch_uid='threshold_crossed_receiver')
403416
def threshold_crossed(sender, metric, alert_settings, target, first_time, **kwargs):

openwisp_monitoring/device/settings.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,21 @@
55

66

77
def get_critical_device_metrics():
8-
default = [{'key': 'ping', 'field_name': 'reachable'}]
8+
default = [
9+
{
10+
'key': 'ping',
11+
'field_name': 'reachable',
12+
'check': 'openwisp_monitoring.check.classes.Ping',
13+
}
14+
]
915
if AUTO_DATA_COLLECTED_CHECK:
10-
default.append({'key': 'data_collected', 'field_name': 'data_collected'})
16+
default.append(
17+
{
18+
'key': 'data_collected',
19+
'field_name': 'data_collected',
20+
'check': 'openwisp_monitoring.check.classes.DataCollected',
21+
}
22+
)
1123
critical_metrics = get_settings_value(
1224
'CRITICAL_DEVICE_METRICS',
1325
default,

openwisp_monitoring/device/tasks.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import warnings
23

34
from celery import shared_task
45
from django.core.exceptions import ObjectDoesNotExist
@@ -13,7 +14,7 @@
1314

1415

1516
@shared_task(base=OpenwispCeleryTask)
16-
def trigger_device_checks(pk, recovery=True):
17+
def trigger_device_critical_checks(pk, recovery=True):
1718
"""Triggers the monitoring checks for the specified device pk.
1819
1920
Retrieves all related checks to the passed ``device`` and calls the
@@ -28,7 +29,11 @@ def trigger_device_checks(pk, recovery=True):
2829
except ObjectDoesNotExist:
2930
logger.warning(f'The device with uuid {pk} has been deleted')
3031
return
31-
check_ids = list(device.checks.filter(is_active=True).values_list('id', flat=True))
32+
check_ids = list(
33+
device.checks.filter(
34+
is_active=True, check_type__in=device.monitoring.get_critical_checks()
35+
).values_list('id', flat=True)
36+
)
3237
if not check_ids:
3338
status = 'ok' if recovery else 'critical'
3439
device.monitoring.update_status(status)
@@ -39,6 +44,19 @@ def trigger_device_checks(pk, recovery=True):
3944
perform_check.delay(check_id)
4045

4146

47+
@shared_task(base=OpenwispCeleryTask)
48+
def trigger_device_checks(pk, recovery=True):
49+
"""
50+
Deprecated, use trigger_device_critical_checks instead.
51+
"""
52+
warnings.warn(
53+
'trigger_device_checks is deprecated, use trigger_device_critical_checks instead.',
54+
DeprecationWarning,
55+
stacklevel=2,
56+
)
57+
trigger_device_critical_checks(pk, recovery)
58+
59+
4260
@shared_task(base=OpenwispCeleryTask)
4361
def delete_wifi_clients_and_sessions(days=6 * 30):
4462
WifiClient = load_model('device_monitoring', 'WifiClient')

openwisp_monitoring/device/tests/test_models.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from ...db import timeseries_db
1717
from .. import settings as app_settings
1818
from ..signals import health_status_changed
19-
from ..tasks import delete_wifi_clients_and_sessions, trigger_device_checks
19+
from ..tasks import delete_wifi_clients_and_sessions, trigger_device_critical_checks
2020
from ..utils import get_device_cache_key
2121
from . import (
2222
DeviceMonitoringTestCase,
@@ -537,9 +537,9 @@ def test_resources_no_key(self):
537537
dd.validate_data()
538538

539539
@patch('logging.Logger.warning')
540-
def test_trigger_device_checks_task_resiliency(self, mock):
540+
def test_trigger_device_critical_checks_task_resiliency(self, mock):
541541
dd = DeviceData(name='Test Device')
542-
trigger_device_checks.delay(dd.pk)
542+
trigger_device_critical_checks.delay(dd.pk)
543543
mock.assert_called_with(f'The device with uuid {dd.pk} has been deleted')
544544

545545
def test_device_data_cache_set(self):

openwisp_monitoring/device/tests/test_recovery.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@
55
from swapper import load_model
66

77
from ..signals import health_status_changed
8-
from ..tasks import trigger_device_checks
8+
from ..tasks import trigger_device_checks, trigger_device_critical_checks
99
from ..utils import get_device_cache_key
1010
from . import DeviceMonitoringTestCase, DeviceMonitoringTransactionTestcase
1111

1212
DeviceMonitoring = load_model('device_monitoring', 'DeviceMonitoring')
1313
DeviceData = load_model('device_monitoring', 'DeviceData')
1414
Device = load_model('config', 'Device')
15+
Check = load_model('check', 'Check')
1516

1617

1718
class TestRecovery(DeviceMonitoringTestCase):
@@ -57,18 +58,24 @@ def test_status_set_ok(self):
5758
"""Tests device status is set to ok if no related checks present"""
5859
dm = self._create_device_monitoring()
5960
dm.update_status('critical')
60-
trigger_device_checks.delay(dm.device.pk)
61+
trigger_device_critical_checks.delay(dm.device.pk)
6162
dm.refresh_from_db()
6263
self.assertEqual(dm.status, 'ok')
6364

6465
def test_status_set_critical(self):
6566
"""Tests device status is set to critical if no related checks present and recovery=False is passed"""
6667
dm = self._create_device_monitoring()
6768
dm.update_status('critical')
68-
trigger_device_checks.delay(dm.device.pk, recovery=False)
69+
trigger_device_critical_checks.delay(dm.device.pk, recovery=False)
6970
dm.refresh_from_db()
7071
self.assertEqual(dm.status, 'critical')
7172

73+
@patch('openwisp_monitoring.device.tasks.trigger_device_critical_checks')
74+
def test_trigger_device_checks(self, mocked_task):
75+
dm = self._create_device_monitoring()
76+
trigger_device_checks.delay(dm.device.pk)
77+
mocked_task.assert_called_once_with(dm.device.pk, True)
78+
7279

7380
class TestRecoveryTransaction(DeviceMonitoringTransactionTestcase):
7481
@patch('openwisp_monitoring.device.tasks.perform_check.delay')
@@ -111,6 +118,15 @@ def test_metrics_received_trigger_device_recovery_checks(self, mocked_task):
111118
content_type='application/json',
112119
)
113120
self.assertEqual(response.status_code, 200)
114-
self.assertEqual(mocked_task.call_count, 4)
121+
critical_checks = device_monitoring.get_critical_checks()
122+
self.assertEqual(mocked_task.call_count, len(critical_checks))
123+
# Verify that only critical checks are triggered
124+
# Get the check types from the mocked task calls
125+
triggered_check_types = list(
126+
Check.objects.filter(
127+
id__in=[call[0][0] for call in mocked_task.call_args_list]
128+
).values_list('check_type', flat=True)
129+
)
130+
self.assertListEqual(sorted(triggered_check_types), sorted(critical_checks))
115131
device_monitoring.refresh_from_db()
116132
self.assertEqual(device_monitoring.status, 'problem')

openwisp_monitoring/device/tests/test_transactions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from ...check.classes import Ping
1212
from ...check.tests import _FPING_REACHABLE, _FPING_UNREACHABLE
13-
from ..tasks import trigger_device_checks
13+
from ..tasks import trigger_device_critical_checks
1414
from . import DeviceMonitoringTransactionTestcase
1515

1616
Check = load_model('check', 'Check')
@@ -62,7 +62,7 @@ def test_trigger_device_recovery_task_regression(
6262
dm = self._create_device_monitoring()
6363
dm.device.management_ip = None
6464
dm.device.save()
65-
trigger_device_checks.delay(dm.device.pk)
65+
trigger_device_critical_checks.delay(dm.device.pk)
6666
self.assertTrue(Check.objects.exists())
6767
# we expect update_status() to be called twice (by the check)
6868
# and not a third time directly by our code

0 commit comments

Comments
 (0)