From 78443bc1d34c87ceb0c0958062dd40ae93b8c93a Mon Sep 17 00:00:00 2001 From: Duong Date: Thu, 17 Jul 2025 16:57:27 -0400 Subject: [PATCH 01/14] automatic cleaning of input data Rucio rules --- .../InputDataRucioRuleCleaner.py | 87 +++++ .../MSRuleCleaner/MSRuleCleaner.py | 71 +++- test/data/Mock/DBSMockData.json | 37 ++ test/data/Mock/RucioMockData.json | 22 ++ test/deploy/env_unittest_py3.sh | 13 +- .../InputDataRucioRuleCleaner_t.py | 342 ++++++++++++++++++ .../WMCore_t/GlobalWorkQueue_t/__init__.py | 7 + .../MSRuleCleaner_t/MSRuleCleaner_t.py | 133 ++++++- 8 files changed, 704 insertions(+), 8 deletions(-) create mode 100644 src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py create mode 100644 test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py create mode 100644 test/python/WMCore_t/GlobalWorkQueue_t/__init__.py diff --git a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py new file mode 100644 index 0000000000..a417de8c93 --- /dev/null +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -0,0 +1,87 @@ +from __future__ import (division, print_function) + +from time import time +from WMCore.REST.CherryPyPeriodicTask import CherryPyPeriodicTask +from WMCore.WorkQueue.WorkQueue import globalQueue +from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner + +from WMCore.Services.Rucio.Rucio import WMRucioDIDNotFoundException + +class InputDataRucioRuleCleaner(CherryPyPeriodicTask): + + def __init__(self, rest, config): + + super(InputDataRucioRuleCleaner, self).__init__(config) + self.globalQ = globalQueue(logger=self.logger, **config.queueParams) + self.msRuleCleaner = MSRuleCleaner(config, logger=self.logger) # Initialize MSRuleCleaner + + def setConcurrentTasks(self, config): + """ + sets the list of function reference for concurrent tasks + """ + 
self.concurrentTasks = [{'func': self.cleanRucioRules, 'duration': config.cleanInputDataRucioRuleDuration}] + + def cleanRucioRules(self, config): + """ + Queries global queue and builds the list of blocklevel Rucio rules of finished elements to be deleted. Calls MSRuleCleaner cleanRucioRules(self, wflow) to delete the rules. + :config: The configuration for the task. This uses Rucio account from config to use for querying rules + :return: The result of MSRuleCleaner cleanRucioRules(self, wflow) method, which is True if all rules were deleted successfully, False otherwise. + """ + + tStart = time() + + #statuses = ['Available', 'Done', 'Acquired', 'Failed', 'Canceled'] + #globalQueueElements=self.globalQ.getWork({'Status':'Done'},siteJobCounts={}) + globalQueueElements=self.globalQ.backend.getElements() + + #print("Elements in GlobalQueue cleanRucioRules:") + #print(json.dumps(globalQueueElements,indent=2)) + + #to be able to use cleanRules method of MSRuleCleaner + rulesToClean = {'PlineMarkers':['Current'], 'RulesToClean': {'Current': []}, 'CleanupStatus': {'Current': []}} + + if globalQueueElements: + #print(f"Found {len(globalQueueElements)} elements in GlobalQueue") + for element in globalQueueElements: + + requestName = element.get('RequestName') # Extract the RequestName field + percentComplete = element.get('PercentComplete', 0) # Default to 0 if key is missing + percentSuccess = element.get('PercentSuccess', 0) # Default to 0 if key is missing + + + if percentComplete == 100 and percentSuccess == 100: + + #'Inputs': {'/MinimumBias/ComissioningHI-v1/RAW#372d624c-089d-11e1-8347-003048caaace': + blocks = element.get('Inputs') # Example key for dataset + + # Fetch rules for blocks + if blocks: + for block in blocks: + print("Adding block ", block, " to RulesToClean") + dataCont = block.split('#')[0] # Extract the container name from the block + + if dataCont in self.msRuleCleaner.globalLocks: + msg = "Found dataset: %s in GlobalLocks. 
NOT considering it for filling the " + msg += "RulesToClean list for both container and block level Rules for workflow: %s!" + self.logger.info(msg, dataCont, requestName) + continue + try: + print('Fetching rules for block:', block, "\n", config.rucioAccount, "\n", self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount)) + for rule in self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount): + msg = "Found %s block-level rule to be deleted for container %s" + self.logger.info(msg, rule['id'], dataCont) + #cleanRules of MSRuleCleaner expects a list of rule ids and always clean the last one in the list of PlineMarkers + rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]].append(rule['id']) + except WMRucioDIDNotFoundException: + msg = "Block: %s not found in Rucio for workflow: %s." + self.logger.info(msg, block, requestName) + continue + + self.logger.info("%s executed in %.3f secs.", self.__class__.__name__, time() - tStart) + return self.msRuleCleaner.cleanRucioRules(rulesToClean) + + else: + print("No elements with status DONE found in GlobalQueue") + + self.logger.info("%s executed in %.3f secs.", self.__class__.__name__, time() - tStart) + return \ No newline at end of file diff --git a/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py b/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py index f1ab977790..d86ea230a2 100644 --- a/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py +++ b/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py @@ -38,6 +38,7 @@ from Utils.Pipeline import Pipeline, Functor from Utils.CertTools import ckey, cert +from WMCore.Services.WorkQueue.WorkQueue import WorkQueue as WorkQueueDS class MSRuleCleanerResolveParentError(WMException): """ @@ -84,6 +85,7 @@ def __init__(self, msConfig, logger=None): self.msConfig.setdefault("services", ['ruleCleaner']) self.msConfig.setdefault("rucioWmaAccount", "wma_test") 
self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor") + self.msConfig.setdefault("QueueURL", "http://localhost:5984/workqueue") self.msConfig.setdefault('enableRealMode', False) self.msConfig.setdefault('archiveDelayHours', 24 * 2) self.msConfig.setdefault('archiveAlarmHours', 24 * 30) @@ -135,12 +137,19 @@ def __init__(self, msConfig, logger=None): Functor(self.setArchivalDelayExpired), Functor(self.setLogDBClean), Functor(self.archive)]) - + + pName = 'plineMSTrBlockGlobalQueue' + self.plineMSTrBlockGlobalQueue = Pipeline(name=pName, + funcLine=[Functor(self.setPlineMarker, pName), + Functor(self.getGlobalWorkQueueRucioRules,self.msConfig['QueueURL'],self.msConfig['rucioWmaAccount']), #use wma_test for now. Should be changed later to self.msConfig['rucioMStrAccount'] + Functor(self.cleanRucioRules)]) + # Building the different set of plines we will need later: # NOTE: The following are all the functional pipelines which are supposed to include # a cleanup function and report cleanup status in the MSRuleCleanerWflow object self.cleanuplines = [self.plineMSTrCont, self.plineMSTrBlock, + #self.plineMSTrBlockGlobalQueue, self.plineAgentCont, self.plineAgentBlock] # Building an auxiliary list of cleanup pipeline names only: @@ -151,6 +160,7 @@ def __init__(self, msConfig, logger=None): self.plineAgentBlock] self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock] + #self.plineMSTrBlockGlobalQueue] # Initialization of the 'cleaned' and 'archived' counters: self.wfCounters = {'cleaned': {}, @@ -746,6 +756,65 @@ def getRucioRules(self, wflow, gran, rucioAcct): msg = "Container: %s not found in Rucio for workflow: %s." 
self.logger.info(msg, dataCont, wflow['RequestName']) return wflow + + def getGlobalWorkQueueRucioRules(self,wflow,QueueURL,rucioAcct): + + """ + Queries globle queue and builds the list of blocklevel rules of finished elements for the given workflow + :param wflow: A MSRuleCleaner workflow representation + :param QueueURL: The URL of the Global Work Queue + :rucioAcct: The Rucio account to use for querying rules + :return: The workflow object + """ + + currPline = wflow['PlineMarkers'][-1] + workflowName = wflow['RequestName'] + + #get work queue elements using work queue API. Not use since the microservice can be run on machines that can not access backend directly + #globalQueueElements = self.globalQueue.backend.getElementsForWorkflow(workflowName) + + wqService = WorkQueueDS(QueueURL, 'workqueue_t') + globalQueueElements=wqService.getWQElementsByWorkflow(workflowName) + + # Check and process the retrieved elements + if globalQueueElements: + print(f"Found {len(globalQueueElements)} elements for workflow: {workflowName}") + for element in globalQueueElements: + + percentComplete = element.get('PercentComplete', 0) # Default to 0 if key is missing + percentSuccess = element.get('PercentSuccess', 0) # Default to 0 if key is missing + + if percentComplete == 100 and percentSuccess == 100: + + #'Inputs': {'/MinimumBias/ComissioningHI-v1/RAW#372d624c-089d-11e1-8347-003048caaace': + blocks = element.get('Inputs') # Example key for dataset + + # Fetch rules for blocks + if blocks: + for block in blocks: + print("Adding block ", block, " to RulesToClean") + dataCont = block.split('#')[0] # Extract the container name from the block + + if dataCont in self.globalLocks: + msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " + msg += "RulesToClean list for both container and block level Rules for workflow: %s!" 
+ self.logger.info(msg, dataCont, wflow['RequestName']) + continue + try: + for rule in self.rucio.listDataRules(block, account=rucioAcct): + wflow['RulesToClean'][currPline].append(rule['id']) + msg = "Found %s block-level rule to be deleted for container %s" + self.logger.info(msg, rule['id'], dataCont) + except WMRucioDIDNotFoundException: + msg = "Block: %s not found in Rucio for workflow: %s." + self.logger.info(msg, block, wflow['RequestName']) + + + else: + print(f"No elements found for workflow: {workflowName}") + + return wflow + def cleanRucioRules(self, wflow): """ diff --git a/test/data/Mock/DBSMockData.json b/test/data/Mock/DBSMockData.json index aca8d8faa3..38000007f1 100644 --- a/test/data/Mock/DBSMockData.json +++ b/test/data/Mock/DBSMockData.json @@ -13793,6 +13793,11 @@ "dataset": "/MinimumBias/ComissioningHI-v1/RAW" } ], + "listDatasets:[('dataset', '/JetHT/Run2012C-v1/RAW'), ('dataset_access_type', '*')]": [ + { + "dataset": "/JetHT/Run2012C-v1/RAW" + } + ], "listDatasets:[('dataset', '/MinimumBias/FAKE-Filter-v1/RECO'), ('dataset_access_type', '*')]": [], "listDatasets:[('dataset', '/NoBPTX/Run2016F-23Sep2016-v1/DQMIO'), ('dataset_access_type', '*')]": [ { @@ -25424,6 +25429,14 @@ "md5": "NOTSET" } ], + "listBlocks:[('dataset', '/JetHT/Run2012C-v1/RAW'), ('detail', False)]": [ + { + "block_name": "/JetHT/Run2012C-v1/RAW#001975f4-e9fa-11e1-9597-842b2b4671d8" + }, + { + "block_name": "/JetHT/Run2012C-v1/RAW#004e510c-07fa-11e2-ad47-842b2b4671d8" + } + ], "listFileArray:[('dataset', '/Cosmics/ComissioningHI-v1/RAW'), ('detail', False), ('validFileOnly', 1)]": [ { "logical_file_name": "/store/data/ComissioningHI/Cosmics/RAW/v1/000/180/855/B452861C-1108-E111-B809-BCAEC53296FE.root" @@ -545728,6 +545741,30 @@ "num_lumi": 51879 } ], + "listFileSummaries:[('block_name', '/JetHT/Run2012C-v1/RAW#001975f4-e9fa-11e1-9597-842b2b4671d8'), ('validFileOnly', 1)]": [ + { + "file_size": 19829448550, + "max_ldate": 1321139371, + "median_cdate": 1321135779, + 
"median_ldate": 1321139371, + "num_block": 1, + "num_event": 14558, + "num_file": 5, + "num_lumi": 22 + } + ], + "listFileSummaries:[('block_name', '/JetHT/Run2012C-v1/RAW#004e510c-07fa-11e2-ad47-842b2b4671d8'), ('validFileOnly', 1)]": [ + { + "file_size": 6879134, + "max_ldate": 1321139371, + "median_cdate": 1321135779, + "median_ldate": 1321139371, + "num_block": 1, + "num_event": 14558, + "num_file": 1, + "num_lumi": 4 + } + ], "listFiles:[('block_name', '/Cosmics/ComissioningHI-PromptReco-v1/RECO#7020873e-0dcd-11e1-9b6c-003048caaace')]": [ { "logical_file_name": "/store/data/ComissioningHI/Cosmics/RECO/PromptReco-v1/000/181/369/F2189593-CC0D-E111-8C3C-BCAEC518FF89.root" diff --git a/test/data/Mock/RucioMockData.json b/test/data/Mock/RucioMockData.json index 5715a63d66..8522c17c44 100644 --- a/test/data/Mock/RucioMockData.json +++ b/test/data/Mock/RucioMockData.json @@ -1597,6 +1597,28 @@ "scope": "cms", "type": "CONTAINER" }, + "getDID:[('didName', '/JetHT/Run2012C-v1/RAW#001975f4-e9fa-11e1-9597-842b2b4671d8'), ('dynamic', False)]": { + "account": "sync_t0_ch_cern_tape", + "bytes": 19829448550, + "expired_at": null, + "length": 1, + "monotonic": false, + "name": "/JetHT/Run2012C-v1/RAW#001975f4-e9fa-11e1-9597-842b2b4671d8", + "open": true, + "scope": "cms", + "type": "DATASET" + }, + "getDID:[('didName', '/JetHT/Run2012C-v1/RAW#004e510c-07fa-11e2-ad47-842b2b4671d8'), ('dynamic', False)]": { + "account": "sync_t0_ch_cern_tape", + "bytes": 6879134, + "expired_at": null, + "length": 1, + "monotonic": false, + "name": "/JetHT/Run2012C-v1/RAW#004e510c-07fa-11e2-ad47-842b2b4671d8", + "open": true, + "scope": "cms", + "type": "DATASET" + }, "listDataRules:[('account', 'ms-pileup'), ('scope', 'cms')]": [{ "state": "OK", "id": "123", diff --git a/test/deploy/env_unittest_py3.sh b/test/deploy/env_unittest_py3.sh index 37a3a335ff..9909d44dee 100755 --- a/test/deploy/env_unittest_py3.sh +++ b/test/deploy/env_unittest_py3.sh @@ -33,11 +33,14 @@ ln -s $TEST_SRC/javascript/ 
$ORG_SRC_OTHER ln -s $TEST_SRC/template/ $ORG_SRC_OTHER export WMAGENT_SECRETS_LOCATION=$ADMIN_DIR/WMAgent_unittest.secrets -export X509_HOST_CERT=$CERT_DIR/servicecert.pem -export X509_HOST_KEY=$CERT_DIR/servicekey.pem -export X509_USER_CERT=$CERT_DIR/servicecert.pem -export X509_USER_KEY=$CERT_DIR/servicekey.pem - +#export X509_HOST_CERT=$CERT_DIR/servicecert.pem +#export X509_HOST_KEY=$CERT_DIR/servicekey.pem +#export X509_USER_CERT=$CERT_DIR/servicecert.pem +#export X509_USER_KEY=$CERT_DIR/servicekey.pem +export X509_HOST_CERT=$CERT_DIR/usercert.pem +export X509_HOST_KEY=$CERT_DIR/userkey_nopwd.pem +export X509_USER_CERT=$CERT_DIR/usercert.pem +export X509_USER_KEY=$CERT_DIR/userkey_nopwd.pem export install=$INSTALL_DIR/current/install/wmagentpy3 export config=$INSTALL_DIR/current/config/wmagentpy3 export manage=$config/manage diff --git a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py new file mode 100644 index 0000000000..654e9c588c --- /dev/null +++ b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py @@ -0,0 +1,342 @@ +from WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner import InputDataRucioRuleCleaner + +from WMQuality.Emulators.EmulatedUnitTestCase import EmulatedUnitTestCase + +import cherrypy + +# WMCore modules +from WMCore.Services.Rucio import Rucio + +from WMQuality.TestInitCouchApp import TestInitCouchApp +from WMQuality.Emulators.WMSpecGenerator.WMSpecGenerator import WMSpecGenerator +from WMCore.WorkQueue.WorkQueue import globalQueue +from WMCore.Services.WorkQueue.WorkQueue import WorkQueue as WorkQueueDS +from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner + +import json +# system modules +import os +import time + +import unittest + + +class DummyREST: + def __init__(self): + self.logger = None # Optional: add logger if needed + self.config = None + +#MSRuleCleaner requires plain dictionary to 
be passed as config while CherryPyPeriodic requires attributes, so we create a DictWithAttrs class +class DictWithAttrs(dict): + def __getattr__(self, key): + try: + return self[key] + except KeyError as e: + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{key}'") from e + +class InputDataRucioRuleCleanerTest(EmulatedUnitTestCase): + + def setUp(self): + self.msConfig = {"verbose": True, + "interval": 1 * 60, + "services": ['ruleCleaner'], + "rucioAccount": 'wma_test', + 'reqmgr2Url': 'https://cmsweb-testbed.cern.ch/reqmgr2', + 'msOutputUrl': 'https://cmsweb-testbed.cern.ch/ms-output', + 'reqmgrCacheUrl': 'https://cmsweb-testbed.cern.ch/couchdb/reqmgr_workload_cache', + 'phedexUrl': 'https://cmsweb-testbed.cern.ch/phedex/datasvc/json/prod', + 'dbsUrl': 'https://cmsweb-testbed.cern.ch/dbs/int/global/DBSReader', + 'rucioUrl': 'http://cms-rucio-int.cern.ch', + 'rucioAuthUrl': 'https://cms-rucio-auth-int.cern.ch', + "wmstatsUrl": "https://cmsweb-testbed.cern.ch/wmstatsserver", + "logDBUrl": "https://cmsweb-testbed.cern.ch/couchdb/wmstats_logdb", + 'logDBReporter': 'reqmgr2ms_ruleCleaner', + 'archiveDelayHours': 8, + 'archiveAlarmHours': 24, + 'enableRealMode': True} + + self.creds = {"client_cert": os.getenv("X509_USER_CERT", "Unknown"), + "client_key": os.getenv("X509_USER_KEY", "Unknown")} + self.rucioConfigDict = {"rucio_host": self.msConfig['rucioUrl'], + "auth_host": self.msConfig['rucioAuthUrl'], + "auth_type": "x509", + "account": self.msConfig['rucioAccount'], + "ca_cert": False, + "timeout": 30, + "request_retries": 3, + "creds": self.creds} + + + self.specGenerator = WMSpecGenerator("WMSpecs") + self.schema = [] + self.couchApps = ["WorkQueue"] + self.testInit = TestInitCouchApp('WorkQueueServiceTest') + self.testInit.setLogging() + self.testInit.setDatabaseConnection() + self.testInit.setSchema(customModules=self.schema, + useDefault=False) + self.testInit.setupCouch('workqueue_t', *self.couchApps) + 
self.testInit.setupCouch('workqueue_t_inbox', *self.couchApps) + self.testInit.setupCouch('local_workqueue_t', *self.couchApps) + self.testInit.setupCouch('local_workqueue_t_inbox', *self.couchApps) + self.testInit.generateWorkDir() + + self.msConfig.update({'QueueURL':self.testInit.couchUrl}) + + self.queueParams = {} + self.queueParams['log_reporter'] = "Services_WorkQueue_Unittest" + self.queueParams['rucioAccount'] = self.msConfig['rucioAccount'] + self.queueParams['rucioAuthUrl'] = "http://cms-rucio-int.cern.ch" + self.queueParams['rucioUrl'] = "https://cms-rucio-auth-int.cern.ch" + self.queueParams['_internal_name'] = 'GlobalWorkQueueTest' + self.queueParams['log_file'] = 'test.log' + + + print("X509_USER_CERT:", os.getenv("X509_USER_CERT")) + print("X509_USER_KEY:", os.getenv("X509_USER_KEY")) + + # Create config object with attributes + self.config_obj = DictWithAttrs(self.msConfig) + #additional attributes needed by cherrypy periodic task + self.config_obj._internal_name = "GlobalWorkQueueTest" + self.config_obj.log_file = "test.log" + #additional attributes needed by global workqueue + self.config_obj.queueParams = self.queueParams + #duration for the periodic task in seconds + self.config_obj.cleanInputDataRucioRuleDuration = 10 + + super(InputDataRucioRuleCleanerTest, self).setUp() + + def testInputDataRucioRuleCleaner(self): + """ + Test the InputDataRucioRuleCleaner task + """ + #Get workflow description. 
ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, + #so the workflow description here and the one used in creating workqueue is the same + specName = "RerecoSpec" + inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} + + #Create ReRecoSpec as stored in GlobalQueue + specUrl = self.specGenerator.createReRecoSpec(specName, "file", + assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) + + #cleaner = InputDataRucioRuleCleaner(rest=self.mockRest, config=self.config_obj) + cleaner = InputDataRucioRuleCleaner(rest=DummyREST(), config=self.config_obj) + + #Make GlobalQueue + globalQ = globalQueue(DbName='workqueue_t', + QueueURL=self.testInit.couchUrl, + UnittestFlag=True, logger=cleaner.logger, **self.queueParams) + globalQ.queueWork(specUrl, specName, "teamA") + cleaner.globalQ = globalQ + + #Make MSRuleCleaner + msRuleCleaner = MSRuleCleaner(self.config_obj,logger=cleaner.logger) + msRuleCleaner.resetCounters() + msRuleCleaner.rucio = Rucio.Rucio(self.msConfig['rucioAccount'], + hostUrl=self.rucioConfigDict['rucio_host'], + authUrl=self.rucioConfigDict['auth_host'], + configDict=self.rucioConfigDict) + + cleaner.msRuleCleaner = msRuleCleaner + + #Let try to modify the element in GlobalQueue to have PercentComplete and PercentSuccess set to 100 + wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') + #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + + print("Elements in GlobalQueue:") + elements = data.get('rows', []) + print(json.dumps(elements, indent=2)) + + #let update the PercentComplete and PercentSuccess and Status='Done' of the first elements + element_id = [elements[0]['id']] # Get the first element's ID + print("Updating element:", element_id) + wqService.updateElements(*element_id, 
PercentComplete=100, PercentSuccess=100, Status='Done') + + #create a rule and inject it in wma_test account + blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element + print("Block Name:", blockNames[0]) + + #need to create rule here otherwise we do not know which element was updated since the element order changes each time re-fetching (of course we can use the element_id) + rule_id = cleaner.msRuleCleaner.rucio.createReplicationRule( + names=blockNames[0], + rseExpression="T2_US_Nebraska", + copies=1, + grouping="DATASET", + lifetime=360, + account="wma_test", + ask_approval=False, + activity="Production Input", + comment="WMCore test block rule creation" + ) + + print("Created Rucio rule with ID:", rule_id) + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + print(rule_info) + + # Re-fetch the elements to see the update + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + #element order changes each time, so we need to re-fetch the elements + elements = data.get('rows', []) + #elements=wqService.getWQElementsByWorkflow(specName) + print("Updated Elements in GlobalQueue:") + for e in elements: + print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) + #print(e["id"], e['Status'], e["PercentComplete"], e["PercentSuccess"]) + + + results = cleaner.cleanRucioRules(self.config_obj) + print("Results from cleanRucioRules:", json.dumps(results, indent=2)) + #now make sure the rule is cleaned + #keep deleting until success or timeout + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + delResult = False + timeleft = 0 + start_time = time.time() + while rule_info and not delResult and timeleft < 300: + #now delete it + print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], 
account=self.msConfig['rucioAccount'])) + delResult = cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) + print("Deleted Rucio rule with ID:", rule_id, delResult) + if delResult: break + time.sleep(60) + timeleft = time.time() - start_time + + if not delResult and timeleft >= 300: + print("Failed to delete the rule after 5 minutes, exiting...") + + self.assertTrue(results['CleanupStatus']['Current']) + + + def testInputDataRucioRuleCleanerWithThreading(self): + """ + Test the InputDataRucioRuleCleaner task with threading + """ + + #cleaner = InputDataRucioRuleCleaner(rest=self.mockRest, config=self.config_obj) + cleaner = InputDataRucioRuleCleaner(rest=DummyREST(), config=self.config_obj) + + #Get workflow description. ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, + #so the workflow description here and the one used in creating workqueue is the same + specName = "RerecoSpec" + inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} + + #Create ReRecoSpec as stored in GlobalQueue + specUrl = self.specGenerator.createReRecoSpec(specName, "file", + assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) + + #Make GlobalQueue + globalQ = globalQueue(DbName='workqueue_t', + QueueURL=self.testInit.couchUrl, + UnittestFlag=True, logger=cleaner.logger, **self.queueParams) + globalQ.queueWork(specUrl, specName, "teamA") + cleaner.globalQ = globalQ + + #Make MSRuleCleaner + msRuleCleaner = MSRuleCleaner(self.config_obj,logger=cleaner.logger) + msRuleCleaner.resetCounters() + msRuleCleaner.rucio = Rucio.Rucio(self.msConfig['rucioAccount'], + hostUrl=self.rucioConfigDict['rucio_host'], + authUrl=self.rucioConfigDict['auth_host'], + configDict=self.rucioConfigDict) + cleaner.msRuleCleaner = msRuleCleaner + + # Start CherryPy engine + print('CherryPy engine starting...') + cherrypy.engine.start() + # Give CherryPy a moment to start and modify the element in GlobalQueue after 5 seconds and before the next run of 
the periodic task + time.sleep(5) + + #Let try to modify the element in GlobalQueue to have PercentComplete and PercentSuccess set to 100 + wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') + #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + + print("Elements in GlobalQueue:") + elements = data.get('rows', []) + print(json.dumps(elements, indent=2)) + + #let update the PercentComplete and PercentSuccess and Status='Done' of the first elements + element_id = [elements[0]['id']] # Get the first element's ID + print("Updating element:", element_id) + wqService.updateElements(*element_id, PercentComplete=100, PercentSuccess=100, Status='Done') + + #create a rule and inject it in wma_test account + blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element + + #need to create rule here otherwise we do not know which element was updated since the element order changes each time re-fetching (of course we can use the element_id) + rule_id = cleaner.msRuleCleaner.rucio.createReplicationRule( + names=blockNames[0], + rseExpression="T2_US_Nebraska", + copies=1, + grouping="DATASET", + lifetime=360, + account="wma_test", + ask_approval=False, + activity="Production Input", + comment="WMCore test block rule creation" + ) + + print("Created Rucio rule with ID:", rule_id) + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + print(rule_info) + + # Re-fetch the elements to see the update + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + + elements = data.get('rows', []) + print("Updated Elements in GlobalQueue:") + for e in elements: + print(e["id"], e['value']['Status'], e['value']["PercentComplete"], 
e['value']["PercentSuccess"]) + + time.sleep(20) + + print('CherryPy engine exiting...') + cherrypy.engine.exit() + + #now continuously check the rule status until it is cleaned and exit after 10 minutes + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + timeleft = 0 + start_time = time.time() + while rule_info and timeleft < 600: # Check for 10 minutes + print("Rule still exists:", rule_id[0], rule_info) + time.sleep(60) + timeleft = time.time() - start_time + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + + rule_info_for_check = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + + #now make sure the rule should be cleaned (note that the rule may not be cleaned immediately after the periodic task execution (~5 mins), but we just clean it again here) + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + delResult = False + if not rule_info: + print("Rule not found.") + + #keep deleting until success or timeout + timeleft = 0 + start_time = time.time() + while rule_info and not delResult and timeleft < 300: + #now delete it + print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msConfig['rucioAccount'])) + delResult = cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) + print("Deleted Rucio rule with ID:", rule_id, delResult) + if delResult: break + time.sleep(60) + timeleft = time.time() - start_time + + if not delResult and timeleft >= 300: + print("Failed to delete the rule after 5 minutes, exiting...") + + self.assertTrue(not rule_info_for_check, "Rule not deleted successfully after periodic task execution.") + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/test/python/WMCore_t/GlobalWorkQueue_t/__init__.py b/test/python/WMCore_t/GlobalWorkQueue_t/__init__.py new file mode 100644 index 0000000000..d7144706e8 --- /dev/null +++ b/test/python/WMCore_t/GlobalWorkQueue_t/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env 
python +""" +_GlobalWorkQueue_t_ + +""" + +__all__ = [] diff --git a/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py b/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py index 5cfa4551c8..c9f2562bbd 100644 --- a/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py +++ b/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py @@ -15,7 +15,17 @@ from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner, MSRuleCleanerArchivalSkip from WMCore.MicroService.MSRuleCleaner.MSRuleCleanerWflow import MSRuleCleanerWflow from WMCore.Services.Rucio import Rucio +from rucio.common.exception import RuleNotFound +from WMQuality.Emulators.EmulatedUnitTestCase import EmulatedUnitTestCase + +from WMQuality.TestInitCouchApp import TestInitCouchApp +from WMQuality.Emulators.WMSpecGenerator.WMSpecGenerator import WMSpecGenerator +from WMCore.WorkQueue.WorkQueue import globalQueue +from WMCore.Services.WorkQueue.WorkQueue import WorkQueue as WorkQueueDS + + +from WMCore.WMSpec.StdSpecs.ReReco import ReRecoWorkloadFactory def getTestFile(partialPath): """ @@ -25,8 +35,8 @@ def getTestFile(partialPath): return os.path.join(normPath, partialPath) -# class MSRuleCleanerTest(EmulatedUnitTestCase): -class MSRuleCleanerTest(unittest.TestCase): +class MSRuleCleanerTest(EmulatedUnitTestCase): +#class MSRuleCleanerTest(unittest.TestCase): "Unit test for MSruleCleaner module" def setUp(self): @@ -63,12 +73,39 @@ def setUp(self): "creds": self.creds} self.reqStatus = ['announced', 'aborted-completed', 'rejected'] + + self.specGenerator = WMSpecGenerator("WMSpecs") + self.schema = [] + self.couchApps = ["WorkQueue"] + self.testInit = TestInitCouchApp('WorkQueueServiceTest') + self.testInit.setLogging() + self.testInit.setDatabaseConnection() + self.testInit.setSchema(customModules=self.schema, + useDefault=False) + self.testInit.setupCouch('workqueue_t', *self.couchApps) + 
self.testInit.setupCouch('workqueue_t_inbox', *self.couchApps) + self.testInit.setupCouch('local_workqueue_t', *self.couchApps) + self.testInit.setupCouch('local_workqueue_t_inbox', *self.couchApps) + self.testInit.generateWorkDir() + + self.msConfig.update({'QueueURL':self.testInit.couchUrl}) + print("msConfig: ", json.dumps(self.msConfig, indent=2)) + + self.msRuleCleaner = MSRuleCleaner(self.msConfig) self.msRuleCleaner.resetCounters() self.msRuleCleaner.rucio = Rucio.Rucio(self.msConfig['rucioAccount'], hostUrl=self.rucioConfigDict['rucio_host'], authUrl=self.rucioConfigDict['auth_host'], configDict=self.rucioConfigDict) + + + + self.queueParams = {} + self.queueParams['log_reporter'] = "Services_WorkQueue_Unittest" + self.queueParams['rucioAccount'] = "wma_test" + self.queueParams['rucioAuthUrl'] = "http://cms-rucio-int.cern.ch" + self.queueParams['rucioUrl'] = "https://cms-rucio-auth-int.cern.ch" self.taskChainFile = getTestFile('data/ReqMgr/requests/Static/TaskChainRequestDump.json') self.stepChainFile = getTestFile('data/ReqMgr/requests/Static/StepChainRequestDump.json') @@ -181,6 +218,7 @@ def testPipelineAgentCont(self): def testPipelineMSTrBlock(self): # Test plineAgentCont wflow = MSRuleCleanerWflow(self.taskChainReq) + print(wflow) self.msRuleCleaner.plineMSTrBlock.run(wflow) expectedWflow = {'CleanupStatus': {'plineMSTrBlock': True}, 'ForceArchive': False, @@ -222,7 +260,98 @@ def testPipelineMSTrBlock(self): 'TransferTape': False, 'TapeRulesStatus': [], 'StatusAdvanceExpiredMsg': ""} + self.assertDictEqual(wflow, expectedWflow) + + def testPipelineMSTrBlockGlobalQueue(self): + + #turn of tests of pipelineMSTrBlockGlobalQueue + self.skipTest("Skipping testPipelineMSTrBlockGlobalQueue. This is placeholder for future test of pipelineMSTrBlockGlobalQueue if needed") + + #Get workflow description. 
ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, + #so the workflow description here and the one used in creating workqueue is the same + specName = "RerecoSpec" + inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} + workflowDescription = ReRecoWorkloadFactory.getTestArguments() + workflowDescription['RequestName'] = specName + workflowDescription['InputDataset'] = inputdataset["InputDataset"] + + wflow = MSRuleCleanerWflow(workflowDescription) + + #Create ReRecoSpec as stored in GlobalQueue + specUrl = self.specGenerator.createReRecoSpec(specName, "file", + assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) + #Make GlobalQueue + globalQ = globalQueue(DbName='workqueue_t', + QueueURL=self.testInit.couchUrl, + UnittestFlag=True, **self.queueParams) + globalQ.queueWork(specUrl, specName, "teamA") + + #Let try to modify the element in GlobalQueue to have PercentComplete and PercentSuccess set to 100 + wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') + #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + + print("Elements in GlobalQueue:") + elements = data.get('rows', []) + print(json.dumps(elements, indent=2)) + + #let update the PercentComplete and PercentSuccess of the first elements + element_id = [elements[0]['id']] # Get the first element's ID + print("Updating element:", element_id) + wqService.updateElements(*element_id, PercentComplete=100, PercentSuccess=100) + # Re-fetch the elements to see the update + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + elements = data.get('rows', []) + #elements=wqService.getWQElementsByWorkflow(specName) + print("Updated Elements in 
GlobalQueue:") + for e in elements: + print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) + #print(e["id"], e['Status'], e["PercentComplete"], e["PercentSuccess"]) + + #now let try to create Rucio rule for the block + #create a rule and inject it in wma_test account + blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element + rule_id = self.msRuleCleaner.rucio.createReplicationRule( + names=blockNames[0], + rseExpression="T2_US_Nebraska", + copies=1, + grouping="DATASET", + lifetime=360, + account="wma_test", + ask_approval=False, + activity="Production Input", + comment="WMCore test block rule creation" + ) + + print("Created Rucio rule with ID:", rule_id) + rule_info = self.msRuleCleaner.rucio.getRule(rule_id[0]) + print(rule_info) + + self.msRuleCleaner.plineMSTrBlockGlobalQueue.run(wflow) + print("Workflow after plineMSTrBlockGlobalQueue:") + print(json.dumps(wflow, indent=2)) + + #now make sure the rule is cleaned + try: + rule_info = self.msRuleCleaner.rucio.getRule(rule_id[0]) + #print("Rule exists:", json.dumps(rule_info, indent=2)) + #now delete it + self.msRuleCleaner.rucio.deleteRule(rule_id[0]) + print("Deleted Rucio rule with ID:", rule_id) + except RuleNotFound: + print("Rule not found.") + except Exception as e: + print("Error checking rule:", e) + + print("Cleanup status: ", wflow['CleanupStatus']['plineMSTrBlockGlobalQueue']) + print("Rules to clean: ", wflow['RulesToClean']['plineMSTrBlockGlobalQueue'], rule_id) + assert((wflow['CleanupStatus']['plineMSTrBlockGlobalQueue'] is True)) + assert((wflow['RulesToClean']['plineMSTrBlockGlobalQueue'] == rule_id)) def testPipelineMSTrCont(self): # Test plineAgentCont From a6d2d27d3bd1690f9c47dc0d302c57221e89d79b Mon Sep 17 00:00:00 2001 From: Duong Date: Tue, 18 Nov 2025 22:18:06 -0500 Subject: [PATCH 02/14] restore MSRuleCleaner --- .../MSRuleCleaner/MSRuleCleaner.py | 71 +--------- 
.../MSRuleCleaner_t/MSRuleCleaner_t.py | 133 +----------------- 2 files changed, 3 insertions(+), 201 deletions(-) diff --git a/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py b/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py index d86ea230a2..f1ab977790 100644 --- a/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py +++ b/src/python/WMCore/MicroService/MSRuleCleaner/MSRuleCleaner.py @@ -38,7 +38,6 @@ from Utils.Pipeline import Pipeline, Functor from Utils.CertTools import ckey, cert -from WMCore.Services.WorkQueue.WorkQueue import WorkQueue as WorkQueueDS class MSRuleCleanerResolveParentError(WMException): """ @@ -85,7 +84,6 @@ def __init__(self, msConfig, logger=None): self.msConfig.setdefault("services", ['ruleCleaner']) self.msConfig.setdefault("rucioWmaAccount", "wma_test") self.msConfig.setdefault("rucioMStrAccount", "wmcore_transferor") - self.msConfig.setdefault("QueueURL", "http://localhost:5984/workqueue") self.msConfig.setdefault('enableRealMode', False) self.msConfig.setdefault('archiveDelayHours', 24 * 2) self.msConfig.setdefault('archiveAlarmHours', 24 * 30) @@ -137,19 +135,12 @@ def __init__(self, msConfig, logger=None): Functor(self.setArchivalDelayExpired), Functor(self.setLogDBClean), Functor(self.archive)]) - - pName = 'plineMSTrBlockGlobalQueue' - self.plineMSTrBlockGlobalQueue = Pipeline(name=pName, - funcLine=[Functor(self.setPlineMarker, pName), - Functor(self.getGlobalWorkQueueRucioRules,self.msConfig['QueueURL'],self.msConfig['rucioWmaAccount']), #use wma_test for now. 
Should be changed later to self.msConfig['rucioMStrAccount'] - Functor(self.cleanRucioRules)]) - + # Building the different set of plines we will need later: # NOTE: The following are all the functional pipelines which are supposed to include # a cleanup function and report cleanup status in the MSRuleCleanerWflow object self.cleanuplines = [self.plineMSTrCont, self.plineMSTrBlock, - #self.plineMSTrBlockGlobalQueue, self.plineAgentCont, self.plineAgentBlock] # Building an auxiliary list of cleanup pipeline names only: @@ -160,7 +151,6 @@ def __init__(self, msConfig, logger=None): self.plineAgentBlock] self.mstrlines = [self.plineMSTrCont, self.plineMSTrBlock] - #self.plineMSTrBlockGlobalQueue] # Initialization of the 'cleaned' and 'archived' counters: self.wfCounters = {'cleaned': {}, @@ -756,65 +746,6 @@ def getRucioRules(self, wflow, gran, rucioAcct): msg = "Container: %s not found in Rucio for workflow: %s." self.logger.info(msg, dataCont, wflow['RequestName']) return wflow - - def getGlobalWorkQueueRucioRules(self,wflow,QueueURL,rucioAcct): - - """ - Queries globle queue and builds the list of blocklevel rules of finished elements for the given workflow - :param wflow: A MSRuleCleaner workflow representation - :param QueueURL: The URL of the Global Work Queue - :rucioAcct: The Rucio account to use for querying rules - :return: The workflow object - """ - - currPline = wflow['PlineMarkers'][-1] - workflowName = wflow['RequestName'] - - #get work queue elements using work queue API. 
Not use since the microservice can be run on machines that can not access backend directly - #globalQueueElements = self.globalQueue.backend.getElementsForWorkflow(workflowName) - - wqService = WorkQueueDS(QueueURL, 'workqueue_t') - globalQueueElements=wqService.getWQElementsByWorkflow(workflowName) - - # Check and process the retrieved elements - if globalQueueElements: - print(f"Found {len(globalQueueElements)} elements for workflow: {workflowName}") - for element in globalQueueElements: - - percentComplete = element.get('PercentComplete', 0) # Default to 0 if key is missing - percentSuccess = element.get('PercentSuccess', 0) # Default to 0 if key is missing - - if percentComplete == 100 and percentSuccess == 100: - - #'Inputs': {'/MinimumBias/ComissioningHI-v1/RAW#372d624c-089d-11e1-8347-003048caaace': - blocks = element.get('Inputs') # Example key for dataset - - # Fetch rules for blocks - if blocks: - for block in blocks: - print("Adding block ", block, " to RulesToClean") - dataCont = block.split('#')[0] # Extract the container name from the block - - if dataCont in self.globalLocks: - msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " - msg += "RulesToClean list for both container and block level Rules for workflow: %s!" - self.logger.info(msg, dataCont, wflow['RequestName']) - continue - try: - for rule in self.rucio.listDataRules(block, account=rucioAcct): - wflow['RulesToClean'][currPline].append(rule['id']) - msg = "Found %s block-level rule to be deleted for container %s" - self.logger.info(msg, rule['id'], dataCont) - except WMRucioDIDNotFoundException: - msg = "Block: %s not found in Rucio for workflow: %s." 
- self.logger.info(msg, block, wflow['RequestName']) - - - else: - print(f"No elements found for workflow: {workflowName}") - - return wflow - def cleanRucioRules(self, wflow): """ diff --git a/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py b/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py index c9f2562bbd..5cfa4551c8 100644 --- a/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py +++ b/test/python/WMCore_t/MicroService_t/MSRuleCleaner_t/MSRuleCleaner_t.py @@ -15,17 +15,7 @@ from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner, MSRuleCleanerArchivalSkip from WMCore.MicroService.MSRuleCleaner.MSRuleCleanerWflow import MSRuleCleanerWflow from WMCore.Services.Rucio import Rucio -from rucio.common.exception import RuleNotFound -from WMQuality.Emulators.EmulatedUnitTestCase import EmulatedUnitTestCase - -from WMQuality.TestInitCouchApp import TestInitCouchApp -from WMQuality.Emulators.WMSpecGenerator.WMSpecGenerator import WMSpecGenerator -from WMCore.WorkQueue.WorkQueue import globalQueue -from WMCore.Services.WorkQueue.WorkQueue import WorkQueue as WorkQueueDS - - -from WMCore.WMSpec.StdSpecs.ReReco import ReRecoWorkloadFactory def getTestFile(partialPath): """ @@ -35,8 +25,8 @@ def getTestFile(partialPath): return os.path.join(normPath, partialPath) -class MSRuleCleanerTest(EmulatedUnitTestCase): -#class MSRuleCleanerTest(unittest.TestCase): +# class MSRuleCleanerTest(EmulatedUnitTestCase): +class MSRuleCleanerTest(unittest.TestCase): "Unit test for MSruleCleaner module" def setUp(self): @@ -73,39 +63,12 @@ def setUp(self): "creds": self.creds} self.reqStatus = ['announced', 'aborted-completed', 'rejected'] - - self.specGenerator = WMSpecGenerator("WMSpecs") - self.schema = [] - self.couchApps = ["WorkQueue"] - self.testInit = TestInitCouchApp('WorkQueueServiceTest') - self.testInit.setLogging() - self.testInit.setDatabaseConnection() - 
self.testInit.setSchema(customModules=self.schema, - useDefault=False) - self.testInit.setupCouch('workqueue_t', *self.couchApps) - self.testInit.setupCouch('workqueue_t_inbox', *self.couchApps) - self.testInit.setupCouch('local_workqueue_t', *self.couchApps) - self.testInit.setupCouch('local_workqueue_t_inbox', *self.couchApps) - self.testInit.generateWorkDir() - - self.msConfig.update({'QueueURL':self.testInit.couchUrl}) - print("msConfig: ", json.dumps(self.msConfig, indent=2)) - - self.msRuleCleaner = MSRuleCleaner(self.msConfig) self.msRuleCleaner.resetCounters() self.msRuleCleaner.rucio = Rucio.Rucio(self.msConfig['rucioAccount'], hostUrl=self.rucioConfigDict['rucio_host'], authUrl=self.rucioConfigDict['auth_host'], configDict=self.rucioConfigDict) - - - - self.queueParams = {} - self.queueParams['log_reporter'] = "Services_WorkQueue_Unittest" - self.queueParams['rucioAccount'] = "wma_test" - self.queueParams['rucioAuthUrl'] = "http://cms-rucio-int.cern.ch" - self.queueParams['rucioUrl'] = "https://cms-rucio-auth-int.cern.ch" self.taskChainFile = getTestFile('data/ReqMgr/requests/Static/TaskChainRequestDump.json') self.stepChainFile = getTestFile('data/ReqMgr/requests/Static/StepChainRequestDump.json') @@ -218,7 +181,6 @@ def testPipelineAgentCont(self): def testPipelineMSTrBlock(self): # Test plineAgentCont wflow = MSRuleCleanerWflow(self.taskChainReq) - print(wflow) self.msRuleCleaner.plineMSTrBlock.run(wflow) expectedWflow = {'CleanupStatus': {'plineMSTrBlock': True}, 'ForceArchive': False, @@ -260,98 +222,7 @@ def testPipelineMSTrBlock(self): 'TransferTape': False, 'TapeRulesStatus': [], 'StatusAdvanceExpiredMsg': ""} - self.assertDictEqual(wflow, expectedWflow) - - def testPipelineMSTrBlockGlobalQueue(self): - - #turn of tests of pipelineMSTrBlockGlobalQueue - self.skipTest("Skipping testPipelineMSTrBlockGlobalQueue. This is placeholder for future test of pipelineMSTrBlockGlobalQueue if needed") - - #Get workflow description. 
ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, - #so the workflow description here and the one used in creating workqueue is the same - specName = "RerecoSpec" - inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} - workflowDescription = ReRecoWorkloadFactory.getTestArguments() - workflowDescription['RequestName'] = specName - workflowDescription['InputDataset'] = inputdataset["InputDataset"] - - wflow = MSRuleCleanerWflow(workflowDescription) - - #Create ReRecoSpec as stored in GlobalQueue - specUrl = self.specGenerator.createReRecoSpec(specName, "file", - assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) - #Make GlobalQueue - globalQ = globalQueue(DbName='workqueue_t', - QueueURL=self.testInit.couchUrl, - UnittestFlag=True, **self.queueParams) - globalQ.queueWork(specUrl, specName, "teamA") - - #Let try to modify the element in GlobalQueue to have PercentComplete and PercentSuccess set to 100 - wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') - #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) - - print("Elements in GlobalQueue:") - elements = data.get('rows', []) - print(json.dumps(elements, indent=2)) - - #let update the PercentComplete and PercentSuccess of the first elements - element_id = [elements[0]['id']] # Get the first element's ID - print("Updating element:", element_id) - wqService.updateElements(*element_id, PercentComplete=100, PercentSuccess=100) - # Re-fetch the elements to see the update - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) - elements = data.get('rows', []) - #elements=wqService.getWQElementsByWorkflow(specName) - print("Updated Elements in 
GlobalQueue:") - for e in elements: - print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) - #print(e["id"], e['Status'], e["PercentComplete"], e["PercentSuccess"]) - - #now let try to create Rucio rule for the block - #create a rule and inject it in wma_test account - blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element - rule_id = self.msRuleCleaner.rucio.createReplicationRule( - names=blockNames[0], - rseExpression="T2_US_Nebraska", - copies=1, - grouping="DATASET", - lifetime=360, - account="wma_test", - ask_approval=False, - activity="Production Input", - comment="WMCore test block rule creation" - ) - - print("Created Rucio rule with ID:", rule_id) - rule_info = self.msRuleCleaner.rucio.getRule(rule_id[0]) - print(rule_info) - - self.msRuleCleaner.plineMSTrBlockGlobalQueue.run(wflow) - print("Workflow after plineMSTrBlockGlobalQueue:") - print(json.dumps(wflow, indent=2)) - - #now make sure the rule is cleaned - try: - rule_info = self.msRuleCleaner.rucio.getRule(rule_id[0]) - #print("Rule exists:", json.dumps(rule_info, indent=2)) - #now delete it - self.msRuleCleaner.rucio.deleteRule(rule_id[0]) - print("Deleted Rucio rule with ID:", rule_id) - except RuleNotFound: - print("Rule not found.") - except Exception as e: - print("Error checking rule:", e) - - print("Cleanup status: ", wflow['CleanupStatus']['plineMSTrBlockGlobalQueue']) - print("Rules to clean: ", wflow['RulesToClean']['plineMSTrBlockGlobalQueue'], rule_id) - assert((wflow['CleanupStatus']['plineMSTrBlockGlobalQueue'] is True)) - assert((wflow['RulesToClean']['plineMSTrBlockGlobalQueue'] == rule_id)) def testPipelineMSTrCont(self): # Test plineAgentCont From ec48cd7f677619fbd51d2e762b36d656ea80dfd7 Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 19 Nov 2025 00:00:09 -0500 Subject: [PATCH 03/14] turn-off msRuleCleaner --- .../CherryPyThreads/InputDataRucioRuleCleaner.py | 11 +++++++++-- 1 file changed, 9 
insertions(+), 2 deletions(-) diff --git a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py index a417de8c93..380b81880a 100644 --- a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -77,8 +77,15 @@ def cleanRucioRules(self, config): self.logger.info(msg, block, requestName) continue - self.logger.info("%s executed in %.3f secs.", self.__class__.__name__, time() - tStart) - return self.msRuleCleaner.cleanRucioRules(rulesToClean) + self.logger.info("%s executed in %.3f secs. Found %d global queue elements.", self.__class__.__name__, time() - tStart, len(globalQueueElements)) + tmp = rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]] + ids = '' + for rid in tmp: + ids += rid + ', ' + rulesToClean['CleanupStatus']['Current'].append({'RuleID': rid, 'Status': 'Pending'}) + self.logger.info('Rules to be cleaned: %s', ids) + return rulesToClean + #return self.msRuleCleaner.cleanRucioRules(rulesToClean) else: print("No elements with status DONE found in GlobalQueue") From 1dc341c10bef97a3b98fb33d140bac895aeda496 Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 19 Nov 2025 12:59:01 -0500 Subject: [PATCH 04/14] retrigger checks From 6f0358edbb083dc7d54c7653b0eee4576d256da3 Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 19 Nov 2025 12:59:10 -0500 Subject: [PATCH 05/14] retrigger checks From 09acf2f4fe1717e591b7ed115c9a802b4044a9cd Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 10 Dec 2025 15:25:32 -0500 Subject: [PATCH 06/14] fix msRuleCleaner initialization, add time tracking --- .../InputDataRucioRuleCleaner.py | 54 +++++++++++-------- src/python/WMQuality/TestInitCouchApp.py | 7 ++- .../InputDataRucioRuleCleaner_t.py | 29 +++++----- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git 
a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py index 380b81880a..75ad76c33b 100644 --- a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -1,19 +1,25 @@ from __future__ import (division, print_function) -from time import time +import time from WMCore.REST.CherryPyPeriodicTask import CherryPyPeriodicTask from WMCore.WorkQueue.WorkQueue import globalQueue from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner from WMCore.Services.Rucio.Rucio import WMRucioDIDNotFoundException +def format_timestamp(timestamp_float): + """Converts a float timestamp (seconds since epoch) to a readable string.""" + # This format gives you: "2025-12-09 19:22:15" + return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp_float)) + + class InputDataRucioRuleCleaner(CherryPyPeriodicTask): def __init__(self, rest, config): super(InputDataRucioRuleCleaner, self).__init__(config) self.globalQ = globalQueue(logger=self.logger, **config.queueParams) - self.msRuleCleaner = MSRuleCleaner(config, logger=self.logger) # Initialize MSRuleCleaner + self.msRuleCleaner = MSRuleCleaner(config.msRuleCleaner, logger=self.logger) # Initialize MSRuleCleaner def setConcurrentTasks(self, config): """ @@ -28,7 +34,7 @@ def cleanRucioRules(self, config): :return: The result of MSRuleCleaner cleanRucioRules(self, wflow) method, which is True if all rules were deleted successfully, False otherwise. 
""" - tStart = time() + tStart = time.time() #statuses = ['Available', 'Done', 'Acquired', 'Failed', 'Canceled'] #globalQueueElements=self.globalQ.getWork({'Status':'Done'},siteJobCounts={}) @@ -42,13 +48,14 @@ def cleanRucioRules(self, config): if globalQueueElements: #print(f"Found {len(globalQueueElements)} elements in GlobalQueue") + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time}: Found {len(globalQueueElements)} globalqueue elements.") for element in globalQueueElements: requestName = element.get('RequestName') # Extract the RequestName field percentComplete = element.get('PercentComplete', 0) # Default to 0 if key is missing percentSuccess = element.get('PercentSuccess', 0) # Default to 0 if key is missing - if percentComplete == 100 and percentSuccess == 100: #'Inputs': {'/MinimumBias/ComissioningHI-v1/RAW#372d624c-089d-11e1-8347-003048caaace': @@ -57,7 +64,7 @@ def cleanRucioRules(self, config): # Fetch rules for blocks if blocks: for block in blocks: - print("Adding block ", block, " to RulesToClean") + #print("Adding block ", block, " to RulesToClean") dataCont = block.split('#')[0] # Extract the container name from the block if dataCont in self.msRuleCleaner.globalLocks: @@ -66,10 +73,12 @@ def cleanRucioRules(self, config): self.logger.info(msg, dataCont, requestName) continue try: - print('Fetching rules for block:', block, "\n", config.rucioAccount, "\n", self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount)) - for rule in self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount): - msg = "Found %s block-level rule to be deleted for container %s" - self.logger.info(msg, rule['id'], dataCont) + #print('Fetching rules for block:', block, "\n", config.rucioAccount, "\n", self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount)) + for rule in self.msRuleCleaner.rucio.listDataRules(block, account=config.msRuleCleaner['rucioAccount']): + #msg = "Found %s block-level 
rule to be deleted for container %s" + #self.logger.info(msg, rule['id'], dataCont) + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time}: Rule {rule['id']} {block} {rule['bytes']} {requestName} to be cleaned") #cleanRules of MSRuleCleaner expects a list of rule ids and always clean the last one in the list of PlineMarkers rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]].append(rule['id']) except WMRucioDIDNotFoundException: @@ -77,18 +86,21 @@ def cleanRucioRules(self, config): self.logger.info(msg, block, requestName) continue - self.logger.info("%s executed in %.3f secs. Found %d global queue elements.", self.__class__.__name__, time() - tStart, len(globalQueueElements)) - tmp = rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]] - ids = '' - for rid in tmp: - ids += rid + ', ' - rulesToClean['CleanupStatus']['Current'].append({'RuleID': rid, 'Status': 'Pending'}) - self.logger.info('Rules to be cleaned: %s', ids) - return rulesToClean - #return self.msRuleCleaner.cleanRucioRules(rulesToClean) + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time}: {self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") + #tmp = rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]] + #ids = '' + #for rid in tmp: + # ids += rid + ', ' + # rulesToClean['CleanupStatus']['Current'].append({'RuleID': rid, 'Status': 'Pending'}) + #self.logger.info('Rules to be cleaned: %s', ids) + #return rulesToClean + return self.msRuleCleaner.cleanRucioRules(rulesToClean) else: - print("No elements with status DONE found in GlobalQueue") - - self.logger.info("%s executed in %.3f secs.", self.__class__.__name__, time() - tStart) + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time}: No elements with status DONE found in GlobalQueue") + + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time}: {self.__class__.__name__} executed in 
{(time.time() - tStart):.3f} secs.") return \ No newline at end of file diff --git a/src/python/WMQuality/TestInitCouchApp.py b/src/python/WMQuality/TestInitCouchApp.py index 8a116b86c8..05fdb7b352 100644 --- a/src/python/WMQuality/TestInitCouchApp.py +++ b/src/python/WMQuality/TestInitCouchApp.py @@ -25,6 +25,7 @@ from WMQuality.TestInit import TestInit +from WMCore.WMBase import getWMBASE class CouchAppTestHarness(object): """ @@ -111,8 +112,10 @@ def couchAppRoot(self, couchapp): :param couchapp: couch application to use """ - wmcoreroot = os.path.normpath(os.path.join(self.init.getWMBASE(), '..', '..', '..')) - develPath = os.path.join(self.init.getWMBASE(), "src", "couchapps") + #wmcoreroot = os.path.normpath(os.path.join(self.init.getWMBASE(), '..', '..', '..')) + #develPath = os.path.join(self.init.getWMBASE(), "src", "couchapps") + wmcoreroot = os.path.normpath(os.path.join(getWMBASE(), '..', '..', '..')) + develPath = os.path.join(getWMBASE(), "src", "couchapps") if os.path.exists(os.path.join(develPath, couchapp)): return develPath elif os.path.exists(os.path.join(wmcoreroot, 'xdata', 'couchapps', couchapp)): diff --git a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py index 654e9c588c..b21a3b2dc2 100644 --- a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py +++ b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py @@ -37,7 +37,8 @@ def __getattr__(self, key): class InputDataRucioRuleCleanerTest(EmulatedUnitTestCase): def setUp(self): - self.msConfig = {"verbose": True, + self.config = {} + self.msRuleCleaner = {"verbose": True, "interval": 1 * 60, "services": ['ruleCleaner'], "rucioAccount": 'wma_test', @@ -57,10 +58,10 @@ def setUp(self): self.creds = {"client_cert": os.getenv("X509_USER_CERT", "Unknown"), "client_key": os.getenv("X509_USER_KEY", "Unknown")} - self.rucioConfigDict = {"rucio_host": 
self.msConfig['rucioUrl'], - "auth_host": self.msConfig['rucioAuthUrl'], + self.rucioConfigDict = {"rucio_host": self.msRuleCleaner['rucioUrl'], + "auth_host": self.msRuleCleaner['rucioAuthUrl'], "auth_type": "x509", - "account": self.msConfig['rucioAccount'], + "account": self.msRuleCleaner['rucioAccount'], "ca_cert": False, "timeout": 30, "request_retries": 3, @@ -81,11 +82,11 @@ def setUp(self): self.testInit.setupCouch('local_workqueue_t_inbox', *self.couchApps) self.testInit.generateWorkDir() - self.msConfig.update({'QueueURL':self.testInit.couchUrl}) + self.msRuleCleaner.update({'QueueURL':self.testInit.couchUrl}) self.queueParams = {} self.queueParams['log_reporter'] = "Services_WorkQueue_Unittest" - self.queueParams['rucioAccount'] = self.msConfig['rucioAccount'] + self.queueParams['rucioAccount'] = self.msRuleCleaner['rucioAccount'] self.queueParams['rucioAuthUrl'] = "http://cms-rucio-int.cern.ch" self.queueParams['rucioUrl'] = "https://cms-rucio-auth-int.cern.ch" self.queueParams['_internal_name'] = 'GlobalWorkQueueTest' @@ -96,12 +97,14 @@ def setUp(self): print("X509_USER_KEY:", os.getenv("X509_USER_KEY")) # Create config object with attributes - self.config_obj = DictWithAttrs(self.msConfig) + self.config_obj = DictWithAttrs(self.config) #additional attributes needed by cherrypy periodic task self.config_obj._internal_name = "GlobalWorkQueueTest" self.config_obj.log_file = "test.log" #additional attributes needed by global workqueue self.config_obj.queueParams = self.queueParams + #additional attributes needed by MSRuleCleaner + self.config_obj.msRuleCleaner = self.msRuleCleaner #duration for the periodic task in seconds self.config_obj.cleanInputDataRucioRuleDuration = 10 @@ -131,9 +134,9 @@ def testInputDataRucioRuleCleaner(self): cleaner.globalQ = globalQ #Make MSRuleCleaner - msRuleCleaner = MSRuleCleaner(self.config_obj,logger=cleaner.logger) + msRuleCleaner = MSRuleCleaner(self.config_obj.msRuleCleaner,logger=cleaner.logger) 
msRuleCleaner.resetCounters() - msRuleCleaner.rucio = Rucio.Rucio(self.msConfig['rucioAccount'], + msRuleCleaner.rucio = Rucio.Rucio(self.msRuleCleaner['rucioAccount'], hostUrl=self.rucioConfigDict['rucio_host'], authUrl=self.rucioConfigDict['auth_host'], configDict=self.rucioConfigDict) @@ -200,7 +203,7 @@ def testInputDataRucioRuleCleaner(self): start_time = time.time() while rule_info and not delResult and timeleft < 300: #now delete it - print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msConfig['rucioAccount'])) + print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msRuleCleaner['rucioAccount'])) delResult = cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) print("Deleted Rucio rule with ID:", rule_id, delResult) if delResult: break @@ -238,9 +241,9 @@ def testInputDataRucioRuleCleanerWithThreading(self): cleaner.globalQ = globalQ #Make MSRuleCleaner - msRuleCleaner = MSRuleCleaner(self.config_obj,logger=cleaner.logger) + msRuleCleaner = MSRuleCleaner(self.config_obj.msRuleCleaner,logger=cleaner.logger) msRuleCleaner.resetCounters() - msRuleCleaner.rucio = Rucio.Rucio(self.msConfig['rucioAccount'], + msRuleCleaner.rucio = Rucio.Rucio(self.msRuleCleaner['rucioAccount'], hostUrl=self.rucioConfigDict['rucio_host'], authUrl=self.rucioConfigDict['auth_host'], configDict=self.rucioConfigDict) @@ -326,7 +329,7 @@ def testInputDataRucioRuleCleanerWithThreading(self): start_time = time.time() while rule_info and not delResult and timeleft < 300: #now delete it - print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msConfig['rucioAccount'])) + print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msRuleCleaner['rucioAccount'])) delResult = 
cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) print("Deleted Rucio rule with ID:", rule_id, delResult) if delResult: break From eec5d69d8835ef041f9b38a7a08fcad428f9f60e Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 12 Jan 2026 09:23:43 -0500 Subject: [PATCH 07/14] used for Dec. 18, 2025 test workflow --- .../InputDataRucioRuleCleaner.py | 66 ++++++++++++++----- .../InputDataRucioRuleCleaner_t.py | 2 +- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py index 75ad76c33b..ade391e608 100644 --- a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -42,14 +42,14 @@ def cleanRucioRules(self, config): #print("Elements in GlobalQueue cleanRucioRules:") #print(json.dumps(globalQueueElements,indent=2)) - - #to be able to use cleanRules method of MSRuleCleaner - rulesToClean = {'PlineMarkers':['Current'], 'RulesToClean': {'Current': []}, 'CleanupStatus': {'Current': []}} + + do_cleaning = False if globalQueueElements: #print(f"Found {len(globalQueueElements)} elements in GlobalQueue") current_time = format_timestamp(time.time()) self.logger.info(f"{current_time}: Found {len(globalQueueElements)} globalqueue elements.") + for element in globalQueueElements: requestName = element.get('RequestName') # Extract the RequestName field @@ -57,11 +57,15 @@ def cleanRucioRules(self, config): percentSuccess = element.get('PercentSuccess', 0) # Default to 0 if key is missing if percentComplete == 100 and percentSuccess == 100: - + + #to be able to use cleanRules method of MSRuleCleaner + rulesToClean = {'PlineMarkers':['Current'], 'RulesToClean': {'Current': []}, 'CleanupStatus': {'Current': []}} + #'Inputs': {'/MinimumBias/ComissioningHI-v1/RAW#372d624c-089d-11e1-8347-003048caaace': blocks = 
element.get('Inputs') # Example key for dataset # Fetch rules for blocks + cleanedRules_info = {} if blocks: for block in blocks: #print("Adding block ", block, " to RulesToClean") @@ -74,20 +78,48 @@ def cleanRucioRules(self, config): continue try: #print('Fetching rules for block:', block, "\n", config.rucioAccount, "\n", self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount)) - for rule in self.msRuleCleaner.rucio.listDataRules(block, account=config.msRuleCleaner['rucioAccount']): - #msg = "Found %s block-level rule to be deleted for container %s" - #self.logger.info(msg, rule['id'], dataCont) - current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time}: Rule {rule['id']} {block} {rule['bytes']} {requestName} to be cleaned") - #cleanRules of MSRuleCleaner expects a list of rule ids and always clean the last one in the list of PlineMarkers - rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]].append(rule['id']) + rules = self.msRuleCleaner.rucio.listDataRules(block, account=config.msRuleCleaner['rucioAccount']) + #found rules for this block. 
If the rules of this block already cleaned there is no rules found + if rules: + cleanedRules_info[block] = {} + #one block can have multiple rules + cleanedRules_info[block]['id'] = [] + cleanedRules_info[block]['bytes'] = [] + for rule in rules: + #msg = "Found %s block-level rule to be deleted for container %s" + #self.logger.info(msg, rule['id'], dataCont) + #current_time = format_timestamp(time.time()) + #self.logger.info(f"{current_time}: Rule {rule['id']} {block} {rule['bytes']} {requestName} to be cleaned") + cleanedRules_info[block]['id'].append(rule['id']) + cleanedRules_info[block]['bytes'].append(rule['bytes']) + #cleanRules of MSRuleCleaner expects a list of rule ids and always clean the last one in the list of PlineMarkers + rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]].append(rule['id']) except WMRucioDIDNotFoundException: msg = "Block: %s not found in Rucio for workflow: %s." self.logger.info(msg, block, requestName) continue + + if cleanedRules_info: + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time}: Start cleaning rules for completed element {element.id}") + + do_cleaning = True + + for block, info in cleanedRules_info.items(): + for rule_id, size in zip(info["id"], info["bytes"]): + self.logger.info(f"{current_time} Rule to clean: {rule_id} {block} {size} {requestName}") + + self.msRuleCleaner.cleanRucioRules(rulesToClean) + + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time}: End cleaning rules for completed element {element.id}") - current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time}: {self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") + if not do_cleaning: + current_time = format_timestamp(time.time()) + self.logger.info(f"{current_time} No cleaning happened: There are no completed workqueue elements or rules already cleaned") + + #current_time = format_timestamp(time.time()) + #self.logger.info(f"{current_time}: 
{self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") #tmp = rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]] #ids = '' #for rid in tmp: @@ -95,12 +127,12 @@ def cleanRucioRules(self, config): # rulesToClean['CleanupStatus']['Current'].append({'RuleID': rid, 'Status': 'Pending'}) #self.logger.info('Rules to be cleaned: %s', ids) #return rulesToClean - return self.msRuleCleaner.cleanRucioRules(rulesToClean) + #return self.msRuleCleaner.cleanRucioRules(rulesToClean) else: current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time}: No elements with status DONE found in GlobalQueue") + self.logger.info(f"{current_time} No elements found in GlobalQueue") current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time}: {self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") - return \ No newline at end of file + self.logger.info(f"{current_time} {self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") + return do_cleaning \ No newline at end of file diff --git a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py index b21a3b2dc2..cc5ccc4d05 100644 --- a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py +++ b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py @@ -213,7 +213,7 @@ def testInputDataRucioRuleCleaner(self): if not delResult and timeleft >= 300: print("Failed to delete the rule after 5 minutes, exiting...") - self.assertTrue(results['CleanupStatus']['Current']) + self.assertTrue(results) def testInputDataRucioRuleCleanerWithThreading(self): From 664d498c862ab0518648aa9bcc5ed7b6c4992a32 Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 28 Jan 2026 16:57:51 -0500 Subject: [PATCH 08/14] two workflows using the same input data --- .../InputDataRucioRuleCleaner.py | 219 +++++++++++++++- .../InputDataRucioRuleCleaner_t.py | 243 
+++++++++++++++++- 2 files changed, 457 insertions(+), 5 deletions(-) diff --git a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py index ade391e608..15e4898525 100644 --- a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -1,17 +1,110 @@ from __future__ import (division, print_function) import time +#import json from WMCore.REST.CherryPyPeriodicTask import CherryPyPeriodicTask from WMCore.WorkQueue.WorkQueue import globalQueue from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner from WMCore.Services.Rucio.Rucio import WMRucioDIDNotFoundException +from WMCore.ReqMgr.Web.ReqMgrService import getdata +#from WMCore.Services.pycurl_manager import RequestHandler +#from Utils.CertTools import ckey, cert + def format_timestamp(timestamp_float): """Converts a float timestamp (seconds since epoch) to a readable string.""" # This format gives you: "2025-12-09 19:22:15" return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp_float)) +''' +def canDeleteRucioRule(self, currentRequestName, block, dataCont, config): + """ + Check if the Rucio rule for the given block can be deleted. 
+ :param currentRequest: The name of the current request being processed + :param block: The data block to check + :param dataCont: The container name extracted from the block + :param config: The configuration object + :return: True if the rule can be deleted, False otherwise + """ + try: + # Step 1: Find the requests that use the same input data + #url = 'https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s' % dataset + #params = {} + #headers = {'Accept': 'application/json'} + #https://gitlab.cern.ch/cmsweb-k8s/services_config/-/blob/test/config.test16/workqueue/config.py?ref_type=heads + #config.msRuleCleaner['reqmgr2Url'] = "%s/reqmgr2" % BASE_URL + #url = config.msRuleCleaner['reqmgr2Url']+'/data/request?inputdataset=%s' % dataCont + url = f"{config.msRuleCleaner['reqmgr2Url']}/data/request?inputdataset={dataCont}" + params = {} + headers = {"Accept": "application/json"} + + # Use the getdata function to fetch the requests using the current input data + response = getdata(url, params, headers) + + if not response or "result" not in response: + self.logger.warning(f"Failed to fetch requests using dataset {dataCont}. Response: {response}") + return True # Assume no requests if the response is invalid + + requestsUsingData = response["result"] + + # Step 1a: If there are no requests, return True. No need since this always returns at least the current request + #if not requestsUsingData: + # self.logger.info(f"No requests are using dataset {dataCont}. Rule for block {block} can be deleted.") + # return True + + #self.logger.info(f"Dataset {dataCont} is in use by the following requests: {requestsUsingData}") + + # Step 2: Find the workqueue elements of those requests + foundElements = False # Track if any workqueue elements are found + for request in requestsUsingData: + # Skip the current request + if request['RequestName'] == currentRequestName: + continue + + # if the workflow is done etc return True. 
We can delete the rule + + try: + # Query the global queue for elements of the other request + otherRequestElements = self.globalQ.backend.getElements(WorkflowName=request['RequestName']) + + # Step 3: If there are no workqueue elements for this request, continue to the next request + if not otherRequestElements: + self.logger.info(f"No workqueue elements found for request {request}. The workqueue might not have been created yet.") + continue + + foundElements = True # At least one element is found at other request + + # Step 4: Check the status of these workqueue element that uses the same datablock + for otherElement in otherRequestElements: + + if block not in otherElement.get('Inputs'): continue + + percentComplete = otherElement.get('PercentComplete', 0) # Default to 0 if key is missing + percentSuccess = otherElement.get('PercentSuccess', 0) # Default to 0 if key is missing + + # Step 5: If any workqueue element is not completed, return False + if percentComplete < 100 or percentSuccess < 100: + self.logger.info(f"Workqueue element {otherElement.get('id')} for request {request['RequestName']} is not yet completed. Rule for block {block} cannot be deleted.") + return False + + except Exception as ex: + self.logger.error(f"Error while finding elements for request {request}: {str(ex)}") + return False + + # Step 3 (fixed): If no elements were found for all requests, return False + if not foundElements: + self.logger.info(f"No workqueue elements found for any of the requests using dataset {dataCont}. Rule for block {block} cannot be deleted.") + return False + + # Step 4: If all workqueue elements are processed, return True + self.logger.info(f"All workqueue elements for requests using dataset {dataCont} are completed. 
Rule for block {block} can be deleted.") + return True + + except Exception as ex: + self.logger.error(f"Error while checking if rule for block {block} can be deleted: {str(ex)}") + return False +''' class InputDataRucioRuleCleaner(CherryPyPeriodicTask): @@ -20,6 +113,7 @@ def __init__(self, rest, config): super(InputDataRucioRuleCleaner, self).__init__(config) self.globalQ = globalQueue(logger=self.logger, **config.queueParams) self.msRuleCleaner = MSRuleCleaner(config.msRuleCleaner, logger=self.logger) # Initialize MSRuleCleaner + #self.curlMgr = RequestHandler() def setConcurrentTasks(self, config): """ @@ -27,6 +121,93 @@ def setConcurrentTasks(self, config): """ self.concurrentTasks = [{'func': self.cleanRucioRules, 'duration': config.cleanInputDataRucioRuleDuration}] + def getRequestForInputDataset(self, inputdataset, reqmgr2Url): + # Step 1: Find the requests that use the same input data + #url = f"{config.msRuleCleaner['reqmgr2Url']}/data/request?inputdataset={inputdataset}" + url = f"{reqmgr2Url}/data/request?inputdataset={inputdataset}" + params = {} + headers = {"Accept": "application/json"} + res = None + try: + #res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) + #res = json.loads(res) + res = getdata(url, params, headers) + except Exception as ex: + msg = "General exception while fetching requests from ReqMgr2 for inputdataset %s" + self.logger.exception(msg, inputdataset, str(ex)) + + # Use the getdata function to fetch the requests using the current input data + return res + + def canDeleteRucioRule(self, currentRequestName, block, dataCont, config): + """ + Check if the Rucio rule for the given block can be deleted. 
+ :param currentRequest: The name of the current request being processed + :param block: The data block to check + :param dataCont: The container name extracted from the block + :param config: The configuration object + :return: True if the rule can be deleted, False otherwise + """ + try: + # Step 1: Find the requests that use the same input data + #url = f"{config.msRuleCleaner['reqmgr2Url']}/data/request?inputdataset={dataCont}" + #params = {} + #headers = {"Accept": "application/json"} + + # Use the getdata function to fetch the requests using the current input data + #response = getdata(url, params, headers) + response = self.getRequestForInputDataset(dataCont, config.msRuleCleaner['reqmgr2Url']) + + if not response or "result" not in response: + self.logger.warning(f"Failed to fetch requests using dataset {dataCont}. Response: {response}") + return False # We do not know what is going on, better not delete the rule + + self.logger.info(f"Response: {response}") + + requestsUsingData = response["result"][0] + + self.logger.info(f"Requests: {requestsUsingData}") + + for request_id,request_data in requestsUsingData.items(): + # Skip the current request + if request_data['RequestName'] == currentRequestName: + continue + + # only consider workflows in good status and not done yet + if request_data['RequestStatus'] not in ['new', 'assignment-approved', 'assigned', 'staging', 'acquired', 'staged', 'running-open', 'running-closed']: + self.logger.info(f"Request {request_data['RequestName']} is in status {request_data['RequestStatus']}. Continuing to next request.") + continue + + try: + # Step 2: Query the global queue for elements of the other request + otherRequestElements = self.globalQ.backend.getElements(WorkflowName=request_data['RequestName']) + + if not otherRequestElements: + self.logger.info(f"No workqueue elements found for request {request_id}: {request_data}. 
The workqueue might not have been created yet.") + return False # We do not know what is going on, better not delete the rule + + # Step 3: Check the status of these workqueue element that uses the same datablock + for otherElement in otherRequestElements: + + if block not in otherElement.get('Inputs'): continue + + percentComplete = otherElement.get('PercentComplete', 0) # Default to 0 if key is missing + percentSuccess = otherElement.get('PercentSuccess', 0) # Default to 0 if key is missing + + if percentComplete < 100 or percentSuccess < 100: + self.logger.info(f"Rule for block {block} cannot be deleted. Workqueue elements of request {request_data['RequestName']} using the same block have not completed processing ({percentComplete}, {percentSuccess}).") + return False + + except Exception as ex: + self.logger.error(f"Error while finding elements for request {request_id}: {request_data} and making consideration on data processing completion: {str(ex)}") + return False #We do not know what is going on, better not delete the rule + + return True + + except Exception as ex: + self.logger.error(f"Error while checking if rule for block {block} can be deleted: {str(ex)}") + return False #We do not know what is going on, better not delete the rule + def cleanRucioRules(self, config): """ Queries global queue and builds the list of blocklevel Rucio rules of finished elements to be deleted. Calls MSRuleCleaner cleanRucioRules(self, wflow) to delete the rules. @@ -71,11 +252,41 @@ def cleanRucioRules(self, config): #print("Adding block ", block, " to RulesToClean") dataCont = block.split('#')[0] # Extract the container name from the block - if dataCont in self.msRuleCleaner.globalLocks: - msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " - msg += "RulesToClean list for both container and block level Rules for workflow: %s!" 
- self.logger.info(msg, dataCont, requestName) + # Check if the Rucio rule for this block can be deleted + if not self.canDeleteRucioRule(requestName, block, dataCont, config): + self.logger.info(f"Skipping deletion of rules for block {block} as it is still in use.") continue + + ## Check if the dataset is in use by other requests + #try: + # # Assuming there's an endpoint that provides the requests using the input data + # url = f"{config.reqmgrUrl}/data/request_by_input" + # params = {"input": dataCont} + # headers = {"Accept": "application/json"} + # + # # Use the getdata function to fetch the requests using the current input data + # response = getdata(url, params, headers) + # + # # Process the response to extract request names + # if response and "result" in response: + # requestsUsingData = response["result"] + # if requestsUsingData: + # self.logger.info(f"Dataset {dataCont} is still in use by the following requests: {requestsUsingData}") + # continue # Skip cleaning for this dataset + # else: + # self.logger.info(f"Dataset {dataCont} is not in use by any other requests.") + # else: + # self.logger.warning(f"Failed to fetch requests using dataset {dataCont}. Response: {response}") + #except Exception as ex: + # self.logger.error(f"Error while checking requests using dataset {dataCont}: {str(ex)}") + # continue + + #need to self.getGlobalLocks() before using self.msRuleCleaner.globalLocks + #if dataCont in self.msRuleCleaner.globalLocks: + # msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " + # msg += "RulesToClean list for both container and block level Rules for workflow: %s!" 
+ # self.logger.info(msg, dataCont, requestName) + # continue try: #print('Fetching rules for block:', block, "\n", config.rucioAccount, "\n", self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount)) rules = self.msRuleCleaner.rucio.listDataRules(block, account=config.msRuleCleaner['rucioAccount']) diff --git a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py index cc5ccc4d05..f9b8b99d0b 100644 --- a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py +++ b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py @@ -12,6 +12,7 @@ from WMCore.WorkQueue.WorkQueue import globalQueue from WMCore.Services.WorkQueue.WorkQueue import WorkQueue as WorkQueueDS from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner +from WMCore.ReqMgr.Web.ReqMgrService import getdata import json # system modules @@ -20,6 +21,9 @@ import unittest +from urllib.parse import parse_qs, urlparse +#from unittest.mock import patch +from mock import mock class DummyREST: def __init__(self): @@ -110,7 +114,9 @@ def setUp(self): super(InputDataRucioRuleCleanerTest, self).setUp() - def testInputDataRucioRuleCleaner(self): + + @mock.patch('WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner.InputDataRucioRuleCleaner.getRequestForInputDataset') + def testInputDataRucioRuleCleaner(self, mock_getRequestForInputDataset): """ Test the InputDataRucioRuleCleaner task """ @@ -193,6 +199,30 @@ def testInputDataRucioRuleCleaner(self): #print(e["id"], e['Status'], e["PercentComplete"], e["PercentSuccess"]) + # Define a variable to hold the dynamic RequestName and RequestStatus + self.dynamicRequestName = specName + self.dynamicRequestStatus = "running-open" # Default value + self.ReferenceInputDatasets = ["/JetHT/Run2012C-v1/RAW"] + + def mock_getRequestForInputDataset_side_effect(inputdataset, reqmgr2Url): + if inputdataset in 
self.ReferenceInputDatasets: #only respond to the input data set that is used by a request + # Simulate retrieving the workflow details + return { + "result": [ + {self.dynamicRequestName:{ + "RequestName": self.dynamicRequestName, # Use the dynamic RequestName + "RequestStatus": self.dynamicRequestStatus, # Use the dynamic RequestStatus + } + } + ] + } + else: + #return {"status": "error", "message": "Invalid request: inputdataset not found"} + return {"result": []} + + # Assign the side effect to the mock object + mock_getRequestForInputDataset.side_effect = mock_getRequestForInputDataset_side_effect + results = cleaner.cleanRucioRules(self.config_obj) print("Results from cleanRucioRules:", json.dumps(results, indent=2)) #now make sure the rule is cleaned @@ -213,9 +243,220 @@ def testInputDataRucioRuleCleaner(self): if not delResult and timeleft >= 300: print("Failed to delete the rule after 5 minutes, exiting...") + #self.assertTrue(False) + self.assertTrue(results) + + @mock.patch('WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner.InputDataRucioRuleCleaner.getRequestForInputDataset') + def testInputDataRucioRuleCleanerTwoWorkflowSameInputdata(self, mock_getRequestForInputDataset): + """ + Test the InputDataRucioRuleCleaner task with two workflows using the same input data set + """ + #Get workflow description. 
ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, + #so the workflow description here and the one used in creating workqueue is the same + specName = "RerecoSpec" + inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} + + #Create ReRecoSpec as stored in GlobalQueue + specUrl = self.specGenerator.createReRecoSpec(specName, "file", + assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) + + #Second workflow using the same input data set + specName1 = "RerecoSpec1" + specUrl1 = self.specGenerator.createReRecoSpec(specName1, "file", + assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) + + #cleaner = InputDataRucioRuleCleaner(rest=self.mockRest, config=self.config_obj) + cleaner = InputDataRucioRuleCleaner(rest=DummyREST(), config=self.config_obj) + + #Make GlobalQueue + globalQ = globalQueue(DbName='workqueue_t', + QueueURL=self.testInit.couchUrl, + UnittestFlag=True, logger=cleaner.logger, **self.queueParams) + globalQ.queueWork(specUrl, specName, "teamA") + globalQ.queueWork(specUrl1, specName1, "teamB") + + + cleaner.globalQ = globalQ + + #Make MSRuleCleaner + msRuleCleaner = MSRuleCleaner(self.config_obj.msRuleCleaner,logger=cleaner.logger) + msRuleCleaner.resetCounters() + msRuleCleaner.rucio = Rucio.Rucio(self.msRuleCleaner['rucioAccount'], + hostUrl=self.rucioConfigDict['rucio_host'], + authUrl=self.rucioConfigDict['auth_host'], + configDict=self.rucioConfigDict) + + cleaner.msRuleCleaner = msRuleCleaner + + #Let try to modify the element in GlobalQueue to have PercentComplete and PercentSuccess set to 100 + wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') + #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + + print(f"Elements in GlobalQueue 
{specName}:") + elements = data.get('rows', []) + print(json.dumps(elements, indent=2)) + + data1 = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName1], 'endkey': [specName1, {}], + 'reduce': False}) + + print(f"Elements in GlobalQueue {specName1}:") + elements1 = data1.get('rows', []) + print(json.dumps(elements1, indent=2)) + + #let update the PercentComplete and PercentSuccess and Status='Done' of the first elements + element_id = [elements[0]['id']] # Get the first element's ID + print("Updating element:", element_id) + wqService.updateElements(*element_id, PercentComplete=100, PercentSuccess=100, Status='Done') + + #create a rule and inject it in wma_test account + blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element + print("Block Name:", blockNames[0]) + + #need to create rule here otherwise we do not know which element was updated since the element order changes each time re-fetching (of course we can use the element_id) + rule_id = cleaner.msRuleCleaner.rucio.createReplicationRule( + names=blockNames[0], + rseExpression="T2_US_Nebraska", + copies=1, + grouping="DATASET", + lifetime=360, + account="wma_test", + ask_approval=False, + activity="Production Input", + comment="WMCore test block rule creation" + ) + + print("Created Rucio rule with ID:", rule_id) + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + print(rule_info) + + # Re-fetch the elements to see the update + data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName], 'endkey': [specName, {}], + 'reduce': False}) + #element order changes each time, so we need to re-fetch the elements + elements = data.get('rows', []) + #elements=wqService.getWQElementsByWorkflow(specName) + print("Updated Elements in GlobalQueue:") + for e in elements: + print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) + 
#print(e["id"], e['Status'], e["PercentComplete"], e["PercentSuccess"]) + + + # Define variables to hold the dynamic RequestName and RequestStatus + self.dynamicRequestName = specName + self.dynamicRequestName1 = specName1 + self.dynamicRequestStatus = "running-open" + self.dynamicRequestStatus1 = "running-open" + self.ReferenceInputDatasets = ["/JetHT/Run2012C-v1/RAW"] + + def mock_getRequestForInputDataset_side_effect(inputdataset, reqmgr2Url): + if inputdataset in self.ReferenceInputDatasets: #only respond to the input data set that is used by a request + # Simulate retrieving the workflow details + return { + "result": [ + {self.dynamicRequestName:{ + "id": 123, + "RequestName": self.dynamicRequestName, # Use the dynamic RequestName + "RequestStatus": self.dynamicRequestStatus, # Use the dynamic RequestStatus + }, + self.dynamicRequestName1:{ + "id": 456, + "RequestName": self.dynamicRequestName1, # Use the dynamic RequestName + "RequestStatus": self.dynamicRequestStatus1, # Use the dynamic RequestStatus + } + } + ] + } + else: + #return {"status": "error", "message": "Invalid request: inputdataset not found"} + return {"result": []} + + # Assign the side effect to the mock object + mock_getRequestForInputDataset.side_effect = mock_getRequestForInputDataset_side_effect + + #First test to clean the rule. 
It should not be successful since the second workflow is still running + results = cleaner.cleanRucioRules(self.config_obj) + print("Results from cleanRucioRules:", json.dumps(results, indent=2)) + self.assertTrue(not results) + + #Second test, change the second workflow element to 100% and try to clean the rule again + #now change the percentage of workqueue of the second workflow to 100% + #find the id that corresponds to blockname + print("Block Name:", blockNames[0]) + element_id1 = [elements1[0]['id']] # Get the first element's ID + for e in elements1: + inputs = list(e['value']['Inputs'].keys()) + if blockNames == inputs: + print("Found matching element in second workflow:", e['id']) + element_id1 = [e['id']] + break + + print(f"Updating element {element_id1} to (PercentComplete=100, PercentSuccess=100, Status='Done')") + wqService.updateElements(*element_id1, PercentComplete=100, PercentSuccess=100, Status='Done') + # Re-fetch the elements to see the update + data1 = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName1], 'endkey': [specName1, {}], + 'reduce': False}) + #element order changes each time, so we need to re-fetch the elements + elements1 = data1.get('rows', []) + print("Updated Elements in GlobalQueue (workflow 1):") + for e in elements1: + print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) + + #now try to clean the rule again. It should be successful this time + results = cleaner.cleanRucioRules(self.config_obj) + print("Results from cleanRucioRules with elements from other workflow complete:", json.dumps(results, indent=2)) self.assertTrue(results) + + #Third test, change the second workflow to aborted and its elements to 0%. 
It should be able to clean the rule + #now change back the percentage of workqueue of the second workflow to 0% + print(f"Updating element {element_id1} to (PercentComplete=0, PercentSuccess=0, Status='Done')") + wqService.updateElements(*element_id1, PercentComplete=0, PercentSuccess=0, Status='Available') + #test the status of other request is aborted + self.dynamicRequestStatus1 = "aborted" + results = cleaner.cleanRucioRules(self.config_obj) + print("Results from cleanRucioRules with other request is aborted:", json.dumps(results, indent=2)) + self.assertTrue(results) #should be true since other request already aborted + + #Fourth test, now testing workflow in staging status and no workqueue is created. It should not clean the rule + globalQ.backend.deleteWQElementsByWorkflow([specName1]) + data1 = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [specName1], 'endkey': [specName1, {}], + 'reduce': False}) + + print(f"Elements in GlobalQueue {specName1}:") + elements1 = data1.get('rows', []) + print(json.dumps(elements1, indent=2)) + self.dynamicRequestStatus1 = "staging" + results = cleaner.cleanRucioRules(self.config_obj) + print("Results from cleanRucioRules with other request is staging:", json.dumps(results, indent=2)) + self.assertTrue(not results) #this should be false since staging request using the same data should not trigger rule deletion + + + #now make sure the rule is cleaned + #keep deleting until success or timeout + rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) + delResult = False + timeleft = 0 + start_time = time.time() + while rule_info and not delResult and timeleft < 300: + #now delete it + print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msRuleCleaner['rucioAccount'])) + delResult = cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) + print("Deleted Rucio rule with ID:", rule_id, delResult) + if delResult: break 
+ time.sleep(60) + timeleft = time.time() - start_time + + if not delResult and timeleft >= 300: + print("Failed to delete the rule after 5 minutes, exiting...") + #@mock.patch('WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner.InputDataRucioRuleCleaner.getRequestForInputDataset') def testInputDataRucioRuleCleanerWithThreading(self): """ Test the InputDataRucioRuleCleaner task with threading From 07e5da2c9f79db77ea50f9a6fee8adbfe71af3d7 Mon Sep 17 00:00:00 2001 From: Duong Date: Thu, 29 Jan 2026 02:38:10 -0500 Subject: [PATCH 09/14] turn off test with threading --- .../WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py index f9b8b99d0b..da40434611 100644 --- a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py +++ b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py @@ -455,7 +455,7 @@ def mock_getRequestForInputDataset_side_effect(inputdataset, reqmgr2Url): if not delResult and timeleft >= 300: print("Failed to delete the rule after 5 minutes, exiting...") - + ''' #@mock.patch('WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner.InputDataRucioRuleCleaner.getRequestForInputDataset') def testInputDataRucioRuleCleanerWithThreading(self): """ @@ -581,6 +581,7 @@ def testInputDataRucioRuleCleanerWithThreading(self): print("Failed to delete the rule after 5 minutes, exiting...") self.assertTrue(not rule_info_for_check, "Rule not deleted successfully after periodic task execution.") + ''' if __name__ == '__main__': unittest.main() \ No newline at end of file From fb4c90838c5a3e144f3e0843019432242c973f50 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 30 Mar 2026 11:40:08 -0400 Subject: [PATCH 10/14] update --- .../InputDataRucioRuleCleaner.py | 38 ++++++++++++++----- 1 file changed, 28 
insertions(+), 10 deletions(-) diff --git a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py index 15e4898525..6c3d45f7d5 100644 --- a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -1,16 +1,23 @@ from __future__ import (division, print_function) +import json import time -#import json from WMCore.REST.CherryPyPeriodicTask import CherryPyPeriodicTask from WMCore.WorkQueue.WorkQueue import globalQueue from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner from WMCore.Services.Rucio.Rucio import WMRucioDIDNotFoundException -from WMCore.ReqMgr.Web.ReqMgrService import getdata -#from WMCore.Services.pycurl_manager import RequestHandler -#from Utils.CertTools import ckey, cert +from WMCore.Services.pycurl_manager import RequestHandler +from Utils.CertTools import getKeyCertFromEnv + + +def getdata(url, params, headers=None): + "Helper function to get data from the service" + ckey, cert = getKeyCertFromEnv() + mgr = RequestHandler() + res = mgr.getdata(url, params=params, headers=headers, ckey=ckey, cert=cert) + return json.loads(res) def format_timestamp(timestamp_float): """Converts a float timestamp (seconds since epoch) to a readable string.""" @@ -162,22 +169,30 @@ def canDeleteRucioRule(self, currentRequestName, block, dataCont, config): self.logger.warning(f"Failed to fetch requests using dataset {dataCont}. 
Response: {response}") return False # We do not know what is going on, better not delete the rule - self.logger.info(f"Response: {response}") + #self.logger.info(f"Response: {response}") requestsUsingData = response["result"][0] - self.logger.info(f"Requests: {requestsUsingData}") + for r, d in requestsUsingData.items(): + self.logger.info(f"Request using same input data: {r} status={d.get('RequestStatus')} inputDataset={d.get('InputDataset')}") for request_id,request_data in requestsUsingData.items(): + self.logger.info(f"Check request: {request_data['RequestName']}") # Skip the current request if request_data['RequestName'] == currentRequestName: + self.logger.info(f"Request {request_data['RequestName']} is the current request. Continuing to next request.") continue + #TEMP + #if request_data['RequestName'] == 'cmsunified_Run2023D_JetMET1_JMENanoAODv15-Backfill_260304_170855_8502': + # self.logger.info(f"Temporary skipping request: {request_data['RequestName']}") + # continue + # only consider workflows in good status and not done yet if request_data['RequestStatus'] not in ['new', 'assignment-approved', 'assigned', 'staging', 'acquired', 'staged', 'running-open', 'running-closed']: self.logger.info(f"Request {request_data['RequestName']} is in status {request_data['RequestStatus']}. 
Continuing to next request.") continue - + try: # Step 2: Query the global queue for elements of the other request otherRequestElements = self.globalQ.backend.getElements(WorkflowName=request_data['RequestName']) @@ -254,7 +269,7 @@ def cleanRucioRules(self, config): # Check if the Rucio rule for this block can be deleted if not self.canDeleteRucioRule(requestName, block, dataCont, config): - self.logger.info(f"Skipping deletion of rules for block {block} as it is still in use.") + self.logger.info(f"Skipping deletion of rules for block {block} of request {requestName} as it is still in use by other requests.") continue ## Check if the dataset is in use by other requests @@ -305,8 +320,11 @@ def cleanRucioRules(self, config): cleanedRules_info[block]['bytes'].append(rule['bytes']) #cleanRules of MSRuleCleaner expects a list of rule ids and always clean the last one in the list of PlineMarkers rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]].append(rule['id']) + else: + msg = "Rucio rule for block: %s not found for workflow: %s." + self.logger.info(msg, block, requestName) except WMRucioDIDNotFoundException: - msg = "Block: %s not found in Rucio for workflow: %s." + msg = "Exception when cleaning Rucio rule for block: %s of workflow: %s." 
self.logger.info(msg, block, requestName) continue @@ -327,7 +345,7 @@ def cleanRucioRules(self, config): if not do_cleaning: current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time} No cleaning happened: There are no completed workqueue elements or rules already cleaned") + self.logger.info(f"{current_time} No cleaning happened: There are no completed workqueue elements or block is currently used by other requests or rules already cleaned") #current_time = format_timestamp(time.time()) #self.logger.info(f"{current_time}: {self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") From e6bda4340e193cee2e2bc94a197233b91aee6e02 Mon Sep 17 00:00:00 2001 From: Duong Date: Mon, 6 Apr 2026 17:48:31 -0400 Subject: [PATCH 11/14] Optimization and clean loginfo --- .../InputDataRucioRuleCleaner.py | 400 ++++----- .../Emulators/RucioClient/MockRucioApi.py | 51 +- .../InputDataRucioRuleCleaner_t.py | 783 ++++++------------ 3 files changed, 472 insertions(+), 762 deletions(-) diff --git a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py index 6c3d45f7d5..3f52a36c18 100644 --- a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -24,103 +24,81 @@ def format_timestamp(timestamp_float): # This format gives you: "2025-12-09 19:22:15" return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp_float)) -''' -def canDeleteRucioRule(self, currentRequestName, block, dataCont, config): - """ - Check if the Rucio rule for the given block can be deleted. 
- :param currentRequest: The name of the current request being processed - :param block: The data block to check - :param dataCont: The container name extracted from the block - :param config: The configuration object - :return: True if the rule can be deleted, False otherwise - """ - try: - # Step 1: Find the requests that use the same input data - #url = 'https://cmsweb.cern.ch/reqmgr2/data/request?outputdataset=%s' % dataset - #params = {} - #headers = {'Accept': 'application/json'} - #https://gitlab.cern.ch/cmsweb-k8s/services_config/-/blob/test/config.test16/workqueue/config.py?ref_type=heads - #config.msRuleCleaner['reqmgr2Url'] = "%s/reqmgr2" % BASE_URL - #url = config.msRuleCleaner['reqmgr2Url']+'/data/request?inputdataset=%s' % dataCont - url = f"{config.msRuleCleaner['reqmgr2Url']}/data/request?inputdataset={dataCont}" - params = {} - headers = {"Accept": "application/json"} - - # Use the getdata function to fetch the requests using the current input data - response = getdata(url, params, headers) - - if not response or "result" not in response: - self.logger.warning(f"Failed to fetch requests using dataset {dataCont}. Response: {response}") - return True # Assume no requests if the response is invalid - - requestsUsingData = response["result"] - - # Step 1a: If there are no requests, return True. No need since this always returns at least the current request - #if not requestsUsingData: - # self.logger.info(f"No requests are using dataset {dataCont}. Rule for block {block} can be deleted.") - # return True - - #self.logger.info(f"Dataset {dataCont} is in use by the following requests: {requestsUsingData}") - - # Step 2: Find the workqueue elements of those requests - foundElements = False # Track if any workqueue elements are found - for request in requestsUsingData: - # Skip the current request - if request['RequestName'] == currentRequestName: - continue - - # if the workflow is done etc return True. 
We can delete the rule - - try: - # Query the global queue for elements of the other request - otherRequestElements = self.globalQ.backend.getElements(WorkflowName=request['RequestName']) - - # Step 3: If there are no workqueue elements for this request, continue to the next request - if not otherRequestElements: - self.logger.info(f"No workqueue elements found for request {request}. The workqueue might not have been created yet.") - continue - - foundElements = True # At least one element is found at other request - - # Step 4: Check the status of these workqueue element that uses the same datablock - for otherElement in otherRequestElements: - - if block not in otherElement.get('Inputs'): continue - - percentComplete = otherElement.get('PercentComplete', 0) # Default to 0 if key is missing - percentSuccess = otherElement.get('PercentSuccess', 0) # Default to 0 if key is missing - - # Step 5: If any workqueue element is not completed, return False - if percentComplete < 100 or percentSuccess < 100: - self.logger.info(f"Workqueue element {otherElement.get('id')} for request {request['RequestName']} is not yet completed. Rule for block {block} cannot be deleted.") - return False - - except Exception as ex: - self.logger.error(f"Error while finding elements for request {request}: {str(ex)}") - return False - - # Step 3 (fixed): If no elements were found for all requests, return False - if not foundElements: - self.logger.info(f"No workqueue elements found for any of the requests using dataset {dataCont}. Rule for block {block} cannot be deleted.") - return False - - # Step 4: If all workqueue elements are processed, return True - self.logger.info(f"All workqueue elements for requests using dataset {dataCont} are completed. 
Rule for block {block} can be deleted.") - return True - - except Exception as ex: - self.logger.error(f"Error while checking if rule for block {block} can be deleted: {str(ex)}") - return False -''' class InputDataRucioRuleCleaner(CherryPyPeriodicTask): + """ + A periodic CherryPy task that cleans block-level Rucio replication rules for + input datasets of completed GlobalWorkQueue elements. + + Overview + -------- + The GlobalWorkQueue holds elements representing units of work, each associated + with one or more input data blocks. When a workflow finishes processing a block, + its Rucio replication rule (which was created to stage the data to a site) is + no longer needed and should be removed to free storage quota. + + This task runs periodically and performs the following steps each cycle: + + 1. Fetch Done elements + Query CouchDB for GlobalWorkQueue elements with Status='Done' only, avoiding + a full table scan of all elements. + + 2. Skip already-processed elements + An in-memory set (_processedElementIds) tracks elements whose rules were fully + cleaned in a previous cycle. These are skipped immediately with an O(1) lookup, + avoiding redundant HTTP and CouchDB calls. The set is trimmed each cycle to + only IDs still present in the queue, preventing unbounded memory growth. + + 3. Filter by completion + Only elements with PercentComplete == 100 and PercentSuccess == 100 are + considered. Status='Done' alone does not guarantee full success (some jobs + may have failed), so this additional check ensures we only clean rules for + fully successful elements. + + 4. Check whether each block's rule can be deleted (canDeleteRucioRule) + A block's Rucio rule must not be deleted if another active workflow is still + using the same input data. For each block: + a. Query ReqMgr2 for all requests that use the same input container + (dataCont = block without the #hash suffix). 
Results are cached by + dataCont in a per-cycle dict (reqmgr2Cache) so that multiple blocks + sharing the same container only trigger one HTTP call per cycle. + b. For each other active request found, query the GlobalWorkQueue for its + elements and check whether the specific block has been fully processed + (PercentComplete == 100 and PercentSuccess == 100). If any other request + is still processing the block, deletion is deferred to the next cycle. + + 5. Collect and delete Rucio rules + For blocks cleared in step 4, query Rucio for existing replication rules + (listDataRules). Rules already deleted in a previous cycle return an empty + list and are silently skipped. Remaining rules are batched into a single + cleanRucioRules call per element, which sets their Rucio lifetime to 0 + (effectively scheduling them for deletion). + + 6. Track element completion + An element is added to _processedElementIds only when ALL of the following + hold for that cycle: + - No block was deferred (canDeleteRucioRule returned True for every block) + - No unexpected error occurred during rule lookup + - All Rucio rule updates reported success (CleanupStatus == True) + If any condition fails, the element is re-evaluated next cycle. + + Scalability notes + ----------------- + - getElements(status='Done') uses the elementsByStatus CouchDB index, avoiding + a full queue scan. + - _processedElementIds eliminates repeat processing of already-cleaned elements + across cycles, so the active working set per cycle is bounded to newly Done + elements since the last cycle. + - reqmgr2Cache reduces ReqMgr2 HTTP calls from O(elements x blocks) to O(unique + containers) per cycle. 
+ """ def __init__(self, rest, config): super(InputDataRucioRuleCleaner, self).__init__(config) self.globalQ = globalQueue(logger=self.logger, **config.queueParams) - self.msRuleCleaner = MSRuleCleaner(config.msRuleCleaner, logger=self.logger) # Initialize MSRuleCleaner - #self.curlMgr = RequestHandler() + self.msRuleCleaner = MSRuleCleaner(config.msRuleCleaner, logger=self.logger) + self._processedElementIds = set() # element IDs confirmed with all block rules fully cleaned in this process lifetime def setConcurrentTasks(self, config): """ @@ -129,239 +107,185 @@ def setConcurrentTasks(self, config): self.concurrentTasks = [{'func': self.cleanRucioRules, 'duration': config.cleanInputDataRucioRuleDuration}] def getRequestForInputDataset(self, inputdataset, reqmgr2Url): - # Step 1: Find the requests that use the same input data - #url = f"{config.msRuleCleaner['reqmgr2Url']}/data/request?inputdataset={inputdataset}" url = f"{reqmgr2Url}/data/request?inputdataset={inputdataset}" params = {} headers = {"Accept": "application/json"} res = None try: - #res = self.curlMgr.getdata(url, params=params, headers=headers, ckey=ckey(), cert=cert()) - #res = json.loads(res) res = getdata(url, params, headers) except Exception as ex: msg = "General exception while fetching requests from ReqMgr2 for inputdataset %s" self.logger.exception(msg, inputdataset, str(ex)) - - # Use the getdata function to fetch the requests using the current input data return res - def canDeleteRucioRule(self, currentRequestName, block, dataCont, config): + def canDeleteRucioRule(self, currentRequestName, block, dataCont, config, reqmgr2Cache=None): """ Check if the Rucio rule for the given block can be deleted. 
- :param currentRequest: The name of the current request being processed + :param currentRequestName: The name of the current request being processed :param block: The data block to check :param dataCont: The container name extracted from the block :param config: The configuration object + :param reqmgr2Cache: Optional dict caching ReqMgr2 responses by dataCont to avoid redundant HTTP calls :return: True if the rule can be deleted, False otherwise """ try: - # Step 1: Find the requests that use the same input data - #url = f"{config.msRuleCleaner['reqmgr2Url']}/data/request?inputdataset={dataCont}" - #params = {} - #headers = {"Accept": "application/json"} - - # Use the getdata function to fetch the requests using the current input data - #response = getdata(url, params, headers) - response = self.getRequestForInputDataset(dataCont, config.msRuleCleaner['reqmgr2Url']) - + # Use the cache to avoid repeated HTTP calls for the same container within a cycle. + if reqmgr2Cache is not None and dataCont in reqmgr2Cache: + response = reqmgr2Cache[dataCont] + else: + response = self.getRequestForInputDataset(dataCont, config.msRuleCleaner['reqmgr2Url']) + if reqmgr2Cache is not None: + reqmgr2Cache[dataCont] = response + if not response or "result" not in response: self.logger.warning(f"Failed to fetch requests using dataset {dataCont}. 
Response: {response}") return False # We do not know what is going on, better not delete the rule - - #self.logger.info(f"Response: {response}") requestsUsingData = response["result"][0] - for r, d in requestsUsingData.items(): - self.logger.info(f"Request using same input data: {r} status={d.get('RequestStatus')} inputDataset={d.get('InputDataset')}") - - for request_id,request_data in requestsUsingData.items(): - self.logger.info(f"Check request: {request_data['RequestName']}") - # Skip the current request + for request_id, request_data in requestsUsingData.items(): + self.logger.debug(f"Check request: {request_data['RequestName']}") + if request_data['RequestName'] == currentRequestName: - self.logger.info(f"Request {request_data['RequestName']} is the current request. Continuing to next request.") + self.logger.debug(f"Request {request_data['RequestName']} is the current request. Continuing to next request.") continue - - #TEMP - #if request_data['RequestName'] == 'cmsunified_Run2023D_JetMET1_JMENanoAODv15-Backfill_260304_170855_8502': - # self.logger.info(f"Temporary skipping request: {request_data['RequestName']}") - # continue - # only consider workflows in good status and not done yet + # Only consider workflows in active statuses — skip requests that are already done or cancelled if request_data['RequestStatus'] not in ['new', 'assignment-approved', 'assigned', 'staging', 'acquired', 'staged', 'running-open', 'running-closed']: - self.logger.info(f"Request {request_data['RequestName']} is in status {request_data['RequestStatus']}. Continuing to next request.") + self.logger.debug(f"Request {request_data['RequestName']} is in status {request_data['RequestStatus']}. 
Continuing to next request.") continue - + try: - # Step 2: Query the global queue for elements of the other request otherRequestElements = self.globalQ.backend.getElements(WorkflowName=request_data['RequestName']) - + if not otherRequestElements: self.logger.info(f"No workqueue elements found for request {request_id}: {request_data}. The workqueue might not have been created yet.") - return False # We do not know what is going on, better not delete the rule - - # Step 3: Check the status of these workqueue element that uses the same datablock + return False # We do not know what is going on, better not delete the rule + for otherElement in otherRequestElements: - - if block not in otherElement.get('Inputs'): continue - - percentComplete = otherElement.get('PercentComplete', 0) # Default to 0 if key is missing - percentSuccess = otherElement.get('PercentSuccess', 0) # Default to 0 if key is missing - + + if block not in otherElement.get('Inputs'): + continue + + percentComplete = otherElement.get('PercentComplete', 0) + percentSuccess = otherElement.get('PercentSuccess', 0) + if percentComplete < 100 or percentSuccess < 100: - self.logger.info(f"Rule for block {block} cannot be deleted. Workqueue elements of request {request_data['RequestName']} using the same block have not completed processing ({percentComplete}, {percentSuccess}).") + self.logger.debug(f"Rule for block {block} cannot be deleted. 
Workqueue elements of request {request_data['RequestName']} using the same block have not completed processing ({percentComplete}, {percentSuccess}).") return False - + except Exception as ex: self.logger.error(f"Error while finding elements for request {request_id}: {request_data} and making consideration on data processing completion: {str(ex)}") - return False #We do not know what is going on, better not delete the rule - - return True - + return False # We do not know what is going on, better not delete the rule + + return True + except Exception as ex: self.logger.error(f"Error while checking if rule for block {block} can be deleted: {str(ex)}") - return False #We do not know what is going on, better not delete the rule + return False # We do not know what is going on, better not delete the rule def cleanRucioRules(self, config): """ - Queries global queue and builds the list of blocklevel Rucio rules of finished elements to be deleted. Calls MSRuleCleaner cleanRucioRules(self, wflow) to delete the rules. - :config: The configuration for the task. This uses Rucio account from config to use for querying rules - :return: The result of MSRuleCleaner cleanRucioRules(self, wflow) method, which is True if all rules were deleted successfully, False otherwise. + Queries global queue and builds the list of blocklevel Rucio rules of finished elements to be deleted. + Calls MSRuleCleaner cleanRucioRules(self, wflow) to delete the rules. + :config: The configuration for the task. This uses Rucio account from config to use for querying rules + :return: True if any cleaning happened this cycle, False otherwise. 
""" - + tStart = time.time() - - #statuses = ['Available', 'Done', 'Acquired', 'Failed', 'Canceled'] - #globalQueueElements=self.globalQ.getWork({'Status':'Done'},siteJobCounts={}) - globalQueueElements=self.globalQ.backend.getElements() - - #print("Elements in GlobalQueue cleanRucioRules:") - #print(json.dumps(globalQueueElements,indent=2)) - + + globalQueueElements = self.globalQ.backend.getElements(status='Done') + + # Trim skip-set to only IDs still present in the queue, preventing unbounded growth + currentIds = {el.id for el in globalQueueElements} + self._processedElementIds &= currentIds + + # Per-cycle cache: dataCont -> ReqMgr2 response, shared across all elements and blocks. + # Avoids redundant HTTP calls for blocks sharing the same container within a cycle. + reqmgr2Cache = {} + do_cleaning = False if globalQueueElements: - #print(f"Found {len(globalQueueElements)} elements in GlobalQueue") current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time}: Found {len(globalQueueElements)} globalqueue elements.") + self.logger.info(f"{current_time}: Found {len(globalQueueElements)} globalqueue elements ({len(self._processedElementIds)} already fully processed, skipping).") for element in globalQueueElements: - - requestName = element.get('RequestName') # Extract the RequestName field - percentComplete = element.get('PercentComplete', 0) # Default to 0 if key is missing - percentSuccess = element.get('PercentSuccess', 0) # Default to 0 if key is missing + + if element.id in self._processedElementIds: + continue + + requestName = element.get('RequestName') + percentComplete = element.get('PercentComplete', 0) + percentSuccess = element.get('PercentSuccess', 0) if percentComplete == 100 and percentSuccess == 100: - #to be able to use cleanRules method of MSRuleCleaner - rulesToClean = {'PlineMarkers':['Current'], 'RulesToClean': {'Current': []}, 'CleanupStatus': {'Current': []}} + # Structure required by MSRuleCleaner.cleanRucioRules() + 
rulesToClean = {'PlineMarkers': ['Current'], 'RulesToClean': {'Current': []}, 'CleanupStatus': {'Current': []}} + + blocks = element.get('Inputs') - #'Inputs': {'/MinimumBias/ComissioningHI-v1/RAW#372d624c-089d-11e1-8347-003048caaace': - blocks = element.get('Inputs') # Example key for dataset - - # Fetch rules for blocks cleanedRules_info = {} + elementFullyProcessed = True # flipped to False on any deferral, error, or partial Rucio failure if blocks: for block in blocks: - #print("Adding block ", block, " to RulesToClean") - dataCont = block.split('#')[0] # Extract the container name from the block - - # Check if the Rucio rule for this block can be deleted - if not self.canDeleteRucioRule(requestName, block, dataCont, config): - self.logger.info(f"Skipping deletion of rules for block {block} of request {requestName} as it is still in use by other requests.") + dataCont = block.split('#')[0] # strip block hash to get the container name + + if not self.canDeleteRucioRule(requestName, block, dataCont, config, reqmgr2Cache): + elementFullyProcessed = False continue - - ## Check if the dataset is in use by other requests - #try: - # # Assuming there's an endpoint that provides the requests using the input data - # url = f"{config.reqmgrUrl}/data/request_by_input" - # params = {"input": dataCont} - # headers = {"Accept": "application/json"} - # - # # Use the getdata function to fetch the requests using the current input data - # response = getdata(url, params, headers) - # - # # Process the response to extract request names - # if response and "result" in response: - # requestsUsingData = response["result"] - # if requestsUsingData: - # self.logger.info(f"Dataset {dataCont} is still in use by the following requests: {requestsUsingData}") - # continue # Skip cleaning for this dataset - # else: - # self.logger.info(f"Dataset {dataCont} is not in use by any other requests.") - # else: - # self.logger.warning(f"Failed to fetch requests using dataset {dataCont}. 
Response: {response}") - #except Exception as ex: - # self.logger.error(f"Error while checking requests using dataset {dataCont}: {str(ex)}") - # continue - - #need to self.getGlobalLocks() before using self.msRuleCleaner.globalLocks - #if dataCont in self.msRuleCleaner.globalLocks: - # msg = "Found dataset: %s in GlobalLocks. NOT considering it for filling the " - # msg += "RulesToClean list for both container and block level Rules for workflow: %s!" - # self.logger.info(msg, dataCont, requestName) - # continue + try: - #print('Fetching rules for block:', block, "\n", config.rucioAccount, "\n", self.msRuleCleaner.rucio.listDataRules(block, account=config.rucioAccount)) rules = self.msRuleCleaner.rucio.listDataRules(block, account=config.msRuleCleaner['rucioAccount']) - #found rules for this block. If the rules of this block already cleaned there is no rules found if rules: - cleanedRules_info[block] = {} - #one block can have multiple rules - cleanedRules_info[block]['id'] = [] - cleanedRules_info[block]['bytes'] = [] + # one block can have multiple rules + cleanedRules_info[block] = {'id': [], 'bytes': []} for rule in rules: - #msg = "Found %s block-level rule to be deleted for container %s" - #self.logger.info(msg, rule['id'], dataCont) - #current_time = format_timestamp(time.time()) - #self.logger.info(f"{current_time}: Rule {rule['id']} {block} {rule['bytes']} {requestName} to be cleaned") cleanedRules_info[block]['id'].append(rule['id']) cleanedRules_info[block]['bytes'].append(rule['bytes']) - #cleanRules of MSRuleCleaner expects a list of rule ids and always clean the last one in the list of PlineMarkers + # cleanRucioRules expects rule ids under the last PlineMarker rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]].append(rule['id']) - else: + else: msg = "Rucio rule for block: %s not found for workflow: %s." 
- self.logger.info(msg, block, requestName) + self.logger.debug(msg, block, requestName) except WMRucioDIDNotFoundException: msg = "Exception when cleaning Rucio rule for block: %s of workflow: %s." - self.logger.info(msg, block, requestName) + self.logger.debug(msg, block, requestName) continue - + except Exception as ex: + self.logger.error(f"Unexpected error fetching rules for block {block} of workflow {requestName}: {str(ex)}") + elementFullyProcessed = False + continue + if cleanedRules_info: current_time = format_timestamp(time.time()) self.logger.info(f"{current_time}: Start cleaning rules for completed element {element.id}") - + do_cleaning = True for block, info in cleanedRules_info.items(): for rule_id, size in zip(info["id"], info["bytes"]): self.logger.info(f"{current_time} Rule to clean: {rule_id} {block} {size} {requestName}") - + self.msRuleCleaner.cleanRucioRules(rulesToClean) - + if not rulesToClean['CleanupStatus']['Current']: + elementFullyProcessed = False + current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time}: End cleaning rules for completed element {element.id}") - + self.logger.info(f"{current_time}: End cleaning rules for completed element {element.id} (success={elementFullyProcessed})") + + if elementFullyProcessed: + self._processedElementIds.add(element.id) + if not do_cleaning: current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time} No cleaning happened: There are no completed workqueue elements or block is currently used by other requests or rules already cleaned") - - #current_time = format_timestamp(time.time()) - #self.logger.info(f"{current_time}: {self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") - #tmp = rulesToClean['RulesToClean'][rulesToClean['PlineMarkers'][-1]] - #ids = '' - #for rid in tmp: - # ids += rid + ', ' - # rulesToClean['CleanupStatus']['Current'].append({'RuleID': rid, 'Status': 'Pending'}) - #self.logger.info('Rules to be cleaned: %s', 
ids) - #return rulesToClean - #return self.msRuleCleaner.cleanRucioRules(rulesToClean) + self.logger.debug(f"{current_time} No cleaning happened: There are no completed workqueue elements or block is currently used by other requests or rules already cleaned") else: current_time = format_timestamp(time.time()) - self.logger.info(f"{current_time} No elements found in GlobalQueue") - + self.logger.debug(f"{current_time} No elements found in GlobalQueue") + current_time = format_timestamp(time.time()) self.logger.info(f"{current_time} {self.__class__.__name__} executed in {(time.time() - tStart):.3f} secs.") - return do_cleaning \ No newline at end of file + return do_cleaning diff --git a/src/python/WMQuality/Emulators/RucioClient/MockRucioApi.py b/src/python/WMQuality/Emulators/RucioClient/MockRucioApi.py index 4dff0888da..65e73b1b92 100644 --- a/src/python/WMQuality/Emulators/RucioClient/MockRucioApi.py +++ b/src/python/WMQuality/Emulators/RucioClient/MockRucioApi.py @@ -51,6 +51,7 @@ def __init__(self, acct, hostUrl=None, authUrl=None, configDict=None): self.dataBlocks = DataBlockGenerator() self.subRequests = {} self.rucioParams = {} + self._mockRules = {} # ruleId -> rule dict, used by getRule/listDataRules/deleteRule def sitesByBlock(self, block): """ @@ -131,13 +132,55 @@ def attachDIDs(self, rse, superDID, portion, scope='cms'): logging.info("%s attachDID rse=%s, suportDID=%s, portion=%s, scope=%s", cname, rse, superDID, portion, scope) return True - def createReplicationRule(self, portion, rseExpression, scope='cms'): + def createReplicationRule(self, names, rseExpression, scope='cms', copies=1, **kwargs): """ - Emulate createReplicationRule Rucio API + Emulate createReplicationRule Rucio API — stores the rule in _mockRules and returns a fake rule id list. 
""" + import hashlib cname = self.__class__.__name__ - logging.info("%s createReplicationRule portion=%s, rseExpression=%s", cname, portion, rseExpression) - return [rseExpression] + logging.info("%s createReplicationRule names=%s, rseExpression=%s", cname, names, rseExpression) + ruleId = hashlib.md5("{}{}".format(names, rseExpression).encode()).hexdigest() + self._mockRules[ruleId] = { + 'id': ruleId, + 'name': names, + 'rse_expression': rseExpression, + 'scope': scope, + 'copies': copies, + 'account': kwargs.get('account', 'mock'), + 'state': 'OK', + 'bytes': 0, + } + return [ruleId] + + def getRule(self, ruleId): + """ + Emulate getRule Rucio API — return a minimal rule dict for an existing mock rule id. + """ + cname = self.__class__.__name__ + logging.info("%s getRule ruleId=%s", cname, ruleId) + # Mock rule storage: non-empty dict means the rule exists + if ruleId in self._mockRules: + return self._mockRules[ruleId] + return {} + + def listDataRules(self, name, **kwargs): + """ + Emulate listDataRules Rucio API — return mock rules for the given DID. + """ + cname = self.__class__.__name__ + logging.info("%s listDataRules name=%s kwargs=%s", cname, name, kwargs) + account = kwargs.get('account', None) + return [rule for rule in self._mockRules.values() + if rule.get('name') == name and (account is None or rule.get('account') == account)] + + def deleteRule(self, ruleId, purgeReplicas=False): + """ + Emulate deleteRule Rucio API — remove the rule from the mock store. 
+ """ + cname = self.__class__.__name__ + logging.info("%s deleteRule ruleId=%s", cname, ruleId) + self._mockRules.pop(ruleId, None) + return True def updateRule(self, rid, opts): """ diff --git a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py index da40434611..08a79162d0 100644 --- a/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py +++ b/test/python/WMCore_t/GlobalWorkQueue_t/InputDataRucioRuleCleaner_t.py @@ -1,36 +1,23 @@ from WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner import InputDataRucioRuleCleaner - from WMQuality.Emulators.EmulatedUnitTestCase import EmulatedUnitTestCase - -import cherrypy - -# WMCore modules -from WMCore.Services.Rucio import Rucio - from WMQuality.TestInitCouchApp import TestInitCouchApp -from WMQuality.Emulators.WMSpecGenerator.WMSpecGenerator import WMSpecGenerator from WMCore.WorkQueue.WorkQueue import globalQueue from WMCore.Services.WorkQueue.WorkQueue import WorkQueue as WorkQueueDS from WMCore.MicroService.MSRuleCleaner.MSRuleCleaner import MSRuleCleaner -from WMCore.ReqMgr.Web.ReqMgrService import getdata import json -# system modules -import os -import time - import unittest -from urllib.parse import parse_qs, urlparse -#from unittest.mock import patch from mock import mock + class DummyREST: def __init__(self): - self.logger = None # Optional: add logger if needed + self.logger = None self.config = None -#MSRuleCleaner requires plain dictionary to be passed as config while CherryPyPeriodic requires attributes, so we create a DictWithAttrs class + +# MSRuleCleaner requires a plain dict while CherryPyPeriodicTask requires attribute access class DictWithAttrs(dict): def __getattr__(self, key): try: @@ -38,550 +25,306 @@ def __getattr__(self, key): except KeyError as e: raise AttributeError(f"'{type(self).__name__}' object has no attribute '{key}'") from e + class 
InputDataRucioRuleCleanerTest(EmulatedUnitTestCase): - def setUp(self): - self.config = {} - self.msRuleCleaner = {"verbose": True, - "interval": 1 * 60, - "services": ['ruleCleaner'], - "rucioAccount": 'wma_test', - 'reqmgr2Url': 'https://cmsweb-testbed.cern.ch/reqmgr2', - 'msOutputUrl': 'https://cmsweb-testbed.cern.ch/ms-output', - 'reqmgrCacheUrl': 'https://cmsweb-testbed.cern.ch/couchdb/reqmgr_workload_cache', - 'phedexUrl': 'https://cmsweb-testbed.cern.ch/phedex/datasvc/json/prod', - 'dbsUrl': 'https://cmsweb-testbed.cern.ch/dbs/int/global/DBSReader', - 'rucioUrl': 'http://cms-rucio-int.cern.ch', - 'rucioAuthUrl': 'https://cms-rucio-auth-int.cern.ch', - "wmstatsUrl": "https://cmsweb-testbed.cern.ch/wmstatsserver", - "logDBUrl": "https://cmsweb-testbed.cern.ch/couchdb/wmstats_logdb", - 'logDBReporter': 'reqmgr2ms_ruleCleaner', - 'archiveDelayHours': 8, - 'archiveAlarmHours': 24, - 'enableRealMode': True} - - self.creds = {"client_cert": os.getenv("X509_USER_CERT", "Unknown"), - "client_key": os.getenv("X509_USER_KEY", "Unknown")} - self.rucioConfigDict = {"rucio_host": self.msRuleCleaner['rucioUrl'], - "auth_host": self.msRuleCleaner['rucioAuthUrl'], - "auth_type": "x509", - "account": self.msRuleCleaner['rucioAccount'], - "ca_cert": False, - "timeout": 30, - "request_retries": 3, - "creds": self.creds} - - + def setUp(self): + # --- Why we mock TagCollector before super().setUp() --- + # + # EmulatedUnitTestCase.setUp() sets up several patchers, e.g.: + # mock.patch('WMCore.ReqMgr.Tools.cms.CRIC', ...) + # mock.patch('WMCore_t.WMSpec_t.Steps_t.Fetchers_t.PileupFetcher_t.Rucio', ...) + # + # Setting up each patcher causes Python to import the target module for the + # first time. Both of the above targets cause cms.py to be imported, and cms.py + # has this at module level: + # + # TC = TagCollector() # cms.py line 18 + # + # TagCollector.__init__ tries to load SSL certificates, which fails in a + # test environment without real certs. 
+ # + # By replacing the TagCollector *class* with a MagicMock before super().setUp() + # runs, the first import of cms.py instantiates the mock instead of the real + # class, so no SSL calls are made. + # + # --- Why we configure releases/architectures return values --- + # + # WMSpecGenerator.createReRecoSpec() calls StdBase.getTestArguments(), which + # picks a test CMSSWVersion and ScramArch. Those values are then validated: + # + # "validate": lambda x: x in releases() # StdBase.py + # + # releases() calls TC.releases(), and TC is the MagicMock instance created + # above (tagCollectorMock.return_value). Without configuring the return value, + # TC.releases() returns a bare MagicMock and 'x' in MagicMock() evaluates to + # False, failing validation. + # + # We use _AnyContains — a list subclass whose __contains__ always returns True — + # so that any CMSSW version or ScramArch passes validation, regardless of what + # getTestArguments() returns. This avoids brittle hardcoding that would break + # if StdBase.getTestArguments() is updated to use a newer release. 
+ # + # Mock hierarchy: + # tagCollectorMock — the mocked class + # tagCollectorMock() — instantiating it → tagCollectorMock.return_value + # TC = TagCollector() → TC = tagCollectorMock.return_value + # TC.releases() → tagCollectorMock.return_value.releases.return_value + class _AnyContains(list): + def __contains__(self, item): + return True + + from unittest.mock import MagicMock + tagCollectorMock = MagicMock() + tagCollectorMock.return_value.releases.return_value = _AnyContains() + tagCollectorMock.return_value.architectures.return_value = _AnyContains() + tagCollectorPatcher = mock.patch( + 'WMCore.Services.TagCollector.TagCollector.TagCollector', new=tagCollectorMock) + tagCollectorPatcher.start() + self.addCleanup(tagCollectorPatcher.stop) + + super(InputDataRucioRuleCleanerTest, self).setUp() + + from WMQuality.Emulators.WMSpecGenerator.WMSpecGenerator import WMSpecGenerator + + self.msRuleCleanerConfig = { + "verbose": True, + "interval": 1 * 60, + "services": ['ruleCleaner'], + "rucioAccount": 'wma_test', + 'reqmgr2Url': 'https://cmsweb-testbed.cern.ch/reqmgr2', + 'msOutputUrl': 'https://cmsweb-testbed.cern.ch/ms-output', + 'reqmgrCacheUrl': 'https://cmsweb-testbed.cern.ch/couchdb/reqmgr_workload_cache', + 'phedexUrl': 'https://cmsweb-testbed.cern.ch/phedex/datasvc/json/prod', + 'dbsUrl': 'https://cmsweb-testbed.cern.ch/dbs/int/global/DBSReader', + 'rucioUrl': 'http://cms-rucio-int.cern.ch', + 'rucioAuthUrl': 'https://cms-rucio-auth-int.cern.ch', + "wmstatsUrl": "https://cmsweb-testbed.cern.ch/wmstatsserver", + "logDBUrl": "https://cmsweb-testbed.cern.ch/couchdb/wmstats_logdb", + 'logDBReporter': 'reqmgr2ms_ruleCleaner', + 'archiveDelayHours': 8, + 'archiveAlarmHours': 24, + 'enableRealMode': True, + } + self.specGenerator = WMSpecGenerator("WMSpecs") - self.schema = [] - self.couchApps = ["WorkQueue"] self.testInit = TestInitCouchApp('WorkQueueServiceTest') self.testInit.setLogging() self.testInit.setDatabaseConnection() - 
self.testInit.setSchema(customModules=self.schema,
-                                useDefault=False)
-        self.testInit.setupCouch('workqueue_t', *self.couchApps)
-        self.testInit.setupCouch('workqueue_t_inbox', *self.couchApps)
-        self.testInit.setupCouch('local_workqueue_t', *self.couchApps)
-        self.testInit.setupCouch('local_workqueue_t_inbox', *self.couchApps)
+        self.testInit.setSchema(customModules=[], useDefault=False)
+        for dbName in ('workqueue_t', 'workqueue_t_inbox', 'local_workqueue_t', 'local_workqueue_t_inbox'):
+            self.testInit.setupCouch(dbName, "WorkQueue")
         self.testInit.generateWorkDir()
-        self.msRuleCleaner.update({'QueueURL':self.testInit.couchUrl})
-
-        self.queueParams = {}
-        self.queueParams['log_reporter'] = "Services_WorkQueue_Unittest"
-        self.queueParams['rucioAccount'] = self.msRuleCleaner['rucioAccount']
-        self.queueParams['rucioAuthUrl'] = "http://cms-rucio-int.cern.ch"
-        self.queueParams['rucioUrl'] = "https://cms-rucio-auth-int.cern.ch"
-        self.queueParams['_internal_name'] = 'GlobalWorkQueueTest'
-        self.queueParams['log_file'] = 'test.log'
-
-
-        print("X509_USER_CERT:", os.getenv("X509_USER_CERT"))
-        print("X509_USER_KEY:", os.getenv("X509_USER_KEY"))
-
-        # Create config object with attributes
-        self.config_obj = DictWithAttrs(self.config)
-        #additional attributes needed by cherrypy periodic task
+        self.msRuleCleanerConfig['QueueURL'] = self.testInit.couchUrl
+
+        self.queueParams = {
+            'log_reporter': "Services_WorkQueue_Unittest",
+            'rucioAccount': self.msRuleCleanerConfig['rucioAccount'],
+            'rucioAuthUrl': "https://cms-rucio-auth-int.cern.ch",
+            'rucioUrl': "http://cms-rucio-int.cern.ch",
+            '_internal_name': 'GlobalWorkQueueTest',
+            'log_file': 'test.log',
+        }
+
+        self.config_obj = DictWithAttrs()
         self.config_obj._internal_name = "GlobalWorkQueueTest"
         self.config_obj.log_file = "test.log"
-        #additional attributes needed by global workqueue
         self.config_obj.queueParams = self.queueParams
-        #additional attributes needed by MSRuleCleaner
-        self.config_obj.msRuleCleaner = 
self.msRuleCleaner - #duration for the periodic task in seconds + self.config_obj.msRuleCleaner = self.msRuleCleanerConfig self.config_obj.cleanInputDataRucioRuleDuration = 10 - super(InputDataRucioRuleCleanerTest, self).setUp() - + def _makeCleaner(self): + """Create an InputDataRucioRuleCleaner with a fresh GlobalQueue and MockRucioApi.""" + from WMQuality.Emulators.RucioClient.MockRucioApi import MockRucioApi + cleaner = InputDataRucioRuleCleaner(rest=DummyREST(), config=self.config_obj) + cleaner.globalQ = globalQueue(DbName='workqueue_t', + QueueURL=self.testInit.couchUrl, + UnittestFlag=True, + logger=cleaner.logger, + **self.queueParams) + msRuleCleaner = MSRuleCleaner(self.config_obj.msRuleCleaner, logger=cleaner.logger) + msRuleCleaner.resetCounters() + msRuleCleaner.rucio = MockRucioApi(self.msRuleCleanerConfig['rucioAccount']) + cleaner.msRuleCleaner = msRuleCleaner + return cleaner + + def _getWorkflowElements(self, wqService, workflowName): + """Return raw CouchDB view rows for the given workflow.""" + return wqService.db.loadView( + 'WorkQueue', 'elementsDetailByWorkflowAndStatus', + {'startkey': [workflowName], 'endkey': [workflowName, {}], 'reduce': False} + )['rows'] @mock.patch('WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner.InputDataRucioRuleCleaner.getRequestForInputDataset') def testInputDataRucioRuleCleaner(self, mock_getRequestForInputDataset): """ - Test the InputDataRucioRuleCleaner task + Single-workflow happy path: + - Done element at <100% is skipped. + - Done element at 100%/100% with a Rucio rule gets the rule cleaned. + - The element is added to the skip-set and skipped on the next cycle. """ - #Get workflow description. 
ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, - #so the workflow description here and the one used in creating workqueue is the same specName = "RerecoSpec" - inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} - - #Create ReRecoSpec as stored in GlobalQueue - specUrl = self.specGenerator.createReRecoSpec(specName, "file", - assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) - - #cleaner = InputDataRucioRuleCleaner(rest=self.mockRest, config=self.config_obj) - cleaner = InputDataRucioRuleCleaner(rest=DummyREST(), config=self.config_obj) - - #Make GlobalQueue - globalQ = globalQueue(DbName='workqueue_t', - QueueURL=self.testInit.couchUrl, - UnittestFlag=True, logger=cleaner.logger, **self.queueParams) - globalQ.queueWork(specUrl, specName, "teamA") - cleaner.globalQ = globalQ - - #Make MSRuleCleaner - msRuleCleaner = MSRuleCleaner(self.config_obj.msRuleCleaner,logger=cleaner.logger) - msRuleCleaner.resetCounters() - msRuleCleaner.rucio = Rucio.Rucio(self.msRuleCleaner['rucioAccount'], - hostUrl=self.rucioConfigDict['rucio_host'], - authUrl=self.rucioConfigDict['auth_host'], - configDict=self.rucioConfigDict) - - cleaner.msRuleCleaner = msRuleCleaner - - #Let try to modify the element in GlobalQueue to have PercentComplete and PercentSuccess set to 100 + inputDataset = "/JetHT/Run2012C-v1/RAW" + + specUrl = self.specGenerator.createReRecoSpec( + specName, "file", + assignKwargs={'SiteWhitelist': ["T2_XX_SiteA"]}, + InputDataset=inputDataset) + + cleaner = self._makeCleaner() + cleaner.globalQ.queueWork(specUrl, specName, "teamA") + wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') - #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) - - print("Elements in GlobalQueue:") - elements = 
data.get('rows', []) - print(json.dumps(elements, indent=2)) - - #let update the PercentComplete and PercentSuccess and Status='Done' of the first elements - element_id = [elements[0]['id']] # Get the first element's ID - print("Updating element:", element_id) - wqService.updateElements(*element_id, PercentComplete=100, PercentSuccess=100, Status='Done') - - #create a rule and inject it in wma_test account - blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element - print("Block Name:", blockNames[0]) - - #need to create rule here otherwise we do not know which element was updated since the element order changes each time re-fetching (of course we can use the element_id) + rows = self._getWorkflowElements(wqService, specName) + element_id = rows[0]['id'] + blockName = list(rows[0]['value']['Inputs'].keys())[0] + print("Elements in GlobalQueue:", json.dumps(rows, indent=2)) + + # Mock returns only the current request — canDeleteRucioRule skips self and allows deletion + mock_getRequestForInputDataset.return_value = { + "result": [{specName: {"RequestName": specName, "RequestStatus": "running-open"}}] + } + + # Done element at 50%/50% must be skipped + wqService.updateElements(element_id, PercentComplete=50, PercentSuccess=50, Status='Done') + self.assertFalse(cleaner.cleanRucioRules(self.config_obj), + "Should not clean rules for a Done element with less than 100% completion") + + # Now fully complete — create a mock Rucio rule to be cleaned + wqService.updateElements(element_id, PercentComplete=100, PercentSuccess=100, Status='Done') rule_id = cleaner.msRuleCleaner.rucio.createReplicationRule( - names=blockNames[0], + names=blockName, rseExpression="T2_US_Nebraska", copies=1, - grouping="DATASET", - lifetime=360, - account="wma_test", - ask_approval=False, - activity="Production Input", - comment="WMCore test block rule creation" - ) - + account=self.msRuleCleanerConfig['rucioAccount'], + )[0] print("Created Rucio rule 
with ID:", rule_id) - rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - print(rule_info) - - # Re-fetch the elements to see the update - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) - #element order changes each time, so we need to re-fetch the elements - elements = data.get('rows', []) - #elements=wqService.getWQElementsByWorkflow(specName) + print("Rule info:", cleaner.msRuleCleaner.rucio.getRule(rule_id)) + + rows = self._getWorkflowElements(wqService, specName) print("Updated Elements in GlobalQueue:") - for e in elements: + for e in rows: print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) - #print(e["id"], e['Status'], e["PercentComplete"], e["PercentSuccess"]) - - - # Define a variable to hold the dynamic RequestName and RequestStatus - self.dynamicRequestName = specName - self.dynamicRequestStatus = "running-open" # Default value - self.ReferenceInputDatasets = ["/JetHT/Run2012C-v1/RAW"] - - def mock_getRequestForInputDataset_side_effect(inputdataset, reqmgr2Url): - if inputdataset in self.ReferenceInputDatasets: #only respond to the input data set that is used by a request - # Simulate retrieving the workflow details - return { - "result": [ - {self.dynamicRequestName:{ - "RequestName": self.dynamicRequestName, # Use the dynamic RequestName - "RequestStatus": self.dynamicRequestStatus, # Use the dynamic RequestStatus - } - } - ] - } - else: - #return {"status": "error", "message": "Invalid request: inputdataset not found"} - return {"result": []} - - # Assign the side effect to the mock object - mock_getRequestForInputDataset.side_effect = mock_getRequestForInputDataset_side_effect - - results = cleaner.cleanRucioRules(self.config_obj) - print("Results from cleanRucioRules:", json.dumps(results, indent=2)) - #now make sure the rule is cleaned - #keep deleting until success or timeout - 
rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - delResult = False - timeleft = 0 - start_time = time.time() - while rule_info and not delResult and timeleft < 300: - #now delete it - print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msRuleCleaner['rucioAccount'])) - delResult = cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) - print("Deleted Rucio rule with ID:", rule_id, delResult) - if delResult: break - time.sleep(60) - timeleft = time.time() - start_time - - if not delResult and timeleft >= 300: - print("Failed to delete the rule after 5 minutes, exiting...") - - #self.assertTrue(False) - self.assertTrue(results) + + self.assertTrue(cleaner.cleanRucioRules(self.config_obj), + "cleanRucioRules should return True after cleaning a completed element's rules") + + # Skip-set must be populated and contain only IDs still in the Done queue + done_ids = {el.id for el in cleaner.globalQ.backend.getElements(status='Done')} + self.assertTrue(len(cleaner._processedElementIds) > 0, + "Skip-set should be non-empty after a successful clean cycle") + self.assertTrue(cleaner._processedElementIds.issubset(done_ids), + "All skip-set IDs should still be present in the Done queue") + + # Second cycle: element already in skip-set, nothing to do + self.assertFalse(cleaner.cleanRucioRules(self.config_obj), + "Second cycle should return False — element already in skip-set") + + # The cleaner sets rule lifetime to 0 via updateRule (not deleteRule), + # so the mock rule still exists in the store + self.assertTrue(cleaner.msRuleCleaner.rucio.getRule(rule_id), + "Mock rule should still exist after lifetime-0 update (not physically deleted)") @mock.patch('WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner.InputDataRucioRuleCleaner.getRequestForInputDataset') def testInputDataRucioRuleCleanerTwoWorkflowSameInputdata(self, mock_getRequestForInputDataset): """ - Test the 
InputDataRucioRuleCleaner task with two workflows using the same input data set + Two workflows sharing the same input dataset: + Cycle 1: specName1 still running → rule must NOT be cleaned. + Cycle 2: both workflows at 100% → rule IS cleaned. + Cycle 3: specName1 aborted → rule IS cleaned (aborted is not an active status). + Cycle 4: specName1 staging with no WQ elements → rule must NOT be cleaned + (conservative: staging request with no queue yet blocks deletion). """ - #Get workflow description. ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, - #so the workflow description here and the one used in creating workqueue is the same specName = "RerecoSpec" - inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} - - #Create ReRecoSpec as stored in GlobalQueue - specUrl = self.specGenerator.createReRecoSpec(specName, "file", - assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) - - #Second workflow using the same input data set - specName1 = "RerecoSpec1" - specUrl1 = self.specGenerator.createReRecoSpec(specName1, "file", - assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) - - #cleaner = InputDataRucioRuleCleaner(rest=self.mockRest, config=self.config_obj) - cleaner = InputDataRucioRuleCleaner(rest=DummyREST(), config=self.config_obj) - - #Make GlobalQueue - globalQ = globalQueue(DbName='workqueue_t', - QueueURL=self.testInit.couchUrl, - UnittestFlag=True, logger=cleaner.logger, **self.queueParams) - globalQ.queueWork(specUrl, specName, "teamA") - globalQ.queueWork(specUrl1, specName1, "teamB") - - - cleaner.globalQ = globalQ - - #Make MSRuleCleaner - msRuleCleaner = MSRuleCleaner(self.config_obj.msRuleCleaner,logger=cleaner.logger) - msRuleCleaner.resetCounters() - msRuleCleaner.rucio = Rucio.Rucio(self.msRuleCleaner['rucioAccount'], - hostUrl=self.rucioConfigDict['rucio_host'], - authUrl=self.rucioConfigDict['auth_host'], - configDict=self.rucioConfigDict) - 
- cleaner.msRuleCleaner = msRuleCleaner - - #Let try to modify the element in GlobalQueue to have PercentComplete and PercentSuccess set to 100 + specName1 = "RerecoSpec1" + inputDataset = "/JetHT/Run2012C-v1/RAW" + + specUrl = self.specGenerator.createReRecoSpec( + specName, "file", + assignKwargs={'SiteWhitelist': ["T2_XX_SiteA"]}, + InputDataset=inputDataset) + specUrl1 = self.specGenerator.createReRecoSpec( + specName1, "file", + assignKwargs={'SiteWhitelist': ["T2_XX_SiteA"]}, + InputDataset=inputDataset) + + cleaner = self._makeCleaner() + cleaner.globalQ.queueWork(specUrl, specName, "teamA") + cleaner.globalQ.queueWork(specUrl1, specName1, "teamB") + wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') - #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) - - print(f"Elements in GlobalQueue {specName}:") - elements = data.get('rows', []) - print(json.dumps(elements, indent=2)) - - data1 = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName1], 'endkey': [specName1, {}], - 'reduce': False}) - - print(f"Elements in GlobalQueue {specName1}:") - elements1 = data1.get('rows', []) - print(json.dumps(elements1, indent=2)) - - #let update the PercentComplete and PercentSuccess and Status='Done' of the first elements - element_id = [elements[0]['id']] # Get the first element's ID - print("Updating element:", element_id) - wqService.updateElements(*element_id, PercentComplete=100, PercentSuccess=100, Status='Done') - - #create a rule and inject it in wma_test account - blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element - print("Block Name:", blockNames[0]) - - #need to create rule here otherwise we do not know which element was updated since the element order changes each 
time re-fetching (of course we can use the element_id) + rows = self._getWorkflowElements(wqService, specName) + rows1 = self._getWorkflowElements(wqService, specName1) + print(f"Elements in GlobalQueue {specName}:", json.dumps(rows, indent=2)) + print(f"Elements in GlobalQueue {specName1}:", json.dumps(rows1, indent=2)) + + element_id = rows[0]['id'] + blockName = list(rows[0]['value']['Inputs'].keys())[0] + + # Find the specName1 element that covers the same block as specName's first element + element_id1 = rows1[0]['id'] + for e in rows1: + if list(e['value']['Inputs'].keys()) == [blockName]: + element_id1 = e['id'] + break + + # Set specName's element to Done/100%/100% and create a Rucio rule for its block + wqService.updateElements(element_id, PercentComplete=100, PercentSuccess=100, Status='Done') rule_id = cleaner.msRuleCleaner.rucio.createReplicationRule( - names=blockNames[0], + names=blockName, rseExpression="T2_US_Nebraska", copies=1, - grouping="DATASET", - lifetime=360, - account="wma_test", - ask_approval=False, - activity="Production Input", - comment="WMCore test block rule creation" - ) - + account=self.msRuleCleanerConfig['rucioAccount'], + )[0] print("Created Rucio rule with ID:", rule_id) - rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - print(rule_info) - - # Re-fetch the elements to see the update - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) - #element order changes each time, so we need to re-fetch the elements - elements = data.get('rows', []) - #elements=wqService.getWQElementsByWorkflow(specName) - print("Updated Elements in GlobalQueue:") - for e in elements: - print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) - #print(e["id"], e['Status'], e["PercentComplete"], e["PercentSuccess"]) - - - # Define variables to hold the dynamic RequestName and RequestStatus - 
self.dynamicRequestName = specName - self.dynamicRequestName1 = specName1 - self.dynamicRequestStatus = "running-open" - self.dynamicRequestStatus1 = "running-open" - self.ReferenceInputDatasets = ["/JetHT/Run2012C-v1/RAW"] - - def mock_getRequestForInputDataset_side_effect(inputdataset, reqmgr2Url): - if inputdataset in self.ReferenceInputDatasets: #only respond to the input data set that is used by a request - # Simulate retrieving the workflow details - return { - "result": [ - {self.dynamicRequestName:{ - "id": 123, - "RequestName": self.dynamicRequestName, # Use the dynamic RequestName - "RequestStatus": self.dynamicRequestStatus, # Use the dynamic RequestStatus - }, - self.dynamicRequestName1:{ - "id": 456, - "RequestName": self.dynamicRequestName1, # Use the dynamic RequestName - "RequestStatus": self.dynamicRequestStatus1, # Use the dynamic RequestStatus - } - } - ] - } - else: - #return {"status": "error", "message": "Invalid request: inputdataset not found"} - return {"result": []} - - # Assign the side effect to the mock object - mock_getRequestForInputDataset.side_effect = mock_getRequestForInputDataset_side_effect - - #First test to clean the rule. 
It should not be successful since the second workflow is still running - results = cleaner.cleanRucioRules(self.config_obj) - print("Results from cleanRucioRules:", json.dumps(results, indent=2)) - self.assertTrue(not results) - - #Second test, change the second workflow element to 100% and try to clean the rule again - #now change the percentage of workqueue of the second workflow to 100% - #find the id that corresponds to blockname - print("Block Name:", blockNames[0]) - element_id1 = [elements1[0]['id']] # Get the first element's ID - for e in elements1: - inputs = list(e['value']['Inputs'].keys()) - if blockNames == inputs: - print("Found matching element in second workflow:", e['id']) - element_id1 = [e['id']] - break + print("Rule info:", cleaner.msRuleCleaner.rucio.getRule(rule_id)) - print(f"Updating element {element_id1} to (PercentComplete=100, PercentSuccess=100, Status='Done')") - wqService.updateElements(*element_id1, PercentComplete=100, PercentSuccess=100, Status='Done') - # Re-fetch the elements to see the update - data1 = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName1], 'endkey': [specName1, {}], - 'reduce': False}) - #element order changes each time, so we need to re-fetch the elements - elements1 = data1.get('rows', []) - print("Updated Elements in GlobalQueue (workflow 1):") - for e in elements1: - print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) + # Mutable status dict so each cycle can update statuses without redefining the closure + statuses = {specName: "running-open", specName1: "running-open"} - #now try to clean the rule again. It should be successful this time - results = cleaner.cleanRucioRules(self.config_obj) - print("Results from cleanRucioRules with elements from other workflow complete:", json.dumps(results, indent=2)) - self.assertTrue(results) - - #Third test, change the second workflow to aborted and its elements to 0%. 
It should be able to clean the rule - #now change back the percentage of workqueue of the second workflow to 0% - print(f"Updating element {element_id1} to (PercentComplete=0, PercentSuccess=0, Status='Done')") - wqService.updateElements(*element_id1, PercentComplete=0, PercentSuccess=0, Status='Available') - #test the status of other request is aborted - self.dynamicRequestStatus1 = "aborted" - results = cleaner.cleanRucioRules(self.config_obj) - print("Results from cleanRucioRules with other request is aborted:", json.dumps(results, indent=2)) - self.assertTrue(results) #should be true since other request already aborted - - #Fourth test, now testing workflow in staging status and no workqueue is created. It should not clean the rule - globalQ.backend.deleteWQElementsByWorkflow([specName1]) - data1 = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName1], 'endkey': [specName1, {}], - 'reduce': False}) - - print(f"Elements in GlobalQueue {specName1}:") - elements1 = data1.get('rows', []) - print(json.dumps(elements1, indent=2)) - self.dynamicRequestStatus1 = "staging" - results = cleaner.cleanRucioRules(self.config_obj) - print("Results from cleanRucioRules with other request is staging:", json.dumps(results, indent=2)) - self.assertTrue(not results) #this should be false since staging request using the same data should not trigger rule deletion - - - #now make sure the rule is cleaned - #keep deleting until success or timeout - rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - delResult = False - timeleft = 0 - start_time = time.time() - while rule_info and not delResult and timeleft < 300: - #now delete it - print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msRuleCleaner['rucioAccount'])) - delResult = cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) - print("Deleted Rucio rule with ID:", rule_id, delResult) - if delResult: break 
- time.sleep(60) - timeleft = time.time() - start_time - - if not delResult and timeleft >= 300: - print("Failed to delete the rule after 5 minutes, exiting...") - - ''' - #@mock.patch('WMCore.GlobalWorkQueue.CherryPyThreads.InputDataRucioRuleCleaner.InputDataRucioRuleCleaner.getRequestForInputDataset') - def testInputDataRucioRuleCleanerWithThreading(self): - """ - Test the InputDataRucioRuleCleaner task with threading - """ - - #cleaner = InputDataRucioRuleCleaner(rest=self.mockRest, config=self.config_obj) - cleaner = InputDataRucioRuleCleaner(rest=DummyREST(), config=self.config_obj) - - #Get workflow description. ReRecoWorkloadFactory.getTestArguments() is used in createReRecoSpec below, - #so the workflow description here and the one used in creating workqueue is the same - specName = "RerecoSpec" - inputdataset = {"InputDataset": "/JetHT/Run2012C-v1/RAW"} - - #Create ReRecoSpec as stored in GlobalQueue - specUrl = self.specGenerator.createReRecoSpec(specName, "file", - assignKwargs={'SiteWhitelist':["T2_XX_SiteA"]},InputDataset=inputdataset["InputDataset"]) - - #Make GlobalQueue - globalQ = globalQueue(DbName='workqueue_t', - QueueURL=self.testInit.couchUrl, - UnittestFlag=True, logger=cleaner.logger, **self.queueParams) - globalQ.queueWork(specUrl, specName, "teamA") - cleaner.globalQ = globalQ - - #Make MSRuleCleaner - msRuleCleaner = MSRuleCleaner(self.config_obj.msRuleCleaner,logger=cleaner.logger) - msRuleCleaner.resetCounters() - msRuleCleaner.rucio = Rucio.Rucio(self.msRuleCleaner['rucioAccount'], - hostUrl=self.rucioConfigDict['rucio_host'], - authUrl=self.rucioConfigDict['auth_host'], - configDict=self.rucioConfigDict) - cleaner.msRuleCleaner = msRuleCleaner - - # Start CherryPy engine - print('CherryPy engine starting...') - cherrypy.engine.start() - # Give CherryPy a moment to start and modify the element in GlobalQueue after 5 seconds and before the next run of the periodic task - time.sleep(5) - - #Let try to modify the element in GlobalQueue to 
have PercentComplete and PercentSuccess set to 100 - wqService = WorkQueueDS(self.testInit.couchUrl, 'workqueue_t') - #Use this instead of wqService.getWQElementsByWorkflow(workflowName) to have the element'id' - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - {'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) - - print("Elements in GlobalQueue:") - elements = data.get('rows', []) - print(json.dumps(elements, indent=2)) - - #let update the PercentComplete and PercentSuccess and Status='Done' of the first elements - element_id = [elements[0]['id']] # Get the first element's ID - print("Updating element:", element_id) - wqService.updateElements(*element_id, PercentComplete=100, PercentSuccess=100, Status='Done') - - #create a rule and inject it in wma_test account - blockNames = list(elements[0]['value']['Inputs'].keys()) # Get the block name from the first element - - #need to create rule here otherwise we do not know which element was updated since the element order changes each time re-fetching (of course we can use the element_id) - rule_id = cleaner.msRuleCleaner.rucio.createReplicationRule( - names=blockNames[0], - rseExpression="T2_US_Nebraska", - copies=1, - grouping="DATASET", - lifetime=360, - account="wma_test", - ask_approval=False, - activity="Production Input", - comment="WMCore test block rule creation" - ) + def side_effect(inputdataset, reqmgr2Url): + if inputdataset == inputDataset: + return {"result": [{ + specName: {"RequestName": specName, "RequestStatus": statuses[specName]}, + specName1: {"RequestName": specName1, "RequestStatus": statuses[specName1]}, + }]} + return {"result": []} - print("Created Rucio rule with ID:", rule_id) - rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - print(rule_info) + mock_getRequestForInputDataset.side_effect = side_effect - # Re-fetch the elements to see the update - data = wqService.db.loadView('WorkQueue', 'elementsDetailByWorkflowAndStatus', - 
{'startkey': [specName], 'endkey': [specName, {}], - 'reduce': False}) + # Cycle 1: specName1 element is still at 0%/0% → block deletion + self.assertFalse(cleaner.cleanRucioRules(self.config_obj), + "Rule must not be cleaned while specName1 is still processing the same block") - elements = data.get('rows', []) - print("Updated Elements in GlobalQueue:") - for e in elements: + # Cycle 2: specName1 element completes → deletion allowed + wqService.updateElements(element_id1, PercentComplete=100, PercentSuccess=100, Status='Done') + print(f"Updated Elements in GlobalQueue {specName1}:") + for e in self._getWorkflowElements(wqService, specName1): print(e["id"], e['value']['Status'], e['value']["PercentComplete"], e['value']["PercentSuccess"]) - - time.sleep(20) - - print('CherryPy engine exiting...') - cherrypy.engine.exit() - - #now continuously check the rule status until it is cleaned and exit after 10 minutes - rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - timeleft = 0 - start_time = time.time() - while rule_info and timeleft < 600: # Check for 10 minutes - print("Rule still exists:", rule_id[0], rule_info) - time.sleep(60) - timeleft = time.time() - start_time - rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - - rule_info_for_check = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - - #now make sure the rule should be cleaned (note that the rule may not be cleaned immediately after the periodic task execution (~5 mins), but we just clean it again here) - rule_info = cleaner.msRuleCleaner.rucio.getRule(rule_id[0]) - delResult = False - if not rule_info: - print("Rule not found.") - - #keep deleting until success or timeout - timeleft = 0 - start_time = time.time() - while rule_info and not delResult and timeleft < 300: - #now delete it - print('Manually deleting rucio rules: ', blockNames[0], cleaner.msRuleCleaner.rucio.listDataRules(blockNames[0], account=self.msRuleCleaner['rucioAccount'])) - delResult = 
cleaner.msRuleCleaner.rucio.deleteRule(rule_id[0]) - print("Deleted Rucio rule with ID:", rule_id, delResult) - if delResult: break - time.sleep(60) - timeleft = time.time() - start_time - - if not delResult and timeleft >= 300: - print("Failed to delete the rule after 5 minutes, exiting...") - - self.assertTrue(not rule_info_for_check, "Rule not deleted successfully after periodic task execution.") - ''' - + self.assertTrue(cleaner.cleanRucioRules(self.config_obj), + "Rule should be cleaned once both workflows have completed the block") + + # Cycle 3: specName1 is aborted (inactive) → deletion allowed even though its element + # is reset back to incomplete + wqService.updateElements(element_id1, PercentComplete=0, PercentSuccess=0, Status='Available') + statuses[specName1] = "aborted" + cleaner._processedElementIds = set() # reset so this scenario is evaluated independently + self.assertTrue(cleaner.cleanRucioRules(self.config_obj), + "Rule should be cleaned when the other request is aborted") + + # Cycle 4: specName1 is staging (active) but has no WQ elements → conservative block + cleaner.globalQ.backend.deleteWQElementsByWorkflow([specName1]) + statuses[specName1] = "staging" + cleaner._processedElementIds = set() # reset so this scenario is evaluated independently + self.assertFalse(cleaner.cleanRucioRules(self.config_obj), + "Rule must not be cleaned when a staging request has no queue elements yet") + + # The cleaner sets rule lifetime to 0 via updateRule (not deleteRule), + # so the mock rule still exists in the store + self.assertTrue(cleaner.msRuleCleaner.rucio.getRule(rule_id), + "Mock rule should still exist after lifetime-0 update (not physically deleted)") + + if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From 5941cb5898df1d2e84912915f68e4e431ce80234 Mon Sep 17 00:00:00 2001 From: Duong Date: Tue, 7 Apr 2026 10:38:58 -0400 Subject: [PATCH 12/14] retrigger CI From 49513ee79600f2c2ded23d78a49a5e2ffca021e8 Mon 
Sep 17 00:00:00 2001 From: Duong Date: Wed, 15 Apr 2026 10:04:44 -0400 Subject: [PATCH 13/14] remove status=Done --- .../CherryPyThreads/InputDataRucioRuleCleaner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py index 3f52a36c18..0c9fa91ce4 100644 --- a/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py +++ b/src/python/WMCore/GlobalWorkQueue/CherryPyThreads/InputDataRucioRuleCleaner.py @@ -194,7 +194,8 @@ def cleanRucioRules(self, config): tStart = time.time() - globalQueueElements = self.globalQ.backend.getElements(status='Done') + #globalQueueElements = self.globalQ.backend.getElements(status='Done') + globalQueueElements = self.globalQ.backend.getElements() # Trim skip-set to only IDs still present in the queue, preventing unbounded growth currentIds = {el.id for el in globalQueueElements} @@ -220,6 +221,8 @@ def cleanRucioRules(self, config): percentSuccess = element.get('PercentSuccess', 0) if percentComplete == 100 and percentSuccess == 100: + self.logger.info("Element %s workflow=%s status=%s PercentComplete=%s PercentSuccess=%s", + element.id, requestName, element.get('Status'), percentComplete, percentSuccess) # Structure required by MSRuleCleaner.cleanRucioRules() rulesToClean = {'PlineMarkers': ['Current'], 'RulesToClean': {'Current': []}, 'CleanupStatus': {'Current': []}} From 9a08a76b54c414794ab04523085667d3e7781293 Mon Sep 17 00:00:00 2001 From: Duong Date: Wed, 29 Apr 2026 09:01:33 -0400 Subject: [PATCH 14/14] retrigger CI