Skip to content

Commit a17f59b

Browse files
committed
Do not crash if Slurm returns an invalid remaining time limit
1 parent ee9796b commit a17f59b

2 files changed

Lines changed: 13 additions & 5 deletions

File tree

crates/hyperqueue/src/common/manager/slurm.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use crate::common::manager::common::format_duration;
22
use crate::common::utils::time::parse_hms_time;
3+
use anyhow::Context;
34
use std::process::Command;
45
use std::time::Duration;
56
use tako::Map;
@@ -32,13 +33,15 @@ pub fn parse_remaining_timelimit(output: &str) -> anyhow::Result<Duration> {
3233
items
3334
.get("RunTime")
3435
.ok_or_else(|| anyhow::anyhow!("RunTime entry not found"))?,
35-
)?;
36+
)
37+
.context("Cannot parse Slurm runtime")?;
3638

3739
let time_limit = parse_slurm_duration(
3840
items
3941
.get("TimeLimit")
4042
.ok_or_else(|| anyhow::anyhow!("TimeLimit entry not found"))?,
41-
)?;
43+
)
44+
.context("Cannot parse Slurm timelimit")?;
4245

4346
if time_limit < run_time {
4447
anyhow::bail!("Slurm: TimeLimit is smaller than RunTime");

crates/hyperqueue/src/worker/bootstrap.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,13 @@ pub fn try_get_slurm_info() -> anyhow::Result<ManagerInfo> {
177177
anyhow!("SLURM_JOB_ID/SLURM_JOBID not found. The process is not running under SLURM")
178178
})?;
179179

180-
let duration = slurm::get_remaining_timelimit(&manager_job_id)
181-
.expect("Could not get remaining time from scontrol");
180+
let time_limit = match slurm::get_remaining_timelimit(&manager_job_id) {
181+
Ok(duration) => Some(duration),
182+
Err(error) => {
183+
log::warn!("Cannot get remaining worker timelimit from Slurm: {error:?}");
184+
None
185+
}
186+
};
182187

183188
let max_memory_mb = std::env::var("SLURM_MEM_PER_NODE")
184189
.ok()
@@ -189,7 +194,7 @@ pub fn try_get_slurm_info() -> anyhow::Result<ManagerInfo> {
189194
Ok(ManagerInfo {
190195
manager: ManagerType::Slurm,
191196
allocation_id: manager_job_id,
192-
time_limit: Some(duration),
197+
time_limit,
193198
max_memory_mb,
194199
})
195200
}

0 commit comments

Comments
 (0)