Skip to content

Commit ff6487d

Browse files
committed
Implement worker command wrapping in autoalloc
1 parent 55a61c6 commit ff6487d

12 files changed

Lines changed: 77 additions & 16 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
* New policy `tight` (and `tight!`) that is the original implementation of `compact`.
1919
The policy `compact` now behaves as described in the section "Changes".
2020
* Resource policy `compact!` is now allowed to take fractional resource request.
21-
* There is a new command `hq alloc cat <alloc-id> <stdout/stderr>`, which can be used
21+
* New command `hq alloc cat <alloc-id> <stdout/stderr>`, which can be used
2222
to debug the output of allocations submitted by the automatic allocator.
23-
* There is a new command `hq server wait` that repeatedly tries to connect to a server with a configurable timeout.
23+
* New command `hq server wait` that repeatedly tries to connect to a server with a configurable timeout.
2424
This is useful for deployment scripts that need to wait for server availability.
25+
* New `hq alloc add` parameter called `--worker-wrap-cmd`. It can be used to start
26+
workers on allocated nodes using some wrapping mechanism (e.g. Podman).
2527

2628
### Fixes
2729

crates/hyperqueue/src/client/commands/autoalloc.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,17 @@ struct SharedQueueOpts {
149149
#[arg(long)]
150150
worker_stop_cmd: Option<String>,
151151

152+
/// A command that will be prepended before arguments required for starting a worker
153+
///
154+
/// Normally, the worker is started with `hq worker start ...`.
155+
/// If you specify e.g. `--worker-wrap-cmd "foo bar --arg=1"`, the worker will be started using
156+
/// `foo bar --arg=1 hq worker start`.
157+
///
158+
/// Note that if multiple workers are started in an allocation (on multiple nodes), then the
159+
/// provided wrapping command will be executed on each node.
160+
#[arg(long)]
161+
worker_wrap_cmd: Option<String>,
162+
152163
#[arg(
153164
long,
154165
value_parser = parse_hms_or_human_time,
@@ -290,6 +301,7 @@ fn args_to_params(manager: ManagerType, args: SharedQueueOpts) -> anyhow::Result
290301
worker_args,
291302
worker_start_cmd,
292303
worker_stop_cmd,
304+
worker_wrap_cmd,
293305
worker_time_limit,
294306
min_utilization,
295307
additional_args,
@@ -398,6 +410,7 @@ wasted allocation duration."
398410
additional_args,
399411
worker_start_cmd,
400412
worker_stop_cmd,
413+
worker_wrap_cmd,
401414
max_worker_count,
402415
worker_args,
403416
idle_timeout,

crates/hyperqueue/src/server/autoalloc/process.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2640,6 +2640,7 @@ mod tests {
26402640
additional_args: vec![],
26412641
worker_start_cmd: None,
26422642
worker_stop_cmd: None,
2643+
worker_wrap_cmd: None,
26432644
cli_resource_descriptor: cli_resources,
26442645
worker_args: vec![],
26452646
idle_timeout: None,

crates/hyperqueue/src/server/autoalloc/queue/common.rs

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,14 @@ pub fn build_worker_args(
158158
write!(env, "RUST_LOG={log_env} ").unwrap();
159159
}
160160

161+
let wrap_cmd = params
162+
.worker_wrap_cmd
163+
.as_deref()
164+
.map(|cmd| format!("{} ", cmd.trim()))
165+
.unwrap_or_default();
166+
161167
let mut args = format!(
162-
"{env}{hq} worker start --idle-timeout \"{idle_timeout}\" --manager \"{manager}\" --server-dir \"{server_dir}\"",
168+
"{env}{wrap_cmd}{hq} worker start --idle-timeout \"{idle_timeout}\" --manager \"{manager}\" --server-dir \"{server_dir}\"",
163169
hq = hq_path.display(),
164170
server_dir = server_dir.display()
165171
);
@@ -171,7 +177,7 @@ pub fn build_worker_args(
171177
args
172178
}
173179

174-
pub fn wrap_worker_cmd(
180+
pub fn add_start_stop_worker_commands(
175181
mut worker_cmd: String,
176182
worker_start_cmd: Option<&str>,
177183
worker_stop_cmd: Option<&str>,
@@ -189,37 +195,41 @@ pub fn wrap_worker_cmd(
189195
#[cfg(test)]
190196
mod tests {
191197
use crate::common::utils::fs::normalize_exe_path;
192-
use crate::server::autoalloc::queue::common::wrap_worker_cmd;
198+
use crate::server::autoalloc::queue::common::add_start_stop_worker_commands;
193199
use std::path::PathBuf;
194200

195201
#[test]
196202
fn wrap_cmd_noop() {
197203
assert_eq!(
198-
wrap_worker_cmd("foo".to_string(), None, None),
204+
add_start_stop_worker_commands("foo".to_string(), None, None),
199205
"foo".to_string()
200206
);
201207
}
202208

203209
#[test]
204210
fn wrap_cmd_start() {
205211
assert_eq!(
206-
wrap_worker_cmd("foo bar".to_string(), Some("init.sh"), None),
212+
add_start_stop_worker_commands("foo bar".to_string(), Some("init.sh"), None),
207213
"init.sh && foo bar".to_string()
208214
);
209215
}
210216

211217
#[test]
212218
fn wrap_cmd_stop() {
213219
assert_eq!(
214-
wrap_worker_cmd("foo bar".to_string(), None, Some("unload.sh")),
220+
add_start_stop_worker_commands("foo bar".to_string(), None, Some("unload.sh")),
215221
"foo bar; unload.sh".to_string()
216222
);
217223
}
218224

219225
#[test]
220226
fn wrap_cmd_start_stop() {
221227
assert_eq!(
222-
wrap_worker_cmd("foo bar".to_string(), Some("init.sh"), Some("unload.sh")),
228+
add_start_stop_worker_commands(
229+
"foo bar".to_string(),
230+
Some("init.sh"),
231+
Some("unload.sh")
232+
),
223233
"init.sh && foo bar; unload.sh".to_string()
224234
);
225235
}

crates/hyperqueue/src/server/autoalloc/queue/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ pub struct QueueParameters {
2626

2727
pub worker_start_cmd: Option<String>,
2828
pub worker_stop_cmd: Option<String>,
29+
pub worker_wrap_cmd: Option<String>,
2930

3031
/// Resources descriptor constructed from worker CLI options
3132
pub cli_resource_descriptor: Option<ResourceDescriptor>,

crates/hyperqueue/src/server/autoalloc/queue/pbs.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ use crate::common::manager::info::ManagerType;
1111
use crate::common::manager::pbs::{format_pbs_duration, parse_pbs_datetime};
1212
use crate::common::utils::time::local_to_system_time;
1313
use crate::server::autoalloc::queue::common::{
14-
ExternalHandler, build_worker_args, check_command_output, create_allocation_dir,
15-
create_command, format_allocation_name, submit_script, wrap_worker_cmd,
14+
ExternalHandler, add_start_stop_worker_commands, build_worker_args, check_command_output,
15+
create_allocation_dir, create_command, format_allocation_name, submit_script,
1616
};
1717
use crate::server::autoalloc::queue::{
1818
AllocationExternalStatus, AllocationStatusMap, AllocationSubmissionResult, QueueHandler,
@@ -55,7 +55,7 @@ impl QueueHandler for PbsHandler {
5555
)?;
5656
let worker_args =
5757
build_worker_args(&hq_path, ManagerType::Pbs, &server_directory, &params);
58-
let worker_args = wrap_worker_cmd(
58+
let worker_args = add_start_stop_worker_commands(
5959
worker_args,
6060
params.worker_start_cmd.as_deref(),
6161
params.worker_stop_cmd.as_deref(),

crates/hyperqueue/src/server/autoalloc/queue/slurm.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ use crate::common::manager::slurm::{
1414
};
1515
use crate::common::utils::time::local_to_system_time;
1616
use crate::server::autoalloc::queue::common::{
17-
ExternalHandler, build_worker_args, create_allocation_dir, create_command,
18-
format_allocation_name, submit_script, wrap_worker_cmd,
17+
ExternalHandler, add_start_stop_worker_commands, build_worker_args, create_allocation_dir,
18+
create_command, format_allocation_name, submit_script,
1919
};
2020
use crate::server::autoalloc::queue::{
2121
AllocationExternalStatus, AllocationStatusMap, AllocationSubmissionResult, QueueHandler,
@@ -59,7 +59,7 @@ impl QueueHandler for SlurmHandler {
5959

6060
let worker_args =
6161
build_worker_args(&hq_path, ManagerType::Slurm, &server_directory, &params);
62-
let worker_args = wrap_worker_cmd(
62+
let worker_args = add_start_stop_worker_commands(
6363
worker_args,
6464
params.worker_start_cmd.as_deref(),
6565
params.worker_stop_cmd.as_deref(),

crates/hyperqueue/src/server/autoalloc/state.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,6 +589,7 @@ mod tests {
589589
additional_args: vec![],
590590
worker_start_cmd: None,
591591
worker_stop_cmd: None,
592+
worker_wrap_cmd: None,
592593
cli_resource_descriptor: None,
593594
worker_args: vec![],
594595
idle_timeout: None,

crates/tako/benches/benchmarks/core.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use criterion::measurement::WallTime;
2-
use criterion::{BatchSize, BenchmarkGroup, BenchmarkId, Criterion, black_box};
2+
use criterion::{BatchSize, BenchmarkGroup, BenchmarkId, Criterion};
3+
use std::hint::black_box;
34
use tako::Set;
45
use tako::TaskId;
56
use tako::internal::server::core::Core;

docs/deployment/allocation.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,14 @@ that node. You can use it e.g. to clean up a previously initialized environment
204204
The execution of this command is best-effort! It is not guaranteed that the command will always be executed. For example,
205205
PBS/Slurm can kill the allocation without giving HQ a chance to run the command.
206206

207+
#### Worker wrap command
208+
209+
- Format: `--worker-wrap-cmd <cmds>`
210+
211+
Specifies a string that will be prepended to the command used to start a worker on a node inside an allocation.
212+
213+
For example, if you specify `--worker-wrap-cmd "podman run"`, each worker will be started using `podman run hq worker start ...`.
214+
207215
#### Worker time limit
208216

209217
- Format[^1]: `--worker-time-limit <duration>`

0 commit comments

Comments
 (0)