Skip to content

Commit 5b2ea99

Browse files
committed
Expose min-utilization autoalloc queue parameter
1 parent fde1535 commit 5b2ea99

7 files changed

Lines changed: 49 additions & 3 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@ The automatic allocator has been finally reimplemented, and is now much better:
1414
* It now uses information from the scheduler to determine how many allocations to spawn, and thus it can react to the
1515
current computational load much more accurately. It should also be less "eager".
1616
* It properly supports multi-node tasks.
17-
* It considers computational load across all allocation queues (before, each queue was treated separately, which led to creating too many submissions).
17+
* It considers computational load across all allocation queues (before, each queue was treated separately, which led to
18+
creating too many submissions).
19+
* It now exposes a `min-utilization` parameter, which can be used to avoid spawning an allocation that couldn't be utilized
20+
enough.
1821

1922
As this is a large behavioral change, we would be happy to hear your feedback!
2023

crates/hyperqueue/src/client/commands/autoalloc.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,15 @@ The limit must not be larger than the allocation time limit."#)
140140
)]
141141
worker_time_limit: Option<Duration>,
142142

143+
/// Minimal expected utilization required to submit an allocation into this queue
144+
///
145+
/// Autoalloc will not spawn an allocation unless the scheduler thinks it could use at least
146+
/// a `min_utilization` fraction (a ratio between 0.0 and 1.0) of the resources of workers in the allocation.
147+
///
148+
/// The default is 0.0.
149+
#[arg(long)]
150+
min_utilization: Option<f32>,
151+
143152
/// Additional arguments passed to the submit command
144153
#[arg(trailing_var_arg(true))]
145154
additional_args: Vec<String>,
@@ -241,11 +250,20 @@ fn args_to_params(
241250
worker_start_cmd,
242251
worker_stop_cmd,
243252
worker_time_limit,
253+
min_utilization,
244254
additional_args,
245255
on_server_lost,
246256
no_dry_run: _,
247257
} = args;
248258

259+
if let Some(min_utilization) = min_utilization {
260+
if !(0.0..=1.0).contains(&min_utilization) {
261+
return Err(anyhow::anyhow!(
262+
"Minimal utilization has to be in the interval [0.0, 1.0]."
263+
));
264+
}
265+
}
266+
249267
if let Some(ref idle_timeout) = worker_args.idle_timeout {
250268
if *idle_timeout > Duration::from_secs(60 * 10) {
251269
log::warn!(
@@ -317,6 +335,7 @@ wasted allocation duration."
317335
max_workers_per_alloc,
318336
backlog,
319337
timelimit: time_limit,
338+
min_utilization,
320339
name,
321340
additional_args,
322341
worker_start_cmd,

crates/hyperqueue/src/server/autoalloc/process.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ pub fn create_queue_info(params: AllocationQueueParams) -> QueueInfo {
174174
max_workers_per_alloc,
175175
backlog,
176176
timelimit,
177+
min_utilization,
177178
additional_args,
178179
max_worker_count,
179180
worker_start_cmd,
@@ -192,6 +193,7 @@ pub fn create_queue_info(params: AllocationQueueParams) -> QueueInfo {
192193
idle_timeout,
193194
worker_start_cmd,
194195
worker_stop_cmd,
196+
min_utilization,
195197
)
196198
}
197199

@@ -430,8 +432,7 @@ fn create_queue_worker_query(queue: &AllocationQueue) -> WorkerTypeQuery {
430432
// How many workers can we provide at the moment
431433
max_sn_workers: info.backlog() * info.max_workers_per_alloc(),
432434
max_workers_per_allocation: info.max_workers_per_alloc(),
433-
// TODO: expose this through the CLI
434-
min_utilization: 0.0,
435+
min_utilization: info.min_utilization().unwrap_or(0.0),
435436
}
436437
}
437438

@@ -2010,6 +2011,7 @@ mod tests {
20102011
timelimit: queue_info.timelimit(),
20112012
name: Some("Queue".to_string()),
20122013
max_worker_count: queue_info.max_worker_count(),
2014+
min_utilization: None,
20132015
additional_args: vec![],
20142016
worker_start_cmd: None,
20152017
worker_stop_cmd: None,
@@ -2424,6 +2426,8 @@ mod tests {
24242426
limiter_max_submit_fails: u64,
24252427
#[builder(default = "vec![Duration::ZERO]")]
24262428
limiter_delays: Vec<Duration>,
2429+
#[builder(default)]
2430+
min_utilization: Option<f32>,
24272431
}
24282432

24292433
impl QueueBuilder {
@@ -2437,6 +2441,7 @@ mod tests {
24372441
limiter_max_alloc_fails,
24382442
limiter_max_submit_fails,
24392443
limiter_delays,
2444+
min_utilization,
24402445
} = self.finish().unwrap();
24412446
(
24422447
QueueInfo::new(
@@ -2450,6 +2455,7 @@ mod tests {
24502455
None,
24512456
None,
24522457
None,
2458+
min_utilization,
24532459
),
24542460
RateLimiter::new(
24552461
limiter_delays,

crates/hyperqueue/src/server/autoalloc/queue/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ pub struct QueueInfo {
2424
idle_timeout: Option<Duration>,
2525
worker_start_cmd: Option<String>,
2626
worker_stop_cmd: Option<String>,
27+
min_utilization: Option<f32>,
2728
}
2829

2930
impl QueueInfo {
@@ -39,6 +40,7 @@ impl QueueInfo {
3940
idle_timeout: Option<Duration>,
4041
worker_start_cmd: Option<String>,
4142
worker_stop_cmd: Option<String>,
43+
min_utilization: Option<f32>,
4244
) -> Self {
4345
Self {
4446
manager,
@@ -51,6 +53,7 @@ impl QueueInfo {
5153
idle_timeout,
5254
worker_start_cmd,
5355
worker_stop_cmd,
56+
min_utilization,
5457
}
5558
}
5659

@@ -81,6 +84,10 @@ impl QueueInfo {
8184
pub fn worker_args(&self) -> &[String] {
8285
&self.worker_args
8386
}
87+
88+
pub fn min_utilization(&self) -> Option<f32> {
89+
self.min_utilization
90+
}
8491
}
8592

8693
#[derive(Debug)]

crates/hyperqueue/src/server/autoalloc/state.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,7 @@ mod tests {
509509
None,
510510
None,
511511
None,
512+
None,
512513
),
513514
None,
514515
Box::new(NullHandler),

crates/hyperqueue/src/transfer/messages.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,7 @@ pub struct AllocationQueueParams {
311311
pub timelimit: Duration,
312312
pub name: Option<String>,
313313
pub max_worker_count: Option<u32>,
314+
pub min_utilization: Option<f32>,
314315
pub additional_args: Vec<String>,
315316

316317
pub worker_start_cmd: Option<String>,

docs/deployment/allocation.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,15 @@ Maximum number of workers that can be queued or running in the allocation queue.
107107
limited by the manager (PBS/Slurm), but you can use this parameter to make the limit smaller, for example if you also want
108108
to manage allocations outside HyperQueue.
109109

110+
#### Minimal utilization
111+
Format: `--min-utilization <ratio>`
112+
113+
Minimal utilization determines how well the scheduler has to be able to utilize workers from a submitted allocation. If the scheduler thinks that it can make use of `N%` of worker resources in a single allocation of this queue, `min-utilization` has to be at most `N / 100` (e.g. at most `0.5` for `50%`), otherwise the allocation will not be created.
114+
115+
It has to be a floating point number between 0.0 and 1.0.
116+
117+
The default minimal utilization is `0`, which means that an allocation will be created if the scheduler thinks that it can use any (non-zero) amount of resources of worker(s) in the allocation.
118+
110119
#### Worker resources
111120
You can specify [CPU](../jobs/cresources.md) and [generic](../jobs/resources.md) resources of workers spawned by the
112121
allocation queue. The name and syntax of these parameters is the same as when you create a

0 commit comments

Comments
 (0)