# Source: ci/bench.yaml in NVIDIA/cccl (main branch)
# # CCCL PR benchmark request config.
#
# ## Overview:
#
# This file is used to request benchmark comparisons in PR CI.
#
# This file must match ci/bench.template.yaml to merge.
# CI branch protections will fail if they differ. Reset before merging.
#
# To update the defaults (e.g. new GPU pools), modify both this file and
# ci/bench.template.yaml together in the same PR.
#
# !! Strongly consider appending the following to your **commit messages** while benchmarking. !!
# This keeps CI from running non-benchmark jobs when they are not needed.
#
#     [bench-only]
#
# ## Quick start:
#
# 1. Add one or more benchmark regexes under benchmarks.filters.cub and/or
#    benchmarks.filters.python.
# 2. Enable at least one GPU by uncommenting or adding entries in benchmarks.gpus.
# 3. Push and inspect the dispatched benchmark jobs/artifacts.
# 4. Remove/reset benchmark-request edits before final merge.

# Root of the benchmark request; every setting lives under this key.
benchmarks:

  # Benchmark filters grouped by project.
  # Each project key takes a list of regex strings. While every entry is
  # commented out (as below) the key has no value and parses as null, not as
  # an empty list.
  filters:
    # CUB C++ benchmark filters (regex matched against ninja target names).
    cub:
      # Examples (uncomment, or add entries at this indentation). Keep the
      # single quotes: they leave the regex backslashes literal.
      # - '^cub\.bench\.for_each\.base'
      # - '^cub\.bench\.reduce\.(sum|min)\.'

    # Python benchmark filters (regex matched against paths under benchmarks/).
    python:
      # Examples (uncomment, or add entries at this indentation). Keep the
      # single quotes: they leave the regex backslashes literal.
      # - 'compute/reduce/sum\.py'
      # - 'compute/transform/.*\.py'
      # - 'coop/bench_warp_reduce\.py'

  # Select GPUs. These are limited and shared; be intentional and conservative.
  # All entries are commented out here, so the list is empty (null) until at
  # least one line is uncommented or added (see "Quick start", step 2).
  # The trailing notes presumably give compute capability and memory size
  # -- confirm before relying on them.
  gpus:
    # - "t4"         # sm_75, 16 GB
    # - "rtx2080"    # sm_75,  8 GB
    # - "rtxa6000"   # sm_86, 48 GB
    # - "l4"         # sm_89, 24 GB
    # - "rtx4090"    # sm_89, 24 GB
    # - "h100"       # sm_90, 80 GB
    # - "rtxpro6000" # sm_120

  # Extra .devcontainer/launch.sh -d args.
  # The commented line is an example override; the active empty string adds
  # no extra arguments (per the inline note: latest nvcc + gcc).
  # launch_args: "--cuda 13.1 --host gcc14"
  launch_args: "" # Latest nvcc + gcc

  # Advanced:
  # base_ref / test_ref: presumably the git refs of the baseline and of the
  # candidate being compared -- TODO confirm against the CI scripts.
  base_ref: "origin/main"
  test_ref: "HEAD"
  # Presumably the GPU architecture to build for -- TODO confirm what
  # "native" resolves to on the benchmark runners.
  arch: "native"
  # The `>-` folded scalar joins the lines below into one space-separated
  # string with no trailing newline. Do not add comments inside the block:
  # they would become part of the value.
  # NOTE(review): flags are presumably forwarded to nvbench -- confirm.
  nvbench_args: >-
    --timeout 30
    --skip-time 15e-6
    --stopping-criterion entropy
    --throttle-threshold 90
    --throttle-recovery-delay 0.15
  # Presumably extra arguments for the comparison step; the empty string
  # adds none -- TODO confirm against the CI scripts.
  nvbench_compare_args: ""