KernelTuner · benvanwerkhoven · Jun 5, 2026 · Jun 5, 2026
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
@@ -18,6 +18,7 @@
 from kernel_tuner.accuracy import Tunable
 from kernel_tuner.observers.observer import BenchmarkObserver, ContinuousObserver, OutputObserver, PrologueObserver
 from kernel_tuner.observers.tegra import TegraObserver
+from kernel_tuner.backends.backend import GPUBackend
 
 try:
     import torch
@@ -230,6 +231,24 @@
         raise TypeError(f"Invalid observer: {observer!r} does not extend BenchmarkObserver")
 
 
+def _select_default_cuda_backend(lang):
+    """ Select default CUDA backend, looks for which backends are installed. """
+    # First try cuda-python (nvcuda)
+    from kernel_tuner.backends.nvcuda import CudaFunctions, driver
+    if driver:
+        return CudaFunctions
+    # Then try Cupy
+    if _get_cupy():
+        from kernel_tuner.backends.cupy import CupyFunctions
+        return CupyFunctions
+    # Then try PyCUDA
+    from kernel_tuner.backends.pycuda import PyCudaFunctions, pycuda_available
+    if pycuda_available:
+        return PyCudaFunctions
+    # Ran out of options
+    raise RuntimeError("Error: CUDA selected/detected, but missing CUDA dependencies, please run 'pip install cuda-python', or install cupy or pycuda.")
+
+
 class DeviceInterface(object):
     """Class that offers a High-Level Device Interface to the rest of the Kernel Tuner."""
 
@@ -286,67 +305,46 @@
         observer_args = dict(device=device, platform=platform, compiler=compiler, lang=lang)
         observers = [instantiate_observer(ob, observer_args) for ob in observers]
 
-        if lang.upper() == "CUDA":
+        backend_options = dict(compiler_options=compiler_options, iterations=iterations)
+
+        # first check for explicitly selected backends
+        if lang.upper() == "PYCUDA":
             from kernel_tuner.backends.pycuda import PyCudaFunctions
-            dev = PyCudaFunctions(
-                device,
-                compiler_options=compiler_options,
-                iterations=iterations,
-                observers=observers,
-            )
+            backend = PyCudaFunctions
         elif lang.upper() == "CUPY":
             from kernel_tuner.backends.cupy import CupyFunctions
-            dev = CupyFunctions(
-                device,
-                compiler_options=compiler_options,
-                iterations=iterations,
-                observers=observers,
-            )
+            backend = CupyFunctions
         elif lang.upper() == "NVCUDA":
             from kernel_tuner.backends.nvcuda import CudaFunctions
-            dev = CudaFunctions(
-                device,
-                compiler_options=compiler_options,
-                iterations=iterations,
-                observers=observers,
-            )
+            backend = CudaFunctions
+        elif lang.upper() == "CUDA":
+            # Select default CUDA backend, based on availability
+            backend = _select_default_cuda_backend(lang)
         elif lang.upper() == "OPENCL":
             from kernel_tuner.backends.opencl import OpenCLFunctions
-            dev = OpenCLFunctions(
-                device,
-                platform,
-                compiler_options=compiler_options,
-                iterations=iterations,
-                observers=observers,
-            )
-        elif lang.upper() in ["C", "FORTRAN"]:
-            from kernel_tuner.backends.compiler import CompilerFunctions
-            dev = CompilerFunctions(
-                compiler=compiler,
-                compiler_options=compiler_options,
-                iterations=iterations,
-                observers=observers,
-            )
+            backend = OpenCLFunctions
+            backend_options["platform"] = platform
         elif lang.upper() == "HIP":
             from kernel_tuner.backends.hip import HipFunctions
-            dev = HipFunctions(
-                device,
-                compiler_options=compiler_options,
-                iterations=iterations,
-                observers=observers,
-            )
+            backend = HipFunctions
+        elif lang.upper() in ["C", "FORTRAN"]:
+            from kernel_tuner.backends.compiler import CompilerFunctions
+            backend = CompilerFunctions
+            backend_options["compiler"] = compiler
+            backend_options["observers"] = observers
         elif lang.upper() == "HYPERTUNER":
             from kernel_tuner.backends.hypertuner import HypertunerFunctions
-            dev = HypertunerFunctions(
-                iterations=iterations,
-                compiler_options=compiler_options
-            )
+            backend = HypertunerFunctions
             self.requires_warmup = False
         else:
             raise NotImplementedError(
                 "Sorry, support for languages other than CUDA, OpenCL, HIP, C, and Fortran is not implemented yet"
             )
-        self.dev = dev
+
+        if issubclass(backend, GPUBackend):
+            backend_options["device"] = device
+            backend_options["observers"] = observers
+        self.dev = backend(**backend_options)
 
         # look for NVMLObserver and TegraObserver in observers, if present, enable special tunable parameters through nvml/tegra
         self.use_nvml = False
@@ -381,9 +379,9 @@
         self.iterations = iterations
 
         self.lang = lang
-        self.units = dev.units
-        self.name = dev.name
-        self.max_threads = dev.max_threads
+        self.units = self.dev.units
+        self.name = self.dev.name
+        self.max_threads = self.dev.max_threads
         if not quiet:
             print("Using: " + self.dev.name)
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -94,9 +94,9 @@ priority = "explicit"
 hip-python = { version = "^6.3.3.540.31", source = "testpypi", optional = true }  # Note: when released, switch this package to pypi and remove tool.poetry.source and move this to [project.optional-dependencies]
 
 [project.optional-dependencies]
-cuda = ["pycuda>=2025.1", "nvidia-ml-py>=12.535.108", "pynvml>=11.4.1"] # Attention: if pycuda is changed here, also change `session.install("pycuda")` in the Noxfile
+cuda = ["cuda-python>=12.6.0", "nvidia-ml-py>=12.535.108", "pynvml>=11.4.1"] # Attention: if pycuda is changed here, also change `session.install("pycuda")` in the Noxfile
 opencl = ["pyopencl"]                                                   # Attention: if pyopencl is changed here, also change `session.install("pyopencl")` in the Noxfile
-cuda_opencl = ["pycuda>=2024.1", "pyopencl"]                            # Attention: if pycuda is changed here, also change `session.install("pycuda")` in the Noxfile
+cuda_opencl = ["cuda-python>=12.6.0", "pyopencl"]                            # Attention: if pycuda is changed here, also change `session.install("pycuda")` in the Noxfile
 hip = ["hip-python"]
 tutorial = ["jupyter>=1.0.0", "matplotlib>=3.5.0", "nvidia-ml-py>=12.535.108"]
 

diff --git a/test/test_core.py b/test/test_core.py
@@ -11,7 +11,7 @@
 from kernel_tuner import core
 from kernel_tuner.interface import Options
 
-from .context import skip_if_no_pycuda
+from .context import skip_if_no_cuda
 
 
 mock_config = {"return_value.compile.return_value": "compile",
@@ -52,84 +52,51 @@ def env():
     yield dev, instance
 
 
-@skip_if_no_pycuda
-def test_default_verify_function(env):
-
-    # gpu_args = dev.ready_argument_list(args)
-    # func = dev.compile_kernel(instance, verbose)
-
-    dev, instance = env
-    args = instance.arguments
-    verbose = True
-
-    # 1st case, correct answer but not enough items in the list
-    answer = [args[1] + args[2]]
-    try:
-        core._default_verify_function(instance, answer, args, 1e-6, verbose)
-        print("Expected a TypeError to be raised")
-        assert False
-    except TypeError as expected_error:
-        print(str(expected_error))
-        assert "The length of argument list and provided results do not match." == str(expected_error)
-    except Exception:
-        print("Expected a TypeError to be raised")
-        assert False
-
-    # 2nd case, answer is of wrong type
-    answer = [np.ubyte([12]), None, None, None]
-    try:
-        core._default_verify_function(instance, answer, args, 1e-6, verbose)
-        # dev.check_kernel_output(func, gpu_args, instance, answer, 1e-6, None, verbose)
-        print("Expected a TypeError to be raised")
-        assert False
-    except TypeError as expected_error:
-        print(str(expected_error))
-        assert "Element 0" in str(expected_error)
-    except Exception:
-        print("Expected a TypeError to be raised")
-        assert False
-
-    instance.delete_temp_files()
-    assert True
+@skip_if_no_cuda
+def test_check_kernel_output():
 
+    kernel_string = """
+        __global__ void copy(float *out, float *in, int n) {
+            int i = threadIdx.x + blockDim.x * blockIdx.x;
+            if (i < n)
+                out[i] = in[i];
+        }
+        """
+
+    kernel_name = "copy"
+    lang = "CUDA"
 
-@patch('kernel_tuner.backends.pycuda.PyCudaFunctions')
-def test_check_kernel_output(dev_func_interface):
-    dev_func_interface.configure_mock(**mock_config)
+    # Create the object under test (DeviceInterface)
+    kernel_source = core.KernelSource(kernel_name, kernel_string, lang, None)
+    dev = core.DeviceInterface(kernel_source)
 
-    dev = core.DeviceInterface(core.KernelSource("name", "", lang="CUDA"))
-    dfi = dev.dev
+    # Setup GPU args
+    n = np.int32(2000)
+    input_data = np.random.random(n).astype(np.float32)
+    output_data = np.zeros_like(input_data)
+    args = [output_data, input_data, n]
+    gpu_args = dev.dev.ready_argument_list(args)
 
-    answer = [np.zeros(4).astype(np.float32)]
-    instance = core.KernelInstance("name", None, "kernel_string", "temp_files", (256, 1, 1), (1, 1, 1), {}, answer)
-    wrong = [np.array([1, 2, 3, 4]).astype(np.float32)]
-    atol = 1e-6
 
-    dev.check_kernel_output('func', answer, instance, answer, atol, None, True)
+    # Create kernel instance and compile GPU kernel
+    class FakeOptions(dict):
+        def __getattr__(self, name):
+            if not name in self:
+                return None
+            return self[name]
 
-    dfi.refresh_memory.assert_called()
-    dfi.run_kernel.assert_called_once_with('func', answer, (256, 1, 1), (1, 1, 1))
+    kernel_options = FakeOptions(dict(kernel_name=kernel_name, arguments=args, problem_size=n))
+    instance = dev.create_kernel_instance(kernel_source, kernel_options, {}, True)
+    func = dev.compile_kernel(instance, True)
 
-    print(dfi.mock_calls)
+    # Run check_kernel_output
+    # As the kernel only copies the data this should complete without throwing
+    # an exception
+    answer = [input_data, None, None]
+    dev.check_kernel_output(func, gpu_args, instance, answer, 1e-6, None, True)
 
-    assert dfi.refresh_memory.called == 1
-    assert dfi.memcpy_dtoh.called == 1
 
-    for name, args, _ in dfi.mock_calls:
-        if name == 'memcpy_dtoh':
-            assert all(args[0] == answer[0])
-            assert all(args[1] == answer[0])
 
-    # the following call to check_kernel_output is expected to fail because
-    # the answer is non-zero, while the memcpy_dtoh function on the Mocked object
-    # obviously does not result in the result_host array containing anything
-    # non-zero
-    try:
-        dev.check_kernel_output('func', wrong, instance, wrong, atol, None, True)
-        print("check_kernel_output failed to throw an exception")
-        assert False
-    except Exception:
-        assert True
 
 
 def test_default_verify_function_arrays():

diff --git a/test/test_cuda_functions.py b/test/test_cuda_functions.py
@@ -102,3 +102,36 @@ def test_compile_include():
 def test_tune_kernel(env):
     result, _ = tune_kernel(*env, lang="nvcuda", verbose=True)
     assert len(result) > 0
+
+@skip_if_no_cuda
+def test_copy_constant_memory_args():
+    kernel_string = """
+    __constant__ float my_constant_data[100];
+    __global__ void copy_data_kernel(float* output) {
+        int idx = threadIdx.x + blockIdx.x * blockDim.x;
+        if (idx < 100) {
+            output[idx] = my_constant_data[idx];
+        }
+    }
+    """
+
+    kernel_name = "copy_data_kernel"
+    kernel_sources = KernelSource(kernel_name, kernel_string, "NVCUDA")
+    kernel_instance = KernelInstance(kernel_name, kernel_sources, kernel_string, [], None, None, dict(), [])
+    dev = nvcuda.CudaFunctions(0)
+    kernel = dev.compile(kernel_instance)
+
+    my_constant_data = np.full(100, 23).astype(np.float32)
+    cmem_args = {'my_constant_data': my_constant_data}
+    dev.copy_constant_memory_args(cmem_args)
+
+    output = np.full(100, 0).astype(np.float32)
+    gpu_args = dev.ready_argument_list([output])
+
+    threads = (100, 1, 1)
+    grid = (1, 1, 1)
+    dev.run_kernel(kernel, gpu_args, threads, grid)
+
+    dev.memcpy_dtoh(output, gpu_args[0])
+
+    assert (my_constant_data == output).all()
diff --git a/test/test_energy.py b/test/test_energy.py
@@ -2,11 +2,11 @@
 
 from kernel_tuner.energy import energy
 
-from .context import skip_if_no_pycuda, skip_if_no_pynvml
+from .context import skip_if_no_cuda, skip_if_no_pynvml
 
 cache_filename = os.path.dirname(os.path.realpath(__file__)) + "/synthetic_fp32_cache_NVIDIA_RTX_A4000.json"
 
-@skip_if_no_pycuda
+@skip_if_no_cuda
 @skip_if_no_pynvml
 def test_create_power_frequency_model():
 

diff --git a/test/test_observers.py b/test/test_observers.py
@@ -21,7 +21,7 @@
 from .test_runners import env  # noqa: F401
 
 
-@skip_if_no_pycuda
+@skip_if_no_cuda
 @skip_if_no_pynvml
 def test_nvml_observer(env):
     nvmlobserver = NVMLObserver(["nvml_energy", "temperature"])
@@ -33,7 +33,7 @@ def test_nvml_observer(env):
     assert "temperature" in result[0]
     assert result[0]["temperature"] > 0
 
-@skip_if_no_pycuda
+@skip_if_no_cuda
 def test_custom_observer(env):
     env[-1]["block_size_x"] = [128]
 
@@ -55,15 +55,15 @@ def __init__(self, args):
 
         def get_results(self):
             return {"observer_args": self.observer_args}
-        
+
     result, _ = kernel_tuner.tune_kernel(*env_compiler, observers=[lambda args: MyObserver(args)], compiler_options=["-fopenmp"])
 
     # Check if the observer has correctly received the lang option
     assert result[0]["observer_args"]["lang"] == "C"
 
 @skip_if_no_pycuda
 def test_register_observer_pycuda(env):
-    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='CUDA')
+    result, _ = kernel_tuner.tune_kernel(*env, observers=[RegisterObserver()], lang='PYCUDA')
     assert "num_regs" in result[0]
     assert result[0]["num_regs"] > 0