Skip to content

Commit 867e44a

Browse files
kohlercaCristiano KöhlerCozySocksAlways
authored
[Fix] "too many resources requested for launch" error in the ASSET joint probability matrix computation when using CUDA (#667)
* Implemented parameter to override number of threads in PMatNeighbors * Determine the number of threads in PMatNeighbors automatically or use the override * Added logging functionality for debugging * Added documentation on the new behavior for the number of threads in PMatNeighbors * Updated logging statements to avoid errors * Updated logging statements for simplified access to kernel parameters * Added unit test for computing PMatNeighbors with varying thread numbers * Added restriction to override number to avoid exceeding the maximum number of threads * Removed one parameter iteration as this is currently expected to fail * Added unit test for the high level cuda_threads parameter in the ASSET class * Added missing reasons for skipping * Added missing rate variable * Added cleanup if variables were not originally set * Added validation for cuda_thread parameter * Test cases not needed as tuple is enforced to be (int, int) * Added invalid combinations for the revised validation * Not allowing None in the tuple * Setting upper bound for number of threads per block * Complementary unit tests for upper bound --------- Co-authored-by: Cristiano Köhler <c.koehler@fz-juelich.de> Co-authored-by: Harris Jos <104043391+CozySocksAlways@users.noreply.github.com>
1 parent 867a0fc commit 867e44a

2 files changed

Lines changed: 210 additions & 6 deletions

File tree

elephant/asset/asset.py

Lines changed: 85 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,10 +1113,12 @@ class _PMatNeighbors(_GPUBackend):
11131113
The number of largest neighbors to collect for each entry in `mat`.
11141114
"""
11151115

1116-
def __init__(self, filter_shape, n_largest, max_chunk_size=None):
1116+
def __init__(self, filter_shape, n_largest, max_chunk_size=None,
1117+
cuda_threads=None):
11171118
super().__init__(max_chunk_size=max_chunk_size)
11181119
self.n_largest = n_largest
11191120
self.max_chunk_size = max_chunk_size
1121+
self.cuda_threads = cuda_threads
11201122

11211123
filter_size, filter_width = filter_shape
11221124
if filter_width >= filter_size:
@@ -1249,7 +1251,6 @@ def pycuda(self, mat):
12491251
self._check_input(mat)
12501252

12511253
device = pycuda.autoinit.device
1252-
n_threads = device.MAX_THREADS_PER_BLOCK
12531254

12541255
filt_size = self.filter_kernel.shape[0]
12551256
filt_rows, filt_cols = self.filter_kernel.nonzero()
@@ -1307,13 +1308,63 @@ def pycuda(self, mat):
13071308

13081309
drv.Context.synchronize()
13091310

1311+
kernel = module.get_function("pmat_neighbors")
1312+
1313+
# Adjust number of threads depending on the number of registers
1314+
# needed for the kernel, to avoid exceeding the resources
1315+
if self.cuda_threads:
1316+
# Override with the number in the parameter `cuda_threads`
1317+
n_threads = min(self.cuda_threads,
1318+
device.MAX_THREADS_PER_BLOCK)
1319+
else:
1320+
# Automatically determine the number of threads based on
1321+
# the register count.
1322+
regs_per_thread = kernel.NUM_REGS
1323+
max_regs_per_block = device.MAX_REGISTERS_PER_BLOCK
1324+
max_threads_by_regs = max_regs_per_block // regs_per_thread
1325+
1326+
# A safety margin of 10% with respect to the number of threads
1327+
# computed for the kernel is used in order to account for a
1328+
# fraction of registers that might be used by the GPU for
1329+
# control purposes.
1330+
max_threads_by_regs = int(max_threads_by_regs * 0.9)
1331+
1332+
n_threads = min(max_threads_by_regs,
1333+
device.MAX_THREADS_PER_BLOCK)
1334+
1335+
if n_threads > device.WARP_SIZE:
1336+
# It's more efficient to make the number of threads
1337+
# a multiple of the warp size (32).
1338+
n_threads -= n_threads % device.WARP_SIZE
1339+
13101340
grid_size = math.ceil(it_todo / n_threads)
1341+
1342+
if logger.level == logging.DEBUG:
1343+
logger.debug(f"Registers per thread: {kernel.NUM_REGS}")
1344+
1345+
shared_memory = kernel.SHARED_SIZE_BYTES
1346+
local_memory = kernel.LOCAL_SIZE_BYTES
1347+
const_memory = kernel.CONST_SIZE_BYTES
1348+
logger.debug(f"Memory: shared = {shared_memory}; "
1349+
f"local = {local_memory}, const = {const_memory}")
1350+
1351+
logger.debug("Maximum per block: threads = "
1352+
f"{device.MAX_THREADS_PER_BLOCK}; "
1353+
"registers = "
1354+
f"{device.MAX_REGISTERS_PER_BLOCK}; "
1355+
"shared memory = "
1356+
f"{device.MAX_SHARED_MEMORY_PER_BLOCK}")
1357+
1358+
logger.debug(f"It_todo: {it_todo}")
1359+
logger.debug(f"N threads: {n_threads}")
1360+
logger.debug(f"Max grid X: {device.MAX_GRID_DIM_X}")
1361+
logger.debug(f"Grid size: {grid_size}")
1362+
13111363
if grid_size > device.MAX_GRID_DIM_X:
13121364
raise ValueError("Cannot launch a CUDA kernel with "
13131365
f"{grid_size} num. of blocks. Adjust the "
13141366
"'max_chunk_size' parameter.")
13151367

1316-
kernel = module.get_function("pmat_neighbors")
13171368
kernel(lmat_gpu.gpudata, mat_gpu, grid=(grid_size, 1),
13181369
block=(n_threads, 1, 1))
13191370

@@ -2446,7 +2497,7 @@ def joint_probability_matrix(self, pmat, filter_shape, n_largest,
24462497
Double floating-point precision is typically x4 times slower than
24472498
the single floating-point equivalent.
24482499
Default: 'float'
2449-
cuda_threads : int, optional
2500+
cuda_threads : int or tuple of int, optional
24502501
[CUDA/OpenCL performance parameter that does not influence the
24512502
result.]
24522503
The number of CUDA/OpenCL threads per block (in X axis) between 1
@@ -2455,6 +2506,18 @@ def joint_probability_matrix(self, pmat, filter_shape, n_largest,
24552506
Old GPUs (Tesla K80) perform faster with `cuda_threads` larger
24562507
than 64 while new series (Tesla T4) with capabilities 6.x and more
24572508
work best with 32 threads.
2509+
The computation of the joint probability matrix consists of two
2510+
GPU-accelerated steps. In the first step, the optimal number of
2511+
CUDA threads is determined automatically. The `cuda_threads`
2512+
parameter primarily controls the number of threads used in the
2513+
second (main) computation step. However, if the `n_largest`
2514+
parameter is set to a high value, the first step may fail with a
2515+
"too many resources" CUDA error due to excessive register usage.
2516+
To avoid this, you can explicitly specify the number of threads
2517+
for both steps using a tuple for `cuda_threads`. In this case, the
2518+
first element of the tuple sets the thread count for the main
2519+
computation, and the second element overrides the automatically
2520+
determined thread count for the first step.
24582521
Default: 64
24592522
cuda_cwr_loops : int, optional
24602523
[CUDA/OpenCL performance parameter that does not influence the
@@ -2502,11 +2565,27 @@ def joint_probability_matrix(self, pmat, filter_shape, n_largest,
25022565

25032566
logger.info("Finding neighbors in probability matrix...")
25042567

2568+
# Get any override in the number of CUDA threads
2569+
if isinstance(cuda_threads, tuple) and len(cuda_threads) == 2 \
2570+
and all(isinstance(n_thr, int) for n_thr in cuda_threads):
2571+
jsf_threads, pmat_threads = cuda_threads
2572+
elif isinstance(cuda_threads, int):
2573+
jsf_threads = cuda_threads
2574+
pmat_threads = None
2575+
else:
2576+
raise ValueError("'cuda_threads' must be int or a tuple of int.")
2577+
2578+
if (not (0 < jsf_threads <= 1024) or
2579+
(pmat_threads is not None and not (0 < pmat_threads <= 1024))):
2580+
raise ValueError("The number of threads in 'cuda_threads' must be"
2581+
"a value > 0 and <= 1024.")
2582+
25052583
# Find for each P_ij in the probability matrix its neighbors and
25062584
# maximize them by the maximum value 1-p_value_min
25072585
pmat = np.asarray(pmat, dtype=np.float32)
25082586
pmat_neighb_obj = _PMatNeighbors(filter_shape=filter_shape,
2509-
n_largest=n_largest)
2587+
n_largest=n_largest,
2588+
cuda_threads=pmat_threads)
25102589
pmat_neighb = pmat_neighb_obj.compute(pmat)
25112590

25122591
logger.info("Finding unique set of values...")
@@ -2527,7 +2606,7 @@ def joint_probability_matrix(self, pmat, filter_shape, n_largest,
25272606
w + 1) # number of entries covered by kernel
25282607
jsf = _JSFUniformOrderStat3D(n=n, d=pmat_neighb.shape[1],
25292608
precision=precision,
2530-
cuda_threads=cuda_threads,
2609+
cuda_threads=jsf_threads,
25312610
cuda_cwr_loops=cuda_cwr_loops,
25322611
tolerance=tolerance)
25332612
jpvmat = jsf.compute(u=pmat_neighb)

elephant/test/test_asset.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,36 @@ def test_cluster_matrix_entries_chunked_array_file(self):
320320
array_file=Path(tmpdir) / f"test_dist_{working_memory}")
321321
assert_array_equal(cmat, cmat_true)
322322

323+
def test_pmat_neighbors_gpu_threads(self):
    """
    Verify that the largest-neighbors matrix computed on a GPU backend
    matches the CPU reference regardless of the `cuda_threads` setting.

    The number of threads per block is a pure performance knob and must
    not change the numerical result.
    """
    np.random.seed(12)
    n_largest = 3
    # Draw the three test matrices in a fixed order so that the RNG
    # stream (and therefore the data) is identical to a sequential setup.
    shapes = ((40, 40), (70, 23), (27, 93))
    matrices = [np.random.random_sample(shape).astype(np.float32)
                for shape in shapes]
    np.fill_diagonal(matrices[0], 0.5)
    for pmat in matrices:
        for filter_size in (4, 11):
            filter_shape = (filter_size, 3)
            # Check numbers for automatic (None) to more than the maximum
            # number of threads (2048), and one value that is not a factor
            # of the warp size (500 % 32 != 0)
            for n_threads in (None, 64, 128, 256, 500, 512, 1024, 2048):
                with warnings.catch_warnings():
                    # ignore even filter sizes
                    warnings.simplefilter('ignore', UserWarning)
                    pmat_neigh = asset._PMatNeighbors(
                        filter_shape=filter_shape, n_largest=n_largest,
                        cuda_threads=n_threads
                    )
                lmat_true = pmat_neigh.cpu(pmat)
                if HAVE_PYOPENCL:
                    lmat_opencl = pmat_neigh.pyopencl(pmat)
                    assert_array_almost_equal(lmat_opencl, lmat_true)
                if HAVE_CUDA:
                    lmat_cuda = pmat_neigh.pycuda(pmat)
                    assert_array_almost_equal(lmat_cuda, lmat_true)
352+
323353
def test_pmat_neighbors_gpu(self):
324354
np.random.seed(12)
325355
n_largest = 3
@@ -700,6 +730,101 @@ def test_watchdog(self):
700730
self.assertWarns(UserWarning, jsf.compute, u)
701731

702732

733+
@unittest.skipUnless(HAVE_SKLEARN and (HAVE_CUDA or HAVE_PYOPENCL),
                     'requires sklearn and a GPU')
class AssetTestJointProbabilityMatrixGPUThreads(unittest.TestCase):
    """
    Check that `ASSET.joint_probability_matrix` accepts both the int and the
    (int, int) forms of `cuda_threads`, rejects invalid values, and that the
    thread configuration does not change the result (compared against a
    CPU-computed reference).
    """

    @classmethod
    def setUpClass(cls):
        # Save the state of the environment variables so they can be
        # restored in tearDownClass.
        cls.use_cuda = os.getenv("ELEPHANT_USE_CUDA", None)
        cls.use_opencl = os.getenv("ELEPHANT_USE_OPENCL", None)

        # Force using CPU to compute expected values
        os.environ["ELEPHANT_USE_CUDA"] = "0"
        os.environ["ELEPHANT_USE_OPENCL"] = "0"

        # Generate spike train data
        np.random.seed(1)
        n_spiketrains = 50
        rate = 50 * pq.Hz
        spiketrains = [homogeneous_poisson_process(rate, t_stop=100 * pq.ms)
                       for _ in range(n_spiketrains)]

        # Initialize ASSET object and compute IMAT/PMAT
        bin_size = 3 * pq.ms
        kernel_width = 9 * pq.ms

        asset_obj = asset.ASSET(spiketrains, bin_size=bin_size)
        # NOTE(review): the intersection-matrix result is not used here;
        # presumably `probability_matrix_analytical` recomputes it
        # internally — confirm whether this call can be dropped or its
        # result passed in explicitly.
        asset_obj.intersection_matrix()
        cls.pmat = asset_obj.probability_matrix_analytical(
            kernel_width=kernel_width)

        cls.filter_shape = (5, 1)
        cls.n_largest = 3
        # CPU-computed reference result (GPU backends are disabled above).
        cls.expected_jmat = asset_obj.joint_probability_matrix(
            cls.pmat,
            filter_shape=cls.filter_shape,
            n_largest=cls.n_largest,
        )
        cls.asset_obj = asset_obj

    def test_invalid_threads_parameter(self):
        # Wrong type, wrong tuple length, out-of-range values (< 1 or
        # > 1024), and None inside the tuple must all raise ValueError.
        for cuda_threads in ("64", (64, 64, 64),
                             0, (0, 0), (0, 64), (64, 0),
                             -1, (-1, -1), (-1, 64), (64, -1),
                             (64, None), 1025, (1025, 1024),
                             (1024, 1025)):
            with self.assertRaises(ValueError):
                self.asset_obj.joint_probability_matrix(
                    self.pmat,
                    filter_shape=self.filter_shape,
                    n_largest=self.n_largest,
                    cuda_threads=cuda_threads,
                )

    @unittest.skipUnless(HAVE_CUDA, "CUDA not available")
    def test_cuda_threads(self):
        os.environ["ELEPHANT_USE_CUDA"] = "1"
        os.environ["ELEPHANT_USE_OPENCL"] = "0"

        # Both the scalar and the tuple forms must reproduce the CPU result.
        for cuda_threads in (64, (64, 512), 1024, (1024, 1024)):
            jmat = self.asset_obj.joint_probability_matrix(
                self.pmat,
                filter_shape=self.filter_shape,
                n_largest=self.n_largest,
                cuda_threads=cuda_threads,
            )
            assert_array_almost_equal(jmat, self.expected_jmat)

    @unittest.skipUnless(HAVE_PYOPENCL, "PyOpenCL not available")
    def test_pyopencl_threads(self):
        os.environ["ELEPHANT_USE_CUDA"] = "0"
        os.environ["ELEPHANT_USE_OPENCL"] = "1"

        # Both the scalar and the tuple forms must reproduce the CPU result.
        for cuda_threads in (64, (64, 512), 1024, (1024, 1024)):
            jmat = self.asset_obj.joint_probability_matrix(
                self.pmat,
                filter_shape=self.filter_shape,
                n_largest=self.n_largest,
                cuda_threads=cuda_threads,
            )
            assert_array_almost_equal(jmat, self.expected_jmat)

    @classmethod
    def tearDownClass(cls):
        # BUGFIX: this hook was previously named `cleanUpClass`, which
        # unittest never calls, so the environment variables forced in
        # setUpClass leaked into subsequent tests. `tearDownClass` is the
        # correct class-level cleanup hook.
        # Restore environment flags; compare against None so that an
        # originally-empty value ("") is restored rather than popped.
        if cls.use_cuda is not None:
            os.environ["ELEPHANT_USE_CUDA"] = cls.use_cuda
        else:
            os.environ.pop("ELEPHANT_USE_CUDA", None)

        if cls.use_opencl is not None:
            os.environ["ELEPHANT_USE_OPENCL"] = cls.use_opencl
        else:
            os.environ.pop("ELEPHANT_USE_OPENCL", None)
826+
827+
703828
@unittest.skipUnless(HAVE_SKLEARN, 'requires sklearn')
704829
class AssetTestIntegration(unittest.TestCase):
705830
def setUp(self):

0 commit comments

Comments
 (0)