From b04a53c49ab4a69dee20c5f3bb3c3e3df06701b9 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= Date: Tue, 27 Sep 2016 00:10:48 +0200 Subject: [PATCH] Correct kernel launch bounds for CUDA sm_60 The GP100 architecture has half the SM size (64 ALUs) compared to 5.x; hence 64 threads/block already achieves maximum occupancy, and the incorrectly tweaked launch configuration is not optimal. This change reverts the block size that was incorrectly increased in cc4214a. Change-Id: I9be10acfb3650c778401d04dfcd52aa200f78ff4 --- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 3 +-- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 15 ++++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index bd49f6aba2..3946419231 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -423,8 +423,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, * - The 1D block-grid contains as many blocks as super-clusters. */ int num_threads_z = 1; - if ((nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7) || - (nb->dev_info->prop.major == 6 && nb->dev_info->prop.minor == 0)) + if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7) { num_threads_z = 2; } diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index 4d9455cc66..192b20d35e 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -96,12 +96,13 @@ * NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z * warp-pairs per block. * - * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16 - * blocks/multiproc, is the fastest even though this setup gives low occupancy. 
+ * - On CC 2.0-3.5, and >=5.0 NTHREAD_Z == 1, translating to 64 th/block with 16 + * blocks/multiproc, is the fastest even though this setup gives low occupancy + * (except on 6.0). * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks * per multiprocessor is reduced proportionally to get the original number of max * threads in flight (and slightly lower performance). - * - On CC 3.7 and 6.0 there are enough registers to double the number of threads; using + * - On CC 3.7 there are enough registers to double the number of threads; using * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels * with low-register use). * @@ -112,18 +113,18 @@ /* Kernel launch bounds for different compute capabilities. The value of NTHREAD_Z * determines the number of threads per block and it is chosen such that * 16 blocks/multiprocessor can be kept in flight. - * - CC 3.0/3.5/5.x, >=6.1: NTHREAD_Z=1, (64, 16) bounds - * - CC 3.7, 6.0: NTHREAD_Z=2, (128, 16) bounds + * - CC 3.0,3.5, and >=5.0: NTHREAD_Z=1, (64, 16) bounds + * - CC 3.7: NTHREAD_Z=2, (128, 16) bounds * * Note: convenience macros, need to be undef-ed at the end of the file. */ -#if GMX_PTX_ARCH == 370 || GMX_PTX_ARCH == 600 +#if GMX_PTX_ARCH == 370 #define NTHREAD_Z (2) #define MIN_BLOCKS_PER_MP (16) #else #define NTHREAD_Z (1) #define MIN_BLOCKS_PER_MP (16) -#endif /* GMX_PTX_ARCH == 370 || GMX_PTX_ARCH == 600 */ +#endif /* GMX_PTX_ARCH == 370 */ #define THREADS_PER_BLOCK (c_clSize*c_clSize*NTHREAD_Z) #if GMX_PTX_ARCH >= 350 -- 2.11.4.GIT