From b04a53c49ab4a69dee20c5f3bb3c3e3df06701b9 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Tue, 27 Sep 2016 00:10:48 +0200
Subject: [PATCH] Correct kernel launch bounds for CUDA sm_60

The GP100 architecture has SMs half the size (64 ALUs) of those on CC 5.x,
hence 64 threads/block already achieves maximum occupancy and the
launch configuration tweaked in cc4214a is not optimal. This change
reverts the block size that was incorrectly increased in that commit.

Change-Id: I9be10acfb3650c778401d04dfcd52aa200f78ff4
---
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu         |  3 +--
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 15 ++++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index bd49f6aba2..3946419231 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -423,8 +423,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t       *nb,
      * - The 1D block-grid contains as many blocks as super-clusters.
      */
     int num_threads_z = 1;
-    if ((nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7) ||
-        (nb->dev_info->prop.major == 6 && nb->dev_info->prop.minor == 0))
+    if (nb->dev_info->prop.major == 3 && nb->dev_info->prop.minor == 7)
     {
         num_threads_z = 2;
     }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 4d9455cc66..192b20d35e 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -96,12 +96,13 @@
  * NTHREAD_Z controls the number of j-clusters processed concurrently on NTHREAD_Z
  * warp-pairs per block.
  *
- * - On CC 2.0-3.5, 5.0, and 5.2, NTHREAD_Z == 1, translating to 64 th/block with 16
- * blocks/multiproc, is the fastest even though this setup gives low occupancy.
+ * - On CC 2.0-3.5 and >=5.0, NTHREAD_Z == 1, translating to 64 th/block with 16
+ * blocks/multiproc, is the fastest even though this setup gives low occupancy
+ * (except on CC 6.0, where 64 th/block already gives maximum occupancy).
  * NTHREAD_Z > 1 results in excessive register spilling unless the minimum blocks
  * per multiprocessor is reduced proportionally to get the original number of max
  * threads in flight (and slightly lower performance).
- * - On CC 3.7 and 6.0 there are enough registers to double the number of threads; using
+ * - On CC 3.7 there are enough registers to double the number of threads; using
  * NTHREADS_Z == 2 is fastest with 16 blocks (TODO: test with RF and other kernels
  * with low-register use).
  *
@@ -112,18 +113,18 @@
 /* Kernel launch bounds for different compute capabilities. The value of NTHREAD_Z
  * determines the number of threads per block and it is chosen such that
  * 16 blocks/multiprocessor can be kept in flight.
- * - CC 3.0/3.5/5.x, >=6.1: NTHREAD_Z=1, (64, 16) bounds
- * - CC 3.7, 6.0:           NTHREAD_Z=2, (128, 16) bounds
+ * - CC 3.0/3.5 and >=5.0:  NTHREAD_Z=1, (64, 16) bounds
+ * - CC 3.7:                NTHREAD_Z=2, (128, 16) bounds
  *
  * Note: convenience macros, need to be undef-ed at the end of the file.
  */
-#if GMX_PTX_ARCH == 370 || GMX_PTX_ARCH == 600
+#if GMX_PTX_ARCH == 370
     #define NTHREAD_Z           (2)
     #define MIN_BLOCKS_PER_MP   (16)
 #else
     #define NTHREAD_Z           (1)
     #define MIN_BLOCKS_PER_MP   (16)
-#endif /* GMX_PTX_ARCH == 370 || GMX_PTX_ARCH == 600 */
+#endif /* GMX_PTX_ARCH == 370 */
 #define THREADS_PER_BLOCK   (c_clSize*c_clSize*NTHREAD_Z)
 
 #if GMX_PTX_ARCH >= 350
-- 
2.11.4.GIT