From 360ddad45f0244048491c994acab120f8fc41353 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Mon, 27 Mar 2017 19:19:19 +0200
Subject: [PATCH] Allow disabling the explicit use of CUDA textures

This change implements fallback for the explicit CUDA texture loads
in the non-bonded kernels. This can be done by defining
DISABLE_CUDA_TEXTURES. When disabled texture objects/references
are not initialized either.

Also removed unnecessary extern declarations of texture references
in nbnxn_cuda_kernel_utils.cuh; this was only needed because texture
reference accesses were previously compiled unconditionally (and were
also generated in the nvcc host pass).

Change-Id: Id7cdd6f80da0abe6be5639e80bed6530c3ce25c0
---
 src/gromacs/gpu_utils/cuda_arch_utils.cuh          |  14 ++-
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu         |  15 ++-
 .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu       | 104 ++++++++++++---------
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh |   8 +-
 .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh   |   4 +
 .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh   |  70 +++++++++-----
 .../mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh        |   5 +-
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h    |   7 +-
 8 files changed, 156 insertions(+), 71 deletions(-)

diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
index 8be983365c..8940e927c5 100644
--- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2014,2015,2016,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -58,4 +58,16 @@
 static const int warp_size      = 32;
 static const int warp_size_log2 = 5;
 
+/*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
+ *
+ *  This option will not influence functionality. All features using textures ought
+ *  to have fallback for texture-less reads (direct/LDG loads), all new code needs
+ *  to provide fallback code.
+ */
+#if defined GMX_DISABLE_CUDA_TEXTURES
+#define DISABLE_CUDA_TEXTURES 1
+#else
+#define DISABLE_CUDA_TEXTURES 0
+#endif
+
 #endif /* CUDA_ARCH_UTILS_CUH_ */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 3f62e86de4..35774bec6f 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -62,6 +62,16 @@
 
 #include "nbnxn_cuda_types.h"
 
+/*
+ * Texture references are created at compile-time and need to be declared
+ * at file scope as global variables (see http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#texture-reference-api).
+ * The texture references below are used in two translation units;
+ * we declare them here along the kernels that use them (when compiling legacy Fermi kernels),
+ * and provide getters (see below) used by the data_mgmt module where the
+ * textures are bound/unbound.
+ * (In principle we could do it the other way arond, but that would likely require
+ * device linking and we'd rather avoid technical hurdles.)
+ */
 /*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
 texture<float, 1, cudaReadModeElementType> nbfp_texref;
 
@@ -695,18 +705,21 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
 /*! Return the reference to the nbfp texture. */
 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
 {
+    assert(!c_disableCudaTextures);
     return nbfp_texref;
 }
 
 /*! Return the reference to the nbfp_comb texture. */
 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
 {
+    assert(!c_disableCudaTextures);
     return nbfp_comb_texref;
 }
 
 /*! Return the reference to the coulomb_tab. */
 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
 {
+    assert(!c_disableCudaTextures);
     return coulomb_tab_texref;
 }
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 0d60b620e2..1215452128 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -100,6 +100,7 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
  */
 static bool use_texobj(const gmx_device_info_t *dev_info)
 {
+    assert(!c_disableCudaTextures);
     /* Only device CC >= 3.0 (Kepler and later) support texture objects */
     return (dev_info->prop.major >= 3);
 }
@@ -128,6 +129,8 @@ static void setup1DFloatTexture(cudaTextureObject_t &texObj,
                                 void                *devPtr,
                                 size_t               sizeInBytes)
 {
+    assert(!c_disableCudaTextures);
+
     cudaError_t      stat;
     cudaResourceDesc rd;
     cudaTextureDesc  td;
@@ -158,6 +161,8 @@ static void setup1DFloatTexture(const struct texture<float, 1, cudaReadModeEleme
                                 const void                                              *devPtr,
                                 size_t                                                   sizeInBytes)
 {
+    assert(!c_disableCudaTextures);
+
     cudaError_t           stat;
     cudaChannelFormatDesc cd;
 
@@ -194,15 +199,18 @@ static void init_ewald_coulomb_force_table(const interaction_const_t *ic,
     nbp->coulomb_tab_size  = ic->tabq_size;
     nbp->coulomb_tab_scale = ic->tabq_scale;
 
-    if (use_texobj(dev_info))
-    {
-        setup1DFloatTexture(nbp->coulomb_tab_texobj, nbp->coulomb_tab,
-                            nbp->coulomb_tab_size*sizeof(*nbp->coulomb_tab));
-    }
-    else
+    if (!c_disableCudaTextures)
     {
-        setup1DFloatTexture(&nbnxn_cuda_get_coulomb_tab_texref(), nbp->coulomb_tab,
-                            nbp->coulomb_tab_size*sizeof(*nbp->coulomb_tab));
+        if (use_texobj(dev_info))
+        {
+            setup1DFloatTexture(nbp->coulomb_tab_texobj, nbp->coulomb_tab,
+                                nbp->coulomb_tab_size*sizeof(*nbp->coulomb_tab));
+        }
+        else
+        {
+            setup1DFloatTexture(&nbnxn_cuda_get_coulomb_tab_texref(), nbp->coulomb_tab,
+                                nbp->coulomb_tab_size*sizeof(*nbp->coulomb_tab));
+        }
     }
 }
 
@@ -339,13 +347,16 @@ static void initParamLookupTable(float                    * &devPtr,
     CU_RET_ERR(stat, "cudaMalloc failed in initParamLookupTable");
     cu_copy_H2D(devPtr, (void *)hostPtr, sizeInBytes);
 
-    if (use_texobj(devInfo))
+    if (!c_disableCudaTextures)
     {
-        setup1DFloatTexture(texObj, devPtr, sizeInBytes);
-    }
-    else
-    {
-        setup1DFloatTexture(texRef, devPtr, sizeInBytes);
+        if (use_texobj(devInfo))
+        {
+            setup1DFloatTexture(texObj, devPtr, sizeInBytes);
+        }
+        else
+        {
+            setup1DFloatTexture(texRef, devPtr, sizeInBytes);
+        }
     }
 }
 
@@ -867,17 +878,20 @@ static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t            *nbparam,
 
     if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
     {
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (use_texobj(dev_info))
-        {
-            stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
-            CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
-        }
-        else
+        if (!c_disableCudaTextures)
         {
-            GMX_UNUSED_VALUE(dev_info);
-            stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
-            CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
+            /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+            if (use_texobj(dev_info))
+            {
+                stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj);
+                CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed");
+            }
+            else
+            {
+                GMX_UNUSED_VALUE(dev_info);
+                stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
+                CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed");
+            }
         }
         cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size);
     }
@@ -946,32 +960,38 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb)
 
     if (!useLjCombRule(nb->nbparam))
     {
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (use_texobj(nb->dev_info))
-        {
-            stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
-            CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
-        }
-        else
+        if (!c_disableCudaTextures)
         {
-            stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
-            CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
+            /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+            if (use_texobj(nb->dev_info))
+            {
+                stat = cudaDestroyTextureObject(nbparam->nbfp_texobj);
+                CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_texobj failed");
+            }
+            else
+            {
+                stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_texref());
+                CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_texref failed");
+            }
         }
         cu_free_buffered(nbparam->nbfp);
     }
 
     if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
     {
-        /* Only device CC >= 3.0 (Kepler and later) support texture objects */
-        if (use_texobj(nb->dev_info))
-        {
-            stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
-            CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
-        }
-        else
+        if (!c_disableCudaTextures)
         {
-            stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
-            CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
+            /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+            if (use_texobj(nb->dev_info))
+            {
+                stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
+                CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
+            }
+            else
+            {
+                stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
+                CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
+            }
         }
         cu_free_buffered(nbparam->nbfp_comb);
     }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 38ca79ee82..415ead4c03 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -54,8 +54,8 @@
  * code that is in double precision.
  */
 
-#if GMX_PTX_ARCH < 300
-#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300"
+#if GMX_PTX_ARCH < 300 && GMX_PTX_ARCH != 0
+#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300 or host pass"
 #endif
 
 #if defined EL_EWALD_ANA || defined EL_EWALD_TAB
@@ -317,7 +317,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
 
 #ifdef LJ_EWALD
+    #if DISABLE_CUDA_TEXTURES
+            E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]);
+    #else
             E_lj += tex1Dfetch<float>(nbparam.nbfp_texobj, atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2);
+    #endif
 #endif
         }
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
index f615978ab2..98ba1d0916 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh
@@ -258,7 +258,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
 
 #ifdef LJ_EWALD
+    #if DISABLE_CUDA_TEXTURES
+            E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]);
+    #else
             E_lj += tex1Dfetch(nbfp_texref, atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2);
+    #endif
 #endif
         }
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index c7af49b1b7..a0d8979381 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -42,8 +42,6 @@
  *  \author Szilárd Páll <pall.szilard@gmail.com>
  *  \ingroup module_mdlib
  */
-#include "config.h"
-
 #include <assert.h>
 
 /* Note that floating-point constants in CUDA code should be suffixed
@@ -60,8 +58,8 @@
 #ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
 #define NBNXN_CUDA_KERNEL_UTILS_CUH
 
-/* Use texture objects if supported by the target hardware. */
-#if GMX_PTX_ARCH >= 300
+/* Use texture objects if supported by the target hardware (and in host pass). */
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 /* Note: convenience macro, needs to be undef-ed at the end of the file. */
 #define USE_TEXOBJ
 #endif
@@ -79,19 +77,6 @@ static const int    c_fbufStride  = c_clSizeSq;
 static const float  c_oneSixth    = 0.16666667f;
 static const float  c_oneTwelveth = 0.08333333f;
 
-/* With multiple compilation units this ensures that texture refs are available
-   in the the kernels' compilation units. */
-#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
-/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
-extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
-
-/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
-extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
-
-/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
-extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
-#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
-
 
 /*! Convert LJ sigma,epsilon parameters to C6,C12. */
 static __forceinline__ __device__
@@ -233,17 +218,25 @@ void calculate_potential_switch_F_E(const  cu_nbparam_t nbparam,
 }
 
 
-/*! \brief Fetch C6 grid contribution coefficients and return the product of these. */
+/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
+ *
+ *  Depending on what is supported, it fetches parameters either
+ *  using direct load, texture objects, or texrefs.
+ */
 static __forceinline__ __device__
 float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
                                 int                typei,
                                 int                typej)
 {
+#if DISABLE_CUDA_TEXTURES
+    return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
+#else
 #ifdef USE_TEXOBJ
     return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
 #else
     return tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
 #endif /* USE_TEXOBJ */
+#endif /* DISABLE_CUDA_TEXTURES */
 }
 
 
@@ -308,12 +301,23 @@ void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
     *E_lj    += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
 }
 
-/*! Fetch per-type LJ parameters. */
+/*! Fetch per-type LJ parameters.
+ *
+ *  Depending on what is supported, it fetches parameters either
+ *  using direct load, texture objects, or texrefs.
+ */
 static __forceinline__ __device__
 float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
                               int                type)
 {
     float2 c6c12;
+#if DISABLE_CUDA_TEXTURES
+    /* Force an 8-byte fetch to save a memory instruction. */
+    float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
+    c6c12 = LDG(&nbfp_comb[type]);
+#else
+    /* NOTE: as we always do 8-byte aligned loads, we could
+       fetch float2 here too just as above. */
 #ifdef USE_TEXOBJ
     c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
     c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
@@ -321,6 +325,7 @@ float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
     c6c12.x = tex1Dfetch(nbfp_comb_texref, 2*type);
     c6c12.y = tex1Dfetch(nbfp_comb_texref, 2*type + 1);
 #endif /* USE_TEXOBJ */
+#endif /* DISABLE_CUDA_TEXTURES */
 
     return c6c12;
 }
@@ -378,6 +383,8 @@ void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
 
 /*! Fetch two consecutive values from the Ewald correction F*r table.
  *
+ *  Depending on what is supported, it fetches parameters either
+ *  using direct load, texture objects, or texrefs.
  */
 static __forceinline__ __device__
 float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
@@ -385,6 +392,11 @@ float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
 {
     float2 d;
 
+#if DISABLE_CUDA_TEXTURES
+    /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
+    d.x = LDG(&nbparam.coulomb_tab[index]);
+    d.y = LDG(&nbparam.coulomb_tab[index + 1]);
+#else
 #ifdef USE_TEXOBJ
     d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
     d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
@@ -392,6 +404,7 @@ float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
     d.x =  tex1Dfetch(coulomb_tab_texref, index);
     d.y =  tex1Dfetch(coulomb_tab_texref, index + 1);
 #endif // USE_TEXOBJ
+#endif // DISABLE_CUDA_TEXTURES
 
     return d;
 }
@@ -427,6 +440,8 @@ float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
 
 /*! Fetch C6 and C12 from the parameter table.
  *
+ *  Depending on what is supported, it fetches parameters either
+ *  using direct load, texture objects, or texrefs.
  */
 static __forceinline__ __device__
 void fetch_nbfp_c6_c12(float               &c6,
@@ -434,6 +449,16 @@ void fetch_nbfp_c6_c12(float               &c6,
                        const cu_nbparam_t   nbparam,
                        int                  baseIndex)
 {
+#if DISABLE_CUDA_TEXTURES
+    /* Force an 8-byte fetch to save a memory instruction. */
+    float2 *nbfp = (float2 *)nbparam.nbfp;
+    float2  c6c12;
+    c6c12 = LDG(&nbfp[baseIndex]);
+    c6    = c6c12.x;
+    c12   = c6c12.y;
+#else
+    /* NOTE: as we always do 8-byte aligned loads, we could
+       fetch float2 here too just as above. */
 #ifdef USE_TEXOBJ
     c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
     c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
@@ -441,6 +466,7 @@ void fetch_nbfp_c6_c12(float               &c6,
     c6  = tex1Dfetch(nbfp_texref, 2*baseIndex);
     c12 = tex1Dfetch(nbfp_texref, 2*baseIndex + 1);
 #endif
+#endif // DISABLE_CUDA_TEXTURES
 }
 
 
@@ -506,7 +532,7 @@ void reduce_force_j_generic(float *f_buf, float3 *fout,
 /*! Final j-force reduction; this implementation only with power of two
  *  array sizes and with sm >= 3.0
  */
-#if GMX_PTX_ARCH >= 300
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 static __forceinline__ __device__
 void reduce_force_j_warp_shfl(float3 f, float3 *fout,
                               int tidxi, int aidx)
@@ -632,7 +658,7 @@ void reduce_force_i(float *f_buf, float3 *f,
 /*! Final i-force reduction; this implementation works only with power of two
  *  array sizes and with sm >= 3.0
  */
-#if GMX_PTX_ARCH >= 300
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 static __forceinline__ __device__
 void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                               float *fshift_buf, bool bCalcFshift,
@@ -709,7 +735,7 @@ void reduce_energy_pow2(volatile float *buf,
 /*! Energy reduction; this implementation works only with power of two
  *  array sizes and with sm >= 3.0
  */
-#if GMX_PTX_ARCH >= 300
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 static __forceinline__ __device__
 void reduce_energy_warp_shfl(float E_lj, float E_el,
                              float *e_lj, float *e_el,
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index e56ce65411..a9be593e84 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2016,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -52,7 +52,8 @@
  *  \ingroup module_mdlib
  */
 
-#if GMX_PTX_ARCH >= 300
+/* Use the standard non-Fermi kernel in host pass too (to avoid texref API calls). */
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
 #define FLAVOR_LEVEL_GENERATOR "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh"
 #else
 #define FLAVOR_LEVEL_GENERATOR "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_fermi.cuh"
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 8519b5598e..09eccc34c0 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -46,6 +46,7 @@
 #ifndef NBNXN_CUDA_TYPES_H
 #define NBNXN_CUDA_TYPES_H
 
+#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/mdlib/nbnxn_consts.h"
 #include "gromacs/mdlib/nbnxn_pairlist.h"
@@ -60,6 +61,10 @@ static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
 /*! \brief cluster size = number of atoms per cluster. */
 static const int c_clSize          = c_nbnxnGpuClusterSize;
 
+/*! \brief True if the use of texture fetch in the CUDA kernels is disabled. */
+static const bool c_disableCudaTextures = DISABLE_CUDA_TEXTURES;
+
+
 #ifdef __cplusplus
 extern "C" {
 #endif
-- 
2.11.4.GIT