From 8be6f1ac64cb0338d55e706e0ff1c60a7ceeaf38 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Wed, 25 Jan 2017 02:53:17 +0100
Subject: [PATCH] Enable compiling CUDA device code with clang

clang can be used as a device compiler by setting GMX_CLANG_CUDA=ON. A
CUDA toolkit (>=7.0) is also needed. Workarounds required:
- texture operations are not supported, use the LDG/direct load-based
  fallback in such cases;
- CMake does not natively support clang as a CUDA compiler, but it can
  be convinced by treating *.cu files as CXX with a few extra flags.

Note that clang support is experimental; it is aimed at improving
portability and at allowing the use of clang sanitizers without hassle
in CUDA builds.

TODO/investigate:
- CMake seems to not track some files properly with clang, changes
  to nbnxn_cuda_kernel{,_fermi}.cuh do not trigger a recompile (likely
  due to the indirect include through a macro in nbnxn_cuda_kernels.cuh).
- Full rebuild is triggered even if only CUDA compile flags are changed.

Change-Id: I3543469d9f0fda37c186ba8bb474980018bd5c54
---
 CMakeLists.txt                                     |   4 +
 admin/builds/gromacs.py                            |   6 +-
 admin/builds/post-submit-matrix.txt                |   5 +-
 .../gmxClangCudaUtils.cmake                        |  40 ++------
 cmake/gmxManageClangCudaConfig.cmake               | 107 +++++++++++++++++++++
 cmake/gmxManageGPU.cmake                           |  91 ++++++++++++------
 docs/dev-manual/build-system.rst                   |   8 ++
 docs/install-guide/index.rst                       |  20 ++++
 src/CMakeLists.txt                                 |   9 +-
 src/buildinfo.h.cmakein                            |  10 +-
 src/gromacs/CMakeLists.txt                         |  20 +++-
 src/gromacs/gpu_utils/cuda_arch_utils.cuh          |   4 +-
 src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu         |  25 ++++-
 .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh   |   7 +-
 src/gromacs/utility/binaryinformation.cpp          |   4 +-
 15 files changed, 270 insertions(+), 90 deletions(-)
 copy src/CMakeLists.txt => cmake/gmxClangCudaUtils.cmake (60%)
 create mode 100644 cmake/gmxManageClangCudaConfig.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3aa958f4b1..4d6d437327 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -574,6 +574,10 @@ include(gmxManageLmfit)
 if(GMX_GPU)
     # now that we have detected the dependencies, do the second configure pass
     gmx_gpu_setup()
+    if (GMX_CLANG_CUDA)
+        list(APPEND GMX_EXTRA_LIBRARIES ${GMX_CUDA_CLANG_LINK_LIBS})
+        link_directories("${GMX_CUDA_CLANG_LINK_DIRS}")
+    endif()
 endif()
 
 if(CYGWIN)
diff --git a/admin/builds/gromacs.py b/admin/builds/gromacs.py
index 657a14f96a..06640c16f1 100644
--- a/admin/builds/gromacs.py
+++ b/admin/builds/gromacs.py
@@ -50,6 +50,7 @@ extra_options = {
     'thread-mpi': Option.bool,
     'gpu': Option.bool,
     'opencl': Option.bool,
+    'clang_cuda': Option.bool,
     'openmp': Option.bool,
     'nranks': Option.string,
     'npme': Option.string,
@@ -99,7 +100,10 @@ def do_build(context):
             cmake_opts['GMX_USE_OPENCL'] = 'ON'
         else:
             cmake_opts['CUDA_TOOLKIT_ROOT_DIR'] = context.env.cuda_root
-            cmake_opts['CUDA_HOST_COMPILER'] = context.env.cuda_host_compiler
+            if context.opts.clang_cuda:
+                cmake_opts['GMX_CLANG_CUDA'] = 'ON'
+            else:
+                cmake_opts['CUDA_HOST_COMPILER'] = context.env.cuda_host_compiler
     else:
         cmake_opts['GMX_GPU'] = 'OFF'
     if context.opts.thread_mpi is False:
diff --git a/admin/builds/post-submit-matrix.txt b/admin/builds/post-submit-matrix.txt
index eabee21af5..fa6babcb7d 100644
--- a/admin/builds/post-submit-matrix.txt
+++ b/admin/builds/post-submit-matrix.txt
@@ -35,9 +35,8 @@ gcc-7 npme=1 nranks=2 no-openmp fftpack release-with-assert
 
 # Test SSE4.1 SIMD
 # Test single-rank GPU
-# TODO Test clang + OpenMP + CUDA
-# TODO change to clang-4 and cuda-8.0
-gcc-4.8 openmp nranks=1 gpu cuda-7.5 simd=sse4.1
+# Test clang + OpenMP + CUDA
+clang-4 simd=sse4.1 openmp nranks=1 gpu cuda-8.0 clang_cuda
 
 # Test MPMD PME with library MPI
 # Test clang + OpenMP
diff --git a/src/CMakeLists.txt b/cmake/gmxClangCudaUtils.cmake
similarity index 60%
copy from src/CMakeLists.txt
copy to cmake/gmxClangCudaUtils.cmake
index f120b01add..e47a25766d 100644
--- a/src/CMakeLists.txt
+++ b/cmake/gmxClangCudaUtils.cmake
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2009,2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+# Copyright (c) 2017, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -32,35 +32,9 @@
 # To help us fund GROMACS development, we humbly ask that you cite
 # the research papers on the package. Check out http://www.gromacs.org.
 
-######################################
-# Output compiler and CFLAGS used
-######################################
-include(GetCompilerInfo.cmake)
-get_compiler_info(C BUILD_C_COMPILER BUILD_CFLAGS)
-get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
-if(GMX_USE_CUDA)
-    GMX_SET_CUDA_NVCC_FLAGS()
-    get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
-endif()
-
-configure_file(config.h.cmakein config.h)
-configure_file(gmxpre-config.h.cmakein gmxpre-config.h)
-configure_file(buildinfo.h.cmakein buildinfo.h ESCAPE_QUOTES)
-
-if (BUILD_TESTING)
-    if(NOT GMX_DEVELOPER_BUILD)
-        set(UNITTEST_TARGET_OPTIONS EXCLUDE_FROM_ALL)
-    endif()
-    if (GMX_BUILD_UNITTESTS)
-        add_subdirectory(external/gmock-1.7.0)
-    endif()
-    include(testutils/TestMacros.cmake)
-    add_subdirectory(testutils)
-endif()
-
-add_subdirectory(gromacs)
-add_subdirectory(programs)
-
-if (NOT GMX_FAHCORE)
-    add_subdirectory(contrib)
-endif()
+function(gmx_compile_cuda_file_with_clang)
+    # Treat every listed CUDA source as C++ and attach the clang CUDA flags.
+    foreach(_cuda_source IN LISTS ARGN)
+        set_source_files_properties(${_cuda_source} PROPERTIES LANGUAGE CXX COMPILE_FLAGS "${GMX_CUDA_CLANG_FLAGS}")
+    endforeach()
+endfunction()
diff --git a/cmake/gmxManageClangCudaConfig.cmake b/cmake/gmxManageClangCudaConfig.cmake
new file mode 100644
index 0000000000..40a492b002
--- /dev/null
+++ b/cmake/gmxManageClangCudaConfig.cmake
@@ -0,0 +1,107 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2017, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+function (gmx_test_clang_cuda_support)
+    # With GMX_CLANG_CUDA=ON the C++ compiler must be clang, version 3.9 or
+    # later; anything else is rejected at configure time.
+    if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") OR (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "3.9"))
+        message(FATAL_ERROR "clang 3.9 or later required with GMX_CLANG_CUDA=ON!")
+    endif()
+
+    # NOTE: ideally this would be a compile check, but the link stage of such a
+    # check fails: the generated clang invocation does not seem to cope with
+    # the GPU code embedded in the object file produced during compilation.
+    # SET(CMAKE_REQUIRED_FLAGS ${FLAGS})
+    # SET(CMAKE_REQUIRED_LIBRARIES ${LIBS})
+    # CHECK_CXX_SOURCE_COMPILES("int main() { int c; cudaGetDeviceCount(&c); return 0; }" _CLANG_CUDA_COMPILES)
+endfunction ()
+
+
+if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.0" AND
+    NOT CUDA_VERSION VERSION_LESS "8.0")
+    message(FATAL_ERROR "clang ${CMAKE_CXX_COMPILER_VERSION} for CUDA is only compatible with CUDA version <8.0")
+endif()
+
+if (GMX_CUDA_TARGET_COMPUTE)
+    message(WARNING "Values passed in GMX_CUDA_TARGET_COMPUTE will be ignored; clang will by default include PTX in the binary.")
+endif()
+
+if (GMX_CUDA_TARGET_SM)
+    set(_CUDA_CLANG_GENCODE_FLAGS)
+    set(_target_sm_list ${GMX_CUDA_TARGET_SM})
+    foreach(_target ${_target_sm_list})
+        list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_${_target}")
+    endforeach()
+else()
+    list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_20")
+    list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_30")
+    list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_35")
+    list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_37")
+    list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_50")
+    list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_52")
+    if (NOT CUDA_VERSION VERSION_LESS 8.0)
+        list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_60")
+        list(APPEND _CUDA_CLANG_GENCODE_FLAGS "--cuda-gpu-arch=sm_61")
+    endif()
+    # TODO: test CUDA 9.0 and figure out which clang releases support it
+    #       and the sm_70 arch.
+endif()
+if (GMX_CUDA_TARGET_SM)
+    set_property(CACHE GMX_CUDA_TARGET_SM PROPERTY HELPSTRING "List of CUDA GPU architecture codes to compile for (without the sm_ prefix)")
+    set_property(CACHE GMX_CUDA_TARGET_SM PROPERTY TYPE STRING)
+endif()
+
+# default flags
+list(APPEND _CUDA_CLANG_FLAGS "-x cuda" "-ffast-math")
+# CUDA toolkit
+list(APPEND _CUDA_CLANG_FLAGS "--cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+# codegen flags
+list(APPEND _CUDA_CLANG_FLAGS "${_CUDA_CLANG_GENCODE_FLAGS}")
+foreach(_flag ${_CUDA_CLANG_FLAGS})
+    set(GMX_CUDA_CLANG_FLAGS "${GMX_CUDA_CLANG_FLAGS} ${_flag}")
+endforeach()
+
+if (CUDA_USE_STATIC_CUDA_RUNTIME)
+    set(GMX_CUDA_CLANG_LINK_LIBS "cudart_static")
+else()
+    set(GMX_CUDA_CLANG_LINK_LIBS "cudart")
+endif()
+set(GMX_CUDA_CLANG_LINK_LIBS "${GMX_CUDA_CLANG_LINK_LIBS}" "dl" "rt")
+if (CUDA_64_BIT_DEVICE_CODE)
+    set(GMX_CUDA_CLANG_LINK_DIRS "${CUDA_TOOLKIT_ROOT_DIR}/lib64")
+else()
+    set(GMX_CUDA_CLANG_LINK_DIRS "${CUDA_TOOLKIT_ROOT_DIR}/lib")
+endif()
+
+gmx_test_clang_cuda_support()
diff --git a/cmake/gmxManageGPU.cmake b/cmake/gmxManageGPU.cmake
index 60a74291ec..435ed0c17e 100644
--- a/cmake/gmxManageGPU.cmake
+++ b/cmake/gmxManageGPU.cmake
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
+# Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -42,6 +42,12 @@ if (NOT DEFINED GMX_GPU)
 endif()
 option(GMX_GPU "Enable GPU acceleration" OFF)
 
+option(GMX_CLANG_CUDA "Use clang for CUDA" OFF)
+if (GMX_CLANG_CUDA)
+    # CUDA 7.0 or later required, override req. version
+    set(REQUIRED_CUDA_VERSION 7.0)
+endif()
+
 if(GMX_GPU AND GMX_DOUBLE)
     message(FATAL_ERROR "GPU acceleration is not available in double precision!")
 endif()
@@ -174,44 +180,59 @@ endif()
 #   COMPILER_FLAGS  - [output variable] flags for the compiler
 #
 macro(get_cuda_compiler_info COMPILER_INFO COMPILER_FLAGS)
-    if(CUDA_NVCC_EXECUTABLE)
+    if(NOT GMX_CLANG_CUDA)
+        if(CUDA_NVCC_EXECUTABLE)
 
-        # Get the nvcc version string. This is multi-line, but since it is only 4 lines
-        # and might change in the future it is better to store than trying to parse out
-        # the version from the current format.
-        execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} --version
-            RESULT_VARIABLE _nvcc_version_res
-            OUTPUT_VARIABLE _nvcc_version_out
-            ERROR_VARIABLE  _nvcc_version_err
-            OUTPUT_STRIP_TRAILING_WHITESPACE)
-        if (${_nvcc_version_res} EQUAL 0)
-            # Fix multi-line mess: Replace newline with ";" so we can use it in a define
-            string(REPLACE "\n" ";" _nvcc_info_singleline ${_nvcc_version_out})
-            SET(${COMPILER_INFO} "${CUDA_NVCC_EXECUTABLE} ${_nvcc_info_singleline}")
-            string(TOUPPER ${CMAKE_BUILD_TYPE} _build_type)
-            SET(_compiler_flags "${CUDA_NVCC_FLAGS_${_build_type}}")
-            if(CUDA_PROPAGATE_HOST_FLAGS)
-                string(REGEX REPLACE "[ ]+" ";" _cxx_flags_nospace "${BUILD_CXXFLAGS}")
+            # Get the nvcc version string. This is multi-line, but since it is only 4 lines
+            # and might change in the future it is better to store than trying to parse out
+            # the version from the current format.
+            execute_process(COMMAND ${CUDA_NVCC_EXECUTABLE} --version
+                RESULT_VARIABLE _nvcc_version_res
+                OUTPUT_VARIABLE _nvcc_version_out
+                ERROR_VARIABLE  _nvcc_version_err
+                OUTPUT_STRIP_TRAILING_WHITESPACE)
+            if (${_nvcc_version_res} EQUAL 0)
+                # Fix multi-line mess: Replace newline with ";" so we can use it in a define
+                string(REPLACE "\n" ";" _nvcc_info_singleline ${_nvcc_version_out})
+                SET(${COMPILER_INFO} "${CUDA_NVCC_EXECUTABLE} ${_nvcc_info_singleline}")
+                string(TOUPPER ${CMAKE_BUILD_TYPE} _build_type)
+                SET(_compiler_flags "${CUDA_NVCC_FLAGS_${_build_type}}")
+                if(CUDA_PROPAGATE_HOST_FLAGS)
+                    string(REGEX REPLACE "[ ]+" ";" _cxx_flags_nospace "${BUILD_CXXFLAGS}")
+                endif()
+                SET(${COMPILER_FLAGS} "${CUDA_NVCC_FLAGS}${CUDA_NVCC_FLAGS_${_build_type}}; ${_cxx_flags_nospace}")
+            else()
+                SET(${COMPILER_INFO} "N/A")
+                SET(${COMPILER_FLAGS} "N/A")
             endif()
-            SET(${COMPILER_FLAGS} "${CUDA_NVCC_FLAGS}${CUDA_NVCC_FLAGS_${_build_type}}; ${_cxx_flags_nospace}")
-        else()
-            SET(${COMPILER_INFO} "N/A")
-            SET(${COMPILER_FLAGS} "N/A")
         endif()
+    else()
+        # CXX compiler is the CUDA compiler: report it, its build-type flags and the extra CUDA-only flags
+        string(TOUPPER "${CMAKE_BUILD_TYPE}" _build_type)
+        set(${COMPILER_INFO} "${CMAKE_CXX_COMPILER}  ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+        set(${COMPILER_FLAGS} "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_build_type}} ${GMX_CUDA_CLANG_FLAGS}")
     endif()
 endmacro ()
 
+macro(enable_multiple_cuda_compilation_units)
+    message(STATUS "Enabling multiple compilation units for the CUDA non-bonded module.")
+    set_property(CACHE GMX_CUDA_NB_SINGLE_COMPILATION_UNIT PROPERTY VALUE OFF)
+endmacro()
+
 include(CMakeDependentOption)
 include(gmxOptionUtilities)
 macro(gmx_gpu_setup)
     if(GMX_GPU)
-        if(NOT CUDA_NVCC_EXECUTABLE)
-            message(FATAL_ERROR "nvcc is required for a CUDA build, please set CUDA_TOOLKIT_ROOT_DIR appropriately")
+        if(NOT GMX_CLANG_CUDA)
+            if(NOT CUDA_NVCC_EXECUTABLE)
+                message(FATAL_ERROR "nvcc is required for a CUDA build, please set CUDA_TOOLKIT_ROOT_DIR appropriately")
+            endif()
+            # set up nvcc options
+            include(gmxManageNvccConfig)
+        else()
+            include(gmxManageClangCudaConfig)
         endif()
 
-        # set up nvcc options
-        include(gmxManageNvccConfig)
-
         gmx_check_if_changed(_cuda_version_changed CUDA_VERSION)
 
         # Generate CUDA RT API version string which will end up in config.h
@@ -249,19 +270,25 @@ macro(gmx_gpu_setup)
         endif()
     endif() # GMX_GPU
 
+    if (GMX_CLANG_CUDA)
+        set (_GMX_CUDA_NB_SINGLE_COMPILATION_UNIT_DEFAULT FALSE)
+    else()
+        set (_GMX_CUDA_NB_SINGLE_COMPILATION_UNIT_DEFAULT TRUE)
+    endif()
     cmake_dependent_option(GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
-        "Whether to compile the CUDA non-bonded module using a single compilation unit." ON
+        "Whether to compile the CUDA non-bonded module using a single compilation unit." ${_GMX_CUDA_NB_SINGLE_COMPILATION_UNIT_DEFAULT}
         "GMX_GPU" ON)
     mark_as_advanced(GMX_CUDA_NB_SINGLE_COMPILATION_UNIT)
 
-    if (GMX_GPU)
+    if (GMX_GPU AND NOT GMX_CLANG_CUDA)
         # We need to use single compilation unit for kernels:
-        # - when compiling for CC 2.x devices where buggy kernel code is generated
+        # when compiling with nvcc for CC 2.x devices where buggy kernel code is generated
         gmx_check_if_changed(_gmx_cuda_target_changed GMX_CUDA_TARGET_SM GMX_CUDA_TARGET_COMPUTE CUDA_NVCC_FLAGS)
+
         if(_gmx_cuda_target_changed OR NOT GMX_GPU_DETECTION_DONE)
             if((NOT GMX_CUDA_TARGET_SM AND NOT GMX_CUDA_TARGET_COMPUTE) OR
-               (GMX_CUDA_TARGET_SM MATCHES "2[01]" OR GMX_CUDA_TARGET_COMPUTE MATCHES "2[01]"))
-               message(STATUS "Enabling single compilation unit for the CUDA non-bonded module. Multiple compilation units are not compatible with CC 2.x devices, to enable the feature specify only CC >=3.0 target architectures in GMX_CUDA_TARGET_SM/GMX_CUDA_TARGET_COMPUTE.")
+                (GMX_CUDA_TARGET_SM MATCHES "2[01]" OR GMX_CUDA_TARGET_COMPUTE MATCHES "2[01]"))
+                message(STATUS "Enabling single compilation unit for the CUDA non-bonded module. Multiple compilation units are not compatible with CC 2.x devices, to enable the feature specify only CC >=3.0 target architectures in GMX_CUDA_TARGET_SM/GMX_CUDA_TARGET_COMPUTE.")
                 set_property(CACHE GMX_CUDA_NB_SINGLE_COMPILATION_UNIT PROPERTY VALUE ON)
             else()
                 message(STATUS "Enabling multiple compilation units for the CUDA non-bonded module.")
diff --git a/docs/dev-manual/build-system.rst b/docs/dev-manual/build-system.rst
index b055116cf6..459a9a737a 100644
--- a/docs/dev-manual/build-system.rst
+++ b/docs/dev-manual/build-system.rst
@@ -257,6 +257,14 @@ Variables affecting compilation/linking
 
 .. cmake:: GMX_GPU
 
+.. cmake:: GMX_CLANG_CUDA
+
+   Use clang for compiling CUDA GPU code, both host and device.
+
+.. cmake:: GMX_CUDA_CLANG_FLAGS
+
+    Pass additional CUDA-only compiler flags to clang using this variable.
+
 .. cmake:: GMX_LIB_INSTALL_DIR
 
    Sets the installation directory for libraries (default is determined by
diff --git a/docs/install-guide/index.rst b/docs/install-guide/index.rst
index 98b6fa8b07..e7c4a0110a 100644
--- a/docs/install-guide/index.rst
+++ b/docs/install-guide/index.rst
@@ -612,6 +612,26 @@ Linux, Mac OS X and Windows operating systems, but Linux is the
 best-tested and supported of these. Linux running on POWER 8, ARM v7 and v8
 CPUs also works well.
 
+Experimental support is available for compiling CUDA code, both for host and
+device, using clang (version 3.9 or later).
+A CUDA toolkit (>= v7.0) is still required but it is used only for GPU device code
+generation and to link against the CUDA runtime library.
+The clang CUDA support simplifies compilation and provides benefits for development
+(e.g. it allows the use of code sanitizers in CUDA host code).
+Additionally, using clang for both CPU and GPU compilation can be beneficial
+to avoid compatibility issues between the GNU toolchain and the CUDA toolkit.
+clang for CUDA can be enabled using the ``GMX_CLANG_CUDA=ON`` CMake option.
+Target architectures can be selected with ``GMX_CUDA_TARGET_SM``;
+virtual architecture code is always embedded for all requested architectures
+(hence ``GMX_CUDA_TARGET_COMPUTE`` is ignored).
+Note that this is mainly a developer-oriented feature and it is not recommended
+for production use as the performance can be significantly lower than that
+of code compiled with nvcc (and it has also received less testing).
+However, note that with clang 5.0 the performance gap is significantly narrowed
+(at the time of writing, about 20% slower GPU kernels), so this version
+could be considered in non performance-critical use-cases.
+
+
 OpenCL GPU acceleration
 ^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f120b01add..517ca813bc 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2009,2010,2011,2012,2013,2014,2015,2016, by the GROMACS development team, led by
+# Copyright (c) 2009,2010,2011,2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -39,8 +39,11 @@ include(GetCompilerInfo.cmake)
 get_compiler_info(C BUILD_C_COMPILER BUILD_CFLAGS)
 get_compiler_info(CXX BUILD_CXX_COMPILER BUILD_CXXFLAGS)
 if(GMX_USE_CUDA)
-    GMX_SET_CUDA_NVCC_FLAGS()
-    get_cuda_compiler_info(CUDA_NVCC_COMPILER_INFO CUDA_NVCC_COMPILER_FLAGS)
+    if(NOT GMX_CLANG_CUDA)
+        GMX_SET_CUDA_NVCC_FLAGS()
+    endif()
+
+    get_cuda_compiler_info(CUDA_COMPILER_INFO CUDA_COMPILER_FLAGS)
 endif()
 
 configure_file(config.h.cmakein config.h)
diff --git a/src/buildinfo.h.cmakein b/src/buildinfo.h.cmakein
index 1f03cdf30a..21a31e23b3 100644
--- a/src/buildinfo.h.cmakein
+++ b/src/buildinfo.h.cmakein
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -90,11 +90,11 @@
 /** Location of data files in the installation directory */
 #define DATA_INSTALL_DIR        "@DATA_INSTALL_DIR@"
 
-/** CUDA nvcc compiler version information */
-#define CUDA_NVCC_COMPILER_INFO "@CUDA_NVCC_COMPILER_INFO@"
+/** CUDA compiler version information */
+#define CUDA_COMPILER_INFO "@CUDA_COMPILER_INFO@"
 
-/** CUDA nvcc compiler flags */
-#define CUDA_NVCC_COMPILER_FLAGS "@CUDA_NVCC_COMPILER_FLAGS@"
+/** CUDA compiler flags */
+#define CUDA_COMPILER_FLAGS "@CUDA_COMPILER_FLAGS@"
 
 /** OpenCL include dir */
 #define OPENCL_INCLUDE_DIR "@OPENCL_INCLUDE_DIR@"
diff --git a/src/gromacs/CMakeLists.txt b/src/gromacs/CMakeLists.txt
index 4ee526788a..ba8f6d2a55 100644
--- a/src/gromacs/CMakeLists.txt
+++ b/src/gromacs/CMakeLists.txt
@@ -34,6 +34,10 @@
 
 set(LIBGROMACS_SOURCES)
 
+if (GMX_CLANG_CUDA)
+    include(gmxClangCudaUtils)
+endif()
+
 set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
 set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
 
@@ -151,11 +155,25 @@ gmx_configure_version_file(
     REMOTE_HASH)
 list(APPEND LIBGROMACS_SOURCES ${GENERATED_VERSION_FILE})
 
+# set up CUDA compilation with clang
+if (GMX_CLANG_CUDA)
+    foreach (_file ${LIBGROMACS_SOURCES})
+        get_filename_component(_ext ${_file} EXT)
+        if (${_ext} STREQUAL ".cu")
+            gmx_compile_cuda_file_with_clang(${_file})
+        endif()
+    endforeach()
+endif()
+
 if (GMX_USE_CUDA)
     # Work around FindCUDA that prevents using target_link_libraries()
     # with keywords otherwise...
     set(CUDA_LIBRARIES PRIVATE ${CUDA_LIBRARIES})
-    cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
+    if (NOT GMX_CLANG_CUDA)
+        cuda_add_library(libgromacs ${LIBGROMACS_SOURCES})
+    else()
+        add_library(libgromacs ${LIBGROMACS_SOURCES})
+    endif()
 else()
     add_library(libgromacs ${LIBGROMACS_SOURCES})
 endif()
diff --git a/src/gromacs/gpu_utils/cuda_arch_utils.cuh b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
index 4639acdd23..e1bf50cc3b 100644
--- a/src/gromacs/gpu_utils/cuda_arch_utils.cuh
+++ b/src/gromacs/gpu_utils/cuda_arch_utils.cuh
@@ -140,11 +140,13 @@ T gmx_shfl_down_sync(const unsigned int activeMask,
 
 /*! \brief Allow disabling CUDA textures using the GMX_DISABLE_CUDA_TEXTURES macro.
  *
+ *  Textures are also disabled with clang, which lacks texture support (all versions up to 5.0-dev as of writing).
+ *
  *  This option will not influence functionality. All features using textures ought
  *  to have fallback for texture-less reads (direct/LDG loads), all new code needs
  *  to provide fallback code.
  */
-#if defined GMX_DISABLE_CUDA_TEXTURES
+#if defined(GMX_DISABLE_CUDA_TEXTURES) || (defined(__clang__) && defined(__CUDA__))
 #define DISABLE_CUDA_TEXTURES 1
 #else
 #define DISABLE_CUDA_TEXTURES 0
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index bb72046172..bcdd4aed0c 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -128,8 +128,8 @@ texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
  * build-time checks to prevent this, the user could manually tweaks nvcc flags
  * which would lead to buggy kernels getting compiled.
  */
-#if GMX_PTX_ARCH > 0 && GMX_PTX_ARCH <= 210
-#error Due to an CUDA compiler bug, the CUDA non-bonded module can not be compiled with multiple compilation units for CC 2.x devices. If you have changed the nvcc flags manually, either use the GMX_CUDA_TARGET_* variables instead or set GMX_CUDA_NB_SINGLE_COMPILATION_UNIT=ON CMake option.
+#if GMX_PTX_ARCH > 0 && GMX_PTX_ARCH <= 210 && !defined(__clang__)
+#error Due to an CUDA nvcc compiler bug, the CUDA non-bonded module can not be compiled with multiple compilation units for CC 2.x devices. If you have changed the nvcc flags manually, either use the GMX_CUDA_TARGET_* variables instead or set GMX_CUDA_NB_SINGLE_COMPILATION_UNIT=ON CMake option.
 #endif
 #endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
 
@@ -932,21 +932,36 @@ void nbnxn_gpu_wait_for_gpu(gmx_nbnxn_cuda_t *nb,
     plist->haveFreshList = false;
 }
 
+/*! \brief Return the reference to the nbfp texture.
+ *
+ *  Note: it can return junk when c_disableCudaTextures==true, but we don't
+ *  assert on that condition because the data_mgmt module ends up calling this
+ *  function even if texture references are not used.
+ */
 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref()
 {
-    assert(!c_disableCudaTextures);
     return nbfp_texref;
 }
 
+/*! \brief Return the reference to the nbfp_comb texture.
+ *
+ *  Note: it can return junk when c_disableCudaTextures==true, but we don't
+ *  assert on that condition because the data_mgmt module ends up calling this
+ *  function even if texture references are not used.
+ */
 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
 {
-    assert(!c_disableCudaTextures);
     return nbfp_comb_texref;
 }
 
+/*! \brief Return the reference to the coulomb_tab.
+ *
+ *  Note: it can return junk when c_disableCudaTextures==true, but we don't
+ *  assert on that condition because the data_mgmt module ends up calling this
+ *  function even if texture references are not used.
+ */
 const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
 {
-    assert(!c_disableCudaTextures);
     return coulomb_tab_texref;
 }
 
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index 71f1901434..2626bf101d 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -706,14 +706,13 @@ void reduce_energy_pow2(volatile float *buf,
                         float *e_lj, float *e_el,
                         unsigned int tidx)
 {
-    int     i, j;
-    float   e1, e2;
+    float        e1, e2;
 
-    i = warp_size/2;
+    unsigned int i = warp_size/2;
 
     /* Can't just use i as loop variable because than nvcc refuses to unroll. */
 #pragma unroll 10
-    for (j = warp_size_log2 - 1; j > 0; j--)
+    for (int j = warp_size_log2 - 1; j > 0; j--)
     {
         if (tidx < i)
         {
diff --git a/src/gromacs/utility/binaryinformation.cpp b/src/gromacs/utility/binaryinformation.cpp
index a222478069..efa2c0f287 100644
--- a/src/gromacs/utility/binaryinformation.cpp
+++ b/src/gromacs/utility/binaryinformation.cpp
@@ -302,8 +302,8 @@ void gmx_print_version_info(gmx::TextWriter *writer)
     writer->writeLine(formatString("OpenCL version:     %s", OPENCL_VERSION_STRING));
 #endif
 #if GMX_GPU == GMX_GPU_CUDA
-    writer->writeLine(formatString("CUDA compiler:      %s\n", CUDA_NVCC_COMPILER_INFO));
-    writer->writeLine(formatString("CUDA compiler flags:%s\n", CUDA_NVCC_COMPILER_FLAGS));
+    writer->writeLine(formatString("CUDA compiler:      %s\n", CUDA_COMPILER_INFO));
+    writer->writeLine(formatString("CUDA compiler flags:%s\n", CUDA_COMPILER_FLAGS));
     auto driverVersion = gmx::getCudaDriverVersion();
     writer->writeLine(formatString("CUDA driver:        %d.%d\n", driverVersion.first, driverVersion.second));
     auto runtimeVersion = gmx::getCudaRuntimeVersion();
-- 
2.11.4.GIT