libc/src/__support/GPU/nvptx/utils.h

   1 //===-------------- NVPTX implementation of GPU utils -----------*- C++ -*-===//
   2 //
   3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   4 // See https://llvm.org/LICENSE.txt for license information.
   5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   6 //
   7 //===----------------------------------------------------------------------===//
   8
   9 #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
  10 #define LLVM_LIBC_SRC___SUPPORT_GPU_NVPTX_IO_H
  11
  12 #include "src/__support/common.h"
  13 #include "src/__support/macros/config.h"
  14
  15 #include <stdint.h>
  16
  17 namespace LIBC_NAMESPACE_DECL {
  18 namespace gpu {
  19
  20 /// Type aliases to the address spaces used by the NVPTX backend. Each alias applies the clang OpenCL address-space attribute of the same name to T.
  21 template <typename T> using Private = [[clang::opencl_private]] T; // T tagged with the 'opencl_private' attribute.
  22 template <typename T> using Constant = [[clang::opencl_constant]] T; // T tagged with the 'opencl_constant' attribute.
  23 template <typename T> using Local = [[clang::opencl_local]] T; // T tagged with the 'opencl_local' attribute.
  24 template <typename T> using Global = [[clang::opencl_global]] T; // T tagged with the 'opencl_global' attribute.
  25
  26 /// Returns the number of CUDA blocks in the 'x' dimension.
  27 LIBC_INLINE uint32_t get_num_blocks_x() {
  28   return __nvvm_read_ptx_sreg_nctaid_x();
  29 }
  30
  31 /// Returns the number of CUDA blocks in the 'y' dimension.
  32 LIBC_INLINE uint32_t get_num_blocks_y() {
  33   return __nvvm_read_ptx_sreg_nctaid_y();
  34 }
  35
  36 /// Returns the number of CUDA blocks in the 'z' dimension.
  37 LIBC_INLINE uint32_t get_num_blocks_z() {
  38   return __nvvm_read_ptx_sreg_nctaid_z();
  39 }
  40
  41 /// Returns the total number of CUDA blocks.
  42 LIBC_INLINE uint64_t get_num_blocks() {
  43   return get_num_blocks_x() * get_num_blocks_y() * get_num_blocks_z();
  44 }
  45
  46 /// Returns the 'x' dimension of the current CUDA block's id.
  47 LIBC_INLINE uint32_t get_block_id_x() { return __nvvm_read_ptx_sreg_ctaid_x(); }
  48
  49 /// Returns the 'y' dimension of the current CUDA block's id.
  50 LIBC_INLINE uint32_t get_block_id_y() { return __nvvm_read_ptx_sreg_ctaid_y(); }
  51
  52 /// Returns the 'z' dimension of the current CUDA block's id.
  53 LIBC_INLINE uint32_t get_block_id_z() { return __nvvm_read_ptx_sreg_ctaid_z(); }
  54
  55 /// Returns the absolute id of the CUDA block.
  56 LIBC_INLINE uint64_t get_block_id() {
  57   return get_block_id_x() + get_num_blocks_x() * get_block_id_y() +
  58          get_num_blocks_x() * get_num_blocks_y() * get_block_id_z();
  59 }
  60
  61 /// Returns the number of CUDA threads in the 'x' dimension.
  62 LIBC_INLINE uint32_t get_num_threads_x() {
  63   return __nvvm_read_ptx_sreg_ntid_x();
  64 }
  65
  66 /// Returns the number of CUDA threads in the 'y' dimension.
  67 LIBC_INLINE uint32_t get_num_threads_y() {
  68   return __nvvm_read_ptx_sreg_ntid_y();
  69 }
  70
  71 /// Returns the number of CUDA threads in the 'z' dimension.
  72 LIBC_INLINE uint32_t get_num_threads_z() {
  73   return __nvvm_read_ptx_sreg_ntid_z();
  74 }
  75
  76 /// Returns the total number of threads in the block.
  77 LIBC_INLINE uint64_t get_num_threads() {
  78   return get_num_threads_x() * get_num_threads_y() * get_num_threads_z();
  79 }
  80
  81 /// Returns the 'x' dimension id of the thread in the current CUDA block.
  82 LIBC_INLINE uint32_t get_thread_id_x() { return __nvvm_read_ptx_sreg_tid_x(); }
  83
  84 /// Returns the 'y' dimension id of the thread in the current CUDA block.
  85 LIBC_INLINE uint32_t get_thread_id_y() { return __nvvm_read_ptx_sreg_tid_y(); }
  86
  87 /// Returns the 'z' dimension id of the thread in the current CUDA block.
  88 LIBC_INLINE uint32_t get_thread_id_z() { return __nvvm_read_ptx_sreg_tid_z(); }
  89
  90 /// Returns the absolute id of the thread in the current CUDA block.
  91 LIBC_INLINE uint64_t get_thread_id() {
  92   return get_thread_id_x() + get_num_threads_x() * get_thread_id_y() +
  93          get_num_threads_x() * get_num_threads_y() * get_thread_id_z();
  94 }
  95
  96 /// Returns the size of a CUDA warp, always 32 on NVIDIA hardware.
  97 LIBC_INLINE uint32_t get_lane_size() { return 32; }
  98
  99 /// Returns the id of the thread inside of a CUDA warp executing together.
 100 [[clang::convergent]] LIBC_INLINE uint32_t get_lane_id() {
 101   return __nvvm_read_ptx_sreg_laneid();
 102 }
 103
 104 /// Returns the bit-mask of active threads in the current warp.
 105 [[clang::convergent]] LIBC_INLINE uint64_t get_lane_mask() {
 106   return __nvvm_activemask();
 107 }
 108
 109 /// Copies the value from the first active thread in the warp to the rest.
 110 [[clang::convergent]] LIBC_INLINE uint32_t broadcast_value(uint64_t lane_mask,
 111                                                            uint32_t x) {
 112   uint32_t mask = static_cast<uint32_t>(lane_mask);
 113   uint32_t id = __builtin_ffs(mask) - 1;
 114   return __nvvm_shfl_sync_idx_i32(mask, x, id, get_lane_size() - 1);
 115 }
 116
 117 /// Returns a bitmask of threads in the current lane for which \p x is true.
 118 [[clang::convergent]] LIBC_INLINE uint64_t ballot(uint64_t lane_mask, bool x) {
 119   uint32_t mask = static_cast<uint32_t>(lane_mask);
 120   return __nvvm_vote_ballot_sync(mask, x);
 121 }
 122
 123 /// Waits for all the threads in the block to converge and issues a fence.
 124 [[clang::convergent]] LIBC_INLINE void sync_threads() { __syncthreads(); }
 125
 126 /// Waits for all pending memory operations to complete in program order.
 127 [[clang::convergent]] LIBC_INLINE void memory_fence() { __nvvm_membar_sys(); }
 128
 129 /// Waits for all threads in the warp to reconverge for independent scheduling.
 130 [[clang::convergent]] LIBC_INLINE void sync_lane(uint64_t mask) {
 131   __nvvm_bar_warp_sync(static_cast<uint32_t>(mask));
 132 }
 133
 134 /// Shuffles the the lanes inside the warp according to the given index.
 135 [[clang::convergent]] LIBC_INLINE uint32_t shuffle(uint64_t lane_mask,
 136                                                    uint32_t idx, uint32_t x) {
 137   uint32_t mask = static_cast<uint32_t>(lane_mask);
 138   uint32_t bitmask = (mask >> idx) & 1;
 139   return -bitmask & __nvvm_shfl_sync_idx_i32(mask, x, idx, get_lane_size() - 1);
 140 }
 141
 142 /// Returns the current value of the GPU's processor clock.
 143 LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
 144
 145 /// Returns a global fixed-frequency timer at nanosecond frequency.
 146 LIBC_INLINE uint64_t fixed_frequency_clock() {
 147   return __builtin_readsteadycounter();
 148 }
 149
 150 /// Terminates execution of the calling thread.
 151 [[noreturn]] LIBC_INLINE void end_program() { __nvvm_exit(); }
 152
 153 /// Returns a unique identifier for the process cluster the current warp is
 154 /// executing on. Here we use the identifier for the symmetric multiprocessor.
 155 LIBC_INLINE uint32_t get_cluster_id() { return __nvvm_read_ptx_sreg_smid(); }
 156
 157 } // namespace gpu
 158 } // namespace LIBC_NAMESPACE_DECL
 159
 160 #endif