/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9 #ifndef __CLANG_CUDA_INTRINSICS_H__
10 #define __CLANG_CUDA_INTRINSICS_H__
// Reject non-CUDA compilations. The #error must be conditional; unguarded it
// would fire on every inclusion.
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif
15 // sm_30 intrinsics: __shfl_{up,down,xor}.
17 #define __SM_30_INTRINSICS_H__
18 #define __SM_30_INTRINSICS_HPP__
20 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
22 #pragma push_macro("__MAKE_SHUFFLES")
// Generates the overload set of the legacy (non-_sync) warp shuffle __FnName
// for int, float, unsigned int, long long, long, unsigned long,
// unsigned long long and double.
//   __IntIntrinsic / __FloatIntrinsic: builtins used for the int / float case.
//   __Mask: value OR-ed into the low bits of the builtin's last operand.
//   __Type: type of the __offset parameter.
// The last operand handed to the builtin is
// ((warpSize - __width) << 8) | __Mask. The 64-bit overloads copy the value
// into two ints with memcpy and shuffle each half separately; every other
// overload casts and forwards to one of those.
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
                        __Type)                                                \
  inline __device__ int __FnName(int __val, __Type __offset,                   \
                                 int __width = warpSize) {                     \
    return __IntIntrinsic(__val, __offset,                                     \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(float __val, __Type __offset,               \
                                   int __width = warpSize) {                   \
    return __FloatIntrinsic(__val, __offset,                                   \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(static_cast<int>(__val), __offset, __width));               \
  }                                                                            \
  inline __device__ long long __FnName(long long __val, __Type __offset,       \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ long __FnName(long __val, __Type __offset,                 \
                                  int __width = warpSize) {                    \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<int>(__val), __offset, __width));             \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned long __val, __Type __offset, int __width = warpSize) {          \
    return static_cast<unsigned long>(                                         \
        ::__FnName(static_cast<long>(__val), __offset, __width));              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
    return static_cast<unsigned long long>(                                    \
        ::__FnName(static_cast<long long>(__val), __offset, __width));         \
  }                                                                            \
  inline __device__ double __FnName(double __val, __Type __offset,             \
                                    int __width = warpSize) {                  \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__tmp, __offset, __width);                              \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }
// Instantiate the legacy shuffle overload sets. The __Type (offset type)
// arguments match the corresponding *_sync instantiations in this file.
__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
97 #pragma pop_macro("__MAKE_SHUFFLES")
99 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
101 #if CUDA_VERSION >= 9000
102 #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
103 // __shfl_sync_* variants available in CUDA-9
104 #pragma push_macro("__MAKE_SYNC_SHUFFLES")
// Generates the overload set of the *_sync warp shuffle __FnName for int,
// float, unsigned int, long long, unsigned long long, long, unsigned long and
// double. Every overload takes the participant mask __mask as its first
// parameter and passes it through unchanged.
//   __IntIntrinsic / __FloatIntrinsic: builtins used for the int / float case.
//   __Mask: value OR-ed into the low bits of the builtin's last operand.
//   __Type: type of the __offset parameter.
// The last operand handed to the builtin is
// ((warpSize - __width) << 8) | __Mask. The 64-bit overloads copy the value
// into two ints with memcpy and shuffle each half separately.
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
                             __Mask, __Type)                                   \
  inline __device__ int __FnName(unsigned int __mask, int __val,               \
                                 __Type __offset, int __width = warpSize) {    \
    return __IntIntrinsic(__mask, __val, __offset,                             \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(unsigned int __mask, float __val,           \
                                   __Type __offset, int __width = warpSize) {  \
    return __FloatIntrinsic(__mask, __val, __offset,                           \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
  }                                                                            \
  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
                                       __Type __offset,                        \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned int __mask, unsigned long long __val, __Type __offset,          \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long long>(                                    \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  }                                                                            \
  inline __device__ long __FnName(unsigned int __mask, long __val,             \
                                  __Type __offset, int __width = warpSize) {   \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(::__FnName(                                     \
          __mask, static_cast<long long>(__val), __offset, __width));          \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned int __mask, unsigned long __val, __Type __offset,               \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long>(                                         \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
  }                                                                            \
  inline __device__ double __FnName(unsigned int __mask, double __val,         \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }
// Instantiate the *_sync shuffle overload sets.
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
183 #pragma pop_macro("__MAKE_SYNC_SHUFFLES")
// Forwards `mask` to __nvvm_bar_warp_sync. The default mask has all 32 bits
// set.
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  __nvvm_bar_warp_sync(mask);
}
// Forwards barrier `id` to __nvvm_barrier_sync.
inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}
// Forwards barrier `id` and `count` to __nvvm_barrier_sync_cnt.
inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}
// Returns __nvvm_vote_all_sync(mask, pred).
inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}
// Returns __nvvm_vote_any_sync(mask, pred).
inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}
// Returns __nvvm_vote_uni_sync(mask, pred).
inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}
// Returns __nvvm_vote_ballot_sync(mask, pred).
inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}
// Returns the active-lane mask. With CUDA_VERSION < 9020 it is obtained from
// __nvvm_vote_ballot(1); otherwise from the PTX activemask.b32 instruction.
inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  unsigned int mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
#endif
}
// Returns __nvvm_fns(mask, base, offset).
inline __device__ unsigned int __fns(unsigned mask, unsigned base,
                                     int offset) {
  return __nvvm_fns(mask, base, offset);
}
228 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
230 // Define __match* builtins CUDA-9 headers expect to see.
231 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
// Returns __nvvm_match_any_sync_i32(mask, value).
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}
// Returns __nvvm_match_any_sync_i64(mask, value).
inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}
// Returns __nvvm_match_all_sync_i32p(mask, value, pred); `pred` is handed to
// the builtin as an out-pointer.
inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}
// Returns __nvvm_match_all_sync_i64p(mask, value, pred); `pred` is handed to
// the builtin as an out-pointer.
inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
251 #include "crt/sm_70_rt.hpp"
253 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000
256 // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
258 // Prevent the vanilla sm_32 intrinsics header from being included.
259 #define __SM_32_INTRINSICS_H__
260 #define __SM_32_INTRINSICS_HPP__
262 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
// char load via __nvvm_ldg_c.
inline __device__ char __ldg(const char *ptr) {
  return __nvvm_ldg_c(ptr);
}
// short load via __nvvm_ldg_s.
inline __device__ short __ldg(const short *ptr) {
  return __nvvm_ldg_s(ptr);
}
// int load via __nvvm_ldg_i.
inline __device__ int __ldg(const int *ptr) {
  return __nvvm_ldg_i(ptr);
}
// long load via __nvvm_ldg_l.
inline __device__ long __ldg(const long *ptr) {
  return __nvvm_ldg_l(ptr);
}
// long long load via __nvvm_ldg_ll.
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
// unsigned char load via __nvvm_ldg_uc.
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
// signed char load: reuses the unsigned char builtin on the same address; the
// loaded value is converted to signed char on return.
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
// unsigned short load via __nvvm_ldg_us.
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
// unsigned int load via __nvvm_ldg_ui.
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
// unsigned long load via __nvvm_ldg_ul.
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
// unsigned long long load via __nvvm_ldg_ull.
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
// float load via __nvvm_ldg_f.
inline __device__ float __ldg(const float *ptr) {
  return __nvvm_ldg_f(ptr);
}
// double load via __nvvm_ldg_d.
inline __device__ double __ldg(const double *ptr) {
  return __nvvm_ldg_d(ptr);
}
// Loads a char2 via __nvvm_ldg_c2 (which works on a 2-element ext_vector of
// char) and repacks the lanes into a char2.
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment. This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a char4 via __nvvm_ldg_c4 and repacks the four lanes into a char4.
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
// Loads a short2 via __nvvm_ldg_s2 and repacks the two lanes into a short2.
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a short4 via __nvvm_ldg_s4 and repacks the four lanes into a short4.
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
// Loads an int2 via __nvvm_ldg_i2 and repacks the two lanes into an int2.
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads an int4 via __nvvm_ldg_i4 and repacks the four lanes into an int4.
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
// Loads a longlong2 via __nvvm_ldg_ll2 and repacks the two lanes into a
// longlong2.
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a uchar2 via __nvvm_ldg_uc2 and repacks the two lanes into a uchar2.
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a uchar4 via __nvvm_ldg_uc4 and repacks the four lanes into a uchar4.
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
// Loads a ushort2 via __nvvm_ldg_us2 and repacks the two lanes into a ushort2.
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a ushort4 via __nvvm_ldg_us4 and repacks the four lanes into a
// ushort4.
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
// Loads a uint2 via __nvvm_ldg_ui2 and repacks the two lanes into a uint2.
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a uint4 via __nvvm_ldg_ui4 and repacks the four lanes into a uint4.
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
// Loads a ulonglong2 via __nvvm_ldg_ull2 and repacks the two lanes into a
// ulonglong2.
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a float2 via __nvvm_ldg_f2 and repacks the two lanes into a float2.
inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
// Loads a float4 via __nvvm_ldg_f4 and repacks the four lanes into a float4.
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
// Loads a double2 via __nvvm_ldg_d2 and repacks the two lanes into a double2.
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
448 // TODO: Implement these as intrinsics, so the backend can work its magic on
449 // these. Alternatively, we could implement these as plain C and try to get
450 // llvm to recognize the relevant patterns.
// Emits PTX shf.l.wrap.b32 with operands (low32, high32, shiftWidth) and
// returns the instruction's 32-bit result.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
// Emits PTX shf.l.clamp.b32 with operands (low32, high32, shiftWidth) and
// returns the instruction's 32-bit result.
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
// Emits PTX shf.r.wrap.b32 with operands (low32, high32, shiftWidth) and
// returns the instruction's 32-bit result.
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
// Emits PTX shf.r.clamp.b32 with operands (low32, high32, shiftWidth) and
// returns the instruction's 32-bit result.
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
484 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
486 #if CUDA_VERSION >= 11000
// Casts the generic pointer to an address_space(1) pointer and returns it as
// an integer.
// NOTE(review): a line appears to be missing just before this definition in
// this copy of the file (possibly an `extern "C" {` opener covering the
// __nv_cvta_* helpers) -- confirm the intended linkage against the callers.
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
// Casts the generic pointer to an address_space(3) pointer and returns it as
// an integer.
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
// Casts the generic pointer to an address_space(4) pointer and returns it as
// an integer.
__device__ inline size_t
__nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
// Casts the generic pointer to an address_space(5) pointer and returns it as
// an integer.
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
// Interprets the integer as an address_space(1) pointer and casts it to a
// generic void pointer.
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
// Interprets the integer as an address_space(3) pointer and casts it to a
// generic void pointer.
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
// Interprets the integer as an address_space(4) pointer and casts it to a
// generic void pointer.
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
// Interprets the integer as an address_space(5) pointer and casts it to a
// generic void pointer.
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
// Returns __nv_cvta_generic_to_shared_impl(__ptr); the size_t result is
// implicitly converted to cuuint32_t.
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
517 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
// Returns __nvvm_redux_sync_add(__mask, __value).
__device__ inline unsigned __reduce_add_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_add(__mask, __value);
}
// Unsigned overload: returns __nvvm_redux_sync_umin(__mask, __value).
__device__ inline unsigned __reduce_min_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umin(__mask, __value);
}
// Unsigned overload: returns __nvvm_redux_sync_umax(__mask, __value).
__device__ inline unsigned __reduce_max_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umax(__mask, __value);
}
// Signed overload: returns __nvvm_redux_sync_min(__mask, __value).
__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_min(__mask, __value);
}
// Signed overload: returns __nvvm_redux_sync_max(__mask, __value).
__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_max(__mask, __value);
}
// Returns __nvvm_redux_sync_or(__mask, __value).
__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
  return __nvvm_redux_sync_or(__mask, __value);
}
// Returns __nvvm_redux_sync_and(__mask, __value).
__device__ inline unsigned __reduce_and_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_and(__mask, __value);
}
// Returns __nvvm_redux_sync_xor(__mask, __value).
__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_xor(__mask, __value);
}
// Calls __nvvm_cp_async_ca_shared_global_4 with __dst cast to an
// address_space(3) pointer, __src cast to a const address_space(1) pointer,
// and __src_size passed through.
__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_4(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
// Calls __nvvm_cp_async_ca_shared_global_8 with __dst cast to an
// address_space(3) pointer, __src cast to a const address_space(1) pointer,
// and __src_size passed through.
__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_8(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
// Calls __nvvm_cp_async_ca_shared_global_16 with __dst cast to an
// address_space(3) pointer, __src cast to a const address_space(1) pointer,
// and __src_size passed through.
__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
                                                          const void *__src,
                                                          unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_16(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
// Pass-through: returns __ptr (with const removed) and ignores __prop.
__device__ inline void *
__nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
  // TODO: it appears to provide compiler with some sort of a hint. We do not
  // know what exactly it is supposed to do. However, CUDA headers suggest that
  // just passing through __ptr should not affect correctness. They do so on
  // pre-sm80 GPUs where this builtin is not available.
  return (void *)__ptr;
}
578 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
580 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
// Returns __isShared(ptr).
__device__ inline unsigned __isCtaShared(const void *ptr) {
  return __isShared(ptr);
}
// Returns __nvvm_isspacep_shared_cluster(__ptr).
__device__ inline unsigned __isClusterShared(const void *__ptr) {
  return __nvvm_isspacep_shared_cluster(__ptr);
}
// Returns __nvvm_mapa(__ptr, __rank), with const cast away from __ptr.
// NOTE(review): the line declaring __rank is missing from this copy of the
// file; `unsigned` is assumed here -- confirm against the CUDA headers.
__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
                                                  unsigned __rank) {
  return __nvvm_mapa((void *)__ptr, __rank);
}
// Returns __nvvm_getctarank(__ptr), with const cast away from __ptr.
__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
  return __nvvm_getctarank((void *)__ptr);
}
// Packs the result of __cvta_generic_to_shared(__ptr), truncated to unsigned,
// together with __cluster_cta_mask into a uint2.
// NOTE(review): the second make_uint2 argument is missing from this copy of
// the file; __cluster_cta_mask is the only otherwise-unused parameter --
// confirm.
__device__ inline uint2
__cluster_map_shared_multicast(const void *__ptr,
                               unsigned int __cluster_cta_mask) {
  return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
                    __cluster_cta_mask);
}
// Returns __nvvm_is_explicit_cluster().
__device__ inline unsigned __clusterDimIsSpecified() {
  return __nvvm_is_explicit_cluster();
}
// Builds a dim3 from the cluster_nctaid.{x,y,z} special registers.
__device__ inline dim3 __clusterDim() {
  return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
              __nvvm_read_ptx_sreg_cluster_nctaid_y(),
              __nvvm_read_ptx_sreg_cluster_nctaid_z());
}
// Builds a dim3 from the cluster_ctaid.{x,y,z} special registers.
__device__ inline dim3 __clusterRelativeBlockIdx() {
  return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
              __nvvm_read_ptx_sreg_cluster_ctaid_y(),
              __nvvm_read_ptx_sreg_cluster_ctaid_z());
}
// Builds a dim3 from the nclusterid.{x,y,z} special registers.
__device__ inline dim3 __clusterGridDimInClusters() {
  return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
              __nvvm_read_ptx_sreg_nclusterid_y(),
              __nvvm_read_ptx_sreg_nclusterid_z());
}
// Builds a dim3 from the clusterid.{x,y,z} special registers.
__device__ inline dim3 __clusterIdx() {
  return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
              __nvvm_read_ptx_sreg_clusterid_y(),
              __nvvm_read_ptx_sreg_clusterid_z());
}
// Returns the cluster_ctarank special register.
__device__ inline unsigned __clusterRelativeBlockRank() {
  return __nvvm_read_ptx_sreg_cluster_ctarank();
}
// Returns the cluster_nctarank special register.
__device__ inline unsigned __clusterSizeInBlocks() {
  return __nvvm_read_ptx_sreg_cluster_nctarank();
}
// Calls __nvvm_barrier_cluster_arrive().
__device__ inline void __cluster_barrier_arrive() {
  __nvvm_barrier_cluster_arrive();
}
// Calls __nvvm_barrier_cluster_arrive_relaxed().
__device__ inline void __cluster_barrier_arrive_relaxed() {
  __nvvm_barrier_cluster_arrive_relaxed();
}
// Calls __nvvm_barrier_cluster_wait().
__device__ inline void __cluster_barrier_wait() {
  __nvvm_barrier_cluster_wait();
}
// Calls __nvvm_fence_sc_cluster().
__device__ inline void __threadfence_cluster() {
  __nvvm_fence_sc_cluster();
}
// Emits PTX atom.add.v2.f32 on the address in __ptr with operand
// {__val.x, __val.y} and returns the two values the instruction produces.
// The asm is volatile with a "memory" clobber because it updates memory at
// *__ptr: without them the compiler may delete the statement when the result
// is unused, or keep stale copies of *__ptr across it.
__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__ volatile("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
                   : "=f"(__ret.x), "=f"(__ret.y)
                   : "l"(__ptr), "f"(__val.x), "f"(__val.y)
                   : "memory");
  return __ret;
}
// Emits PTX atom.cta.add.v2.f32 on the address in __ptr with operand
// {__val.x, __val.y} and returns the two values the instruction produces.
// The asm is volatile with a "memory" clobber because it updates memory at
// *__ptr: without them the compiler may delete the statement when the result
// is unused, or keep stale copies of *__ptr across it.
__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__ volatile("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
                   : "=f"(__ret.x), "=f"(__ret.y)
                   : "l"(__ptr), "f"(__val.x), "f"(__val.y)
                   : "memory");
  return __ret;
}
// Emits PTX atom.sys.add.v2.f32 on the address in __ptr with operand
// {__val.x, __val.y} and returns the two values the instruction produces.
// The asm is volatile with a "memory" clobber because it updates memory at
// *__ptr: without them the compiler may delete the statement when the result
// is unused, or keep stale copies of *__ptr across it.
__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__ volatile("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
                   : "=f"(__ret.x), "=f"(__ret.y)
                   : "l"(__ptr), "f"(__val.x), "f"(__val.y)
                   : "memory");
  return __ret;
}
// Emits PTX atom.add.v4.f32 on the address in __ptr with operand
// {__val.x, __val.y, __val.z, __val.w} and returns the four values the
// instruction produces.
// The asm is volatile with a "memory" clobber because it updates memory at
// *__ptr: without them the compiler may delete the statement when the result
// is unused, or keep stale copies of *__ptr across it.
__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__ volatile(
      "atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
      : "memory");
  return __ret;
}
// Emits PTX atom.cta.add.v4.f32 on the address in __ptr with operand
// {__val.x, __val.y, __val.z, __val.w} and returns the four values the
// instruction produces.
// The asm is volatile with a "memory" clobber because it updates memory at
// *__ptr: without them the compiler may delete the statement when the result
// is unused, or keep stale copies of *__ptr across it.
__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__ volatile(
      "atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
      : "memory");
  return __ret;
}
// Emits PTX atom.sys.add.v4.f32 on the address in __ptr with operand
// {__val.x, __val.y, __val.z, __val.w} and returns the four values the
// instruction produces.
// The asm is volatile with a "memory" clobber because it updates memory at
// *__ptr: without them the compiler may delete the statement when the result
// is unused, or keep stale copies of *__ptr across it.
__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__ volatile(
      "atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w)
      : "memory");
  return __ret;
}
706 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
707 #endif // CUDA_VERSION >= 11000
709 #endif // defined(__CLANG_CUDA_INTRINSICS_H__)