/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 *
 * This header provides in-header implementations for NVCC's built-in
 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
 * built-in is unusual as it's actually a set of function overloads that use the
 * first string literal argument as one of the overload parameters.
 */
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// __nv_tex_surf_handler() is provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...)                                \
  ::__cuda_tex::__tex_fetch<                                                   \
      ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr,           \
                                                              __VA_ARGS__)
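
// Illustration only (not part of the upstream header): a call such as
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// expands to
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);
// i.e. the operation name is folded into a tag type at compile time, and the
// remaining arguments pick one of the __tex_fetch() overloads defined below.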

#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")

// Put all functions into an anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations in case they are used from files compiled with -fgpu-rdc.
// E.g. a library and an app using it may be built with different versions of
// this header file.
namespace {

// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {

// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We cannot use the string literals themselves for
// that, as that would require C++20.
//
// The hash function was generated with 'gperf' and then manually converted
// into its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the
// hash function has a collision for any of the texture operations, the
// compilation will fail due to an attempt to redefine a tag with the same
// value. If the header compiles, then the hash function is good enough for
// the job.
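
// Illustration only (not part of the upstream header): if two operation
// strings hashed to the same value, the __IMPL_* macros below would emit two
// explicit specializations of __tex_fetch_v4 for the same tag, e.g.
//   template <> struct __tex_fetch_v4<__Tag<42>> { ... };
//   template <> struct __tex_fetch_v4<__Tag<42>> { ... }; // error: redefinition
// (42 is a made-up value), so any collision is caught at compile time.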

constexpr int __tex_len(const char *s) {
  return (s[0] == 0) ? 0

constexpr int __tex_hash_map(int c) {
  return (c == 49) ? 10

constexpr int __tex_op_hash(const char *str) {
  return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
         __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
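
// Illustration only (not part of the upstream header): for an op name such as
// "__tex2D_v2" (length 10), the hash adds the string length to the mapped
// values of a few fixed probe characters:
//   __tex_op_hash("__tex2D_v2") == 10 + __tex_hash_map('_' + 1) +
//                                  __tex_hash_map('D') + __tex_hash_map('2') +
//                                  __tex_hash_map('2')
// A handful of fixed probes is enough to tell all supported op names apart.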

// Tag type to identify a particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of a particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant)                                                 \
  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>

// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4.
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
  using __base_t = float;
  using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
  using __base_t = char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
  using __base_t = signed char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
  using __base_t = unsigned char;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
  using __base_t = short;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
  using __base_t = unsigned short;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
  using __base_t = int;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
  using __base_t = unsigned int;
  using __fetch_t = uint4;
};

// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
  using __base_t = decltype(__T::x);
  using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};
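
// Illustration only (not part of the upstream header): for ushort2,
// decltype(ushort2::x) is unsigned short, so __base_t is unsigned short and
// __fetch_t resolves to __TypeInfoT<unsigned short>::__fetch_t, i.e. uint4 --
// texture fetches always produce a 4-element vector of 32-bit values.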

// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;

// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
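
// Illustration only (not part of the upstream header): __L strips the outer
// parens from a parenthesized argument list, e.g.
//   __L((float __x, float __y))  ->  float __x, float __y
// which lets a single macro parameter carry a comma-separated list.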

// Construct inline assembly output args.
// Results are stored in a temp var __r.
// isResident bool is pointed to by __ir.
// Asm args for return values. It's a 4-element vector.
#define __ASM_OUT(__t)                                                         \
  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
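
// Illustration only (not part of the upstream header): __ASM_OUT("f") expands
// to
//   ("=f"(__r.x), "=f"(__r.y), "=f"(__r.z), "=f"(__r.w))
// and __ASM_OUTP("f") appends "=h"(*__ir), so the same asm statement can also
// report the sparse-fetch 'is resident' flag.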

// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args)        \
  template <>                                                                  \
  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) {        \
    __rt __r;                                                                  \
    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args));             \
    return __r;                                                                \
  }

// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)        \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"),       \
            __asm_args)

// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to above, but returns a boolean 'isPresent' value in addition
// to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"),      \
            __asm_args)

// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, uint4, __args,                                             \
            __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"),       \
            __asm_args)

// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)            \
  }

// Same, but for sparse ops. Only available on sm_60+.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif

// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args));    \
    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }

// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,       \
                  __asm_args)                                                  \
  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args);      \
  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)

// Convenience macros which convert a string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)  \
  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,        \
                 __asm_args)                                                   \
  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
            __asm_args)

// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit from the base implementation instead of
// defining a copy.
#define __IMPL_ALIASI(__op, __opn)                                             \
  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
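
// Illustration only (not part of the upstream header): an alias such as
//   __IMPL_ALIAS("__itex1D", "__tex1D_v2");
// simply makes the specialization for the "__itex1D" tag inherit all of the
// __run overloads already generated for "__tex1D_v2".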

// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
         "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");

__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
         (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");

__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
         (float __x, int __layer), "tex.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");

__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
         (float __x, int __layer, float __dPdx, float __dPdy),
         "tex.grad.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");

__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
         (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");

__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
         "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
         ("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");

__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");

__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
           " selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y)));

__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
         "tex.grad.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
          "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");

__IMPL_S3S("__itex2DGrad_sparse",
           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
            "f"(__dPdy->y)));

__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");

__IMPL_S3S("__itex2DLayered_sparse",
           (float __x, float __y, int __layer, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y)));

__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
         (float __x, float __y, int __layer, const float2 *__dPdx,
          const float2 *__dPdy),
         "tex.grad.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");

__IMPL_S3S(
    "__itex2DLayeredGrad_sparse",
    (float __x, float __y, int __layer, const float2 *__dPdx,
     const float2 *__dPdy, unsigned char *__ir),
    "{.reg .pred %%p0;\n\t"
    "tex.grad.a2d.v4",
    "f32",
    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
    "selp.u16 %4, 1, 0, %%p0; }",
    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
     "f"(__dPdy->x), "f"(__dPdy->y)));

__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");

__IMPL_S3S("__itex2DLayeredLod_sparse",
           (float __x, float __y, int __layer, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));

__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");

__IMPL_S3S("__itex2DLod_sparse",
           (float __x, float __y, float __level, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__level)));

// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr)                                        \
  __IMPL_SI(__IDV("__tex2Dgather_v2", variant),                                \
            __IDV("__tex2Dgather_rmnf_v2", variant),                           \
            (float __x, float __y, int __comp), instr, "f32",                  \
            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));        \
  __IMPL_ALIASI(__IDV("__itex2Dgather", variant),                              \
                __IDV("__tex2Dgather_v2", variant));                           \
  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant),                         \
              (float __x, float __y, unsigned char *__ir, int __comp),         \
              "{.reg .pred %%p0;\n\t" instr, "f32",                            \
              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"                     \
              "selp.u16 %4, 1, 0, %%p0; }",                                    \
              ("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");
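
// Illustration only (not part of the upstream header): each instantiation
// above produces variant tags rather than the plain op tag, e.g.
// __IMPL_2DGATHER(1, "tld4.g.2d.v4") specializes __tex_fetch_v4 for
// __IDV("__tex2Dgather_v2", 1). The umbrella dispatchers below map a runtime
// __comp value of 1 onto exactly that specialization.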

// Umbrella dispatcher -- calls into the specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");

template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
  template <class __T>
  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
                                 float __y, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              unsigned char *__ir, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    }
  }
};
#endif // __CUDA_ARCH__

__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");

__IMPL_S3S("__itex3D_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.3d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");

__IMPL_S3S("__itex3DGrad_sparse",
           (float __x, float __y, float __z, const float4 *__dPdx,
            const float4 *__dPdy, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));

__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");

__IMPL_S3S("__itex3DLod_sparse",
           (float __x, float __y, float __z, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));

__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
         (float __x, float __y, float __z), "tex.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");

__IMPL_S3S("__itexCubemap_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.cube.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");

__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");

__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
          "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");

__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
         (float __x, float __y, float __z, int __layer, float __level),
         "tex.level.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");

__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");

// Helper class for extracting a slice of data from the V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
  template <int __NElements = sizeof(__DestT) /
                              sizeof(typename __TypeInfoT<__DestT>::__base_t)>
  __device__ static __DestT __run(__SrcT __v);
  template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
  template <> __device__ static __DestT __run<2>(__SrcT __v) {
    return {__v.x, __v.y};
  }
  template <> __device__ static __DestT __run<3>(__SrcT __v) {
    return {__v.x, __v.y, __v.z};
  }
  template <> __device__ static __DestT __run<4>(__SrcT __v) {
    return {__v.x, __v.y, __v.z, __v.w};
  }
};
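
// Illustration only (not part of the upstream header): __convert<float2, float4>
// defaults __NElements to sizeof(float2) / sizeof(float) == 2, so it picks the
// __run<2> specialization and returns {__v.x, __v.y} -- the first two lanes of
// the 4-element fetch result.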

// These are the top-level function overloads the __nv_tex_surf_handler expands
// to. Each overload deals with one of the several ways __nv_tex_surf_handler
// is called by CUDA headers. In the end, each of the overloads does the same
// job -- it figures out which `__tex_fetch_v4::__run` variant should be used
// to fetch texture data and which `__convert::__run` is needed to convert it
// into the appropriate return type.

// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
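
// Illustration only (not part of the upstream header): with __T = float2 this
// overload computes __FetchT = float4, runs the tag's float4 fetch, and then
// __convert<float2, float4>::__run() narrows the result to the two requested
// lanes before storing it through __ptr.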

#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on the C++ level. So,
// we cheat a bit and use inline assembly to do it. It costs us an extra
// register and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __obj;
  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
  return __obj;
}

// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType, the fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
#endif // CUDA_VERSION < 12000

} // namespace __cuda_tex
} // namespace

#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")

#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__