/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 *
 * This header provides in-header implementations for NVCC's built-in
 * __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
 * built-in is unusual as it's actually a set of function overloads that use the
 * first string literal argument as one of the overload parameters.
 */
#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// __nv_tex_surf_handler() is provided by this header as a macro.
#define __nv_tex_surf_handler(__op, __ptr, ...)                                \
  ::__cuda_tex::__tex_fetch<                                                   \
      ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr,           \
                                                              __VA_ARGS__)
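
// Illustration only (not part of the upstream header): a call such as
//   __nv_tex_surf_handler("__tex2D_v2", &__ret, __obj, __x, __y);
// expands to
//   ::__cuda_tex::__tex_fetch<
//       ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash("__tex2D_v2")>>(
//       &__ret, __obj, __x, __y);
// i.e. the operation name is folded into a tag type at compile time, and the
// remaining arguments pick one of the __tex_fetch() overloads defined below.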

#pragma push_macro("__ASM_OUT")
#pragma push_macro("__ASM_OUTP")
#pragma push_macro("__Args")
#pragma push_macro("__ID")
#pragma push_macro("__IDV")
#pragma push_macro("__IMPL_2DGATHER")
#pragma push_macro("__IMPL_ALIAS")
#pragma push_macro("__IMPL_ALIASI")
#pragma push_macro("__IMPL_F1")
#pragma push_macro("__IMPL_F3")
#pragma push_macro("__IMPL_F3N")
#pragma push_macro("__IMPL_F3S")
#pragma push_macro("__IMPL_S")
#pragma push_macro("__IMPL_S3")
#pragma push_macro("__IMPL_S3I")
#pragma push_macro("__IMPL_S3N")
#pragma push_macro("__IMPL_S3NI")
#pragma push_macro("__IMPL_S3S")
#pragma push_macro("__IMPL_S3SI")
#pragma push_macro("__IMPL_SI")
#pragma push_macro("__L")
#pragma push_macro("__STRIP_PARENS")

// Put all functions into an anonymous namespace so they have internal linkage.
// The device-only functions here must be internal in order to avoid ODR
// violations in case they are used from files compiled with -fgpu-rdc.
// E.g. a library and an app using it may be built with different versions of
// this header file.
namespace {

// Put the implementation into its own namespace so we don't pollute the TU.
namespace __cuda_tex {

// First, we need a perfect hash function and a few constexpr helper functions
// for converting a string literal into a numeric value which can be used to
// parametrize a template. We cannot use the string literals themselves for
// that, as that would require C++20.
//
// The hash function was generated with 'gperf' and then manually converted
// into its constexpr equivalent.
//
// NOTE: the perfect hashing scheme comes with an inherent self-test. If the
// hash function has a collision for any of the texture operations, the
// compilation will fail due to an attempt to redefine a tag with the same
// value. If the header compiles, then the hash function is good enough for
// the job.
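
// Illustration only (not part of the upstream header): if two operation
// strings hashed to the same value, the __IMPL_* macros below would emit two
// explicit specializations of __tex_fetch_v4 for the same tag, e.g.
//   template <> struct __tex_fetch_v4<__Tag<42>> { ... };
//   template <> struct __tex_fetch_v4<__Tag<42>> { ... }; // error: redefinition
// (42 is a made-up value), so any collision is caught at compile time.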

constexpr int __tex_len(const char *s) {
  return (s[0] == 0) ? 0

constexpr int __tex_hash_map(int c) {
  return (c == 49) ? 10

constexpr int __tex_op_hash(const char *str) {
  return __tex_len(str) + __tex_hash_map(str[7] + 1) + __tex_hash_map(str[6]) +
         __tex_hash_map(str[5]) + __tex_hash_map(str[__tex_len(str) - 1]);
}
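
// Illustration only (not part of the upstream header): for an op name such as
// "__tex2D_v2" (length 10), the hash adds the string length to the mapped
// values of a few fixed probe characters:
//   __tex_op_hash("__tex2D_v2") == 10 + __tex_hash_map('_' + 1) +
//                                  __tex_hash_map('D') + __tex_hash_map('2') +
//                                  __tex_hash_map('2')
// A handful of fixed probes is enough to tell all supported op names apart.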

// Tag type to identify a particular texture operation.
template <int N> struct __Tag;
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of a particular operation. E.g. tex2Dgather can translate
// into 4 different instructions.
#define __IDV(__op, __variant)                                                 \
  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>

// Helper classes for figuring out key data types for derived types.
// E.g. char2 has __base_t = char, __fetch_t = int4.
template <class> struct __TypeInfoT;
// Type info for the fundamental types.
template <> struct __TypeInfoT<float> {
  using __base_t = float;
  using __fetch_t = float4;
};
template <> struct __TypeInfoT<char> {
  using __base_t = char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<signed char> {
  using __base_t = signed char;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned char> {
  using __base_t = unsigned char;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<short> {
  using __base_t = short;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned short> {
  using __base_t = unsigned short;
  using __fetch_t = uint4;
};
template <> struct __TypeInfoT<int> {
  using __base_t = int;
  using __fetch_t = int4;
};
template <> struct __TypeInfoT<unsigned int> {
  using __base_t = unsigned int;
  using __fetch_t = uint4;
};

// Derived base/fetch types for N-element vectors.
template <class __T> struct __TypeInfoT {
  using __base_t = decltype(__T::x);
  using __fetch_t = typename __TypeInfoT<__base_t>::__fetch_t;
};
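
// Illustration only (not part of the upstream header): for ushort2,
// decltype(ushort2::x) is unsigned short, so __base_t is unsigned short and
// __fetch_t resolves to __TypeInfoT<unsigned short>::__fetch_t, i.e. uint4 --
// texture fetches always produce a 4-element vector of 32-bit values.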

// Classes that implement specific texture ops.
template <class __op> struct __tex_fetch_v4;

// Helper macros to strip parens from a macro argument.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)
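
// Illustration only (not part of the upstream header): __L strips the outer
// parens from a parenthesized argument list, e.g.
//   __L((float __x, float __y))  ->  float __x, float __y
// which lets a single macro parameter carry a comma-separated list.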

// Construct inline assembly output args.
// Results are stored in a temp var __r.
// isResident bool is pointed to by __ir.
// Asm args for return values. It's a 4-element vector.
#define __ASM_OUT(__t)                                                         \
  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// .. possibly combined with a predicate.
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
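
// Illustration only (not part of the upstream header): __ASM_OUT("f") expands
// to
//   ("=f"(__r.x), "=f"(__r.y), "=f"(__r.z), "=f"(__r.w))
// and __ASM_OUTP("f") appends "=h"(*__ir), so the same asm statement can also
// report the sparse-fetch 'is resident' flag.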

// Implements a single variant of texture fetch instruction.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args)        \
  template <>                                                                  \
  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) {        \
    __rt __r;                                                                  \
    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args));             \
    return __r;                                                                \
  }

// Implements texture fetch instructions for int4/uint4/float4 data types.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)        \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUT("f"),       \
            __asm_args)

// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Similar to above, but returns a boolean 'isPresent' value in addition
// to the texture data.
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(int4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args,   \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(uint4, uint4, __args, __asm_op ".u32." __ctype "\t" __asm_op_args, \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args, __ASM_OUTP("f"),      \
            __asm_args)

// Similar to F3, but for integer data which is returned as normalized floats.
// Only instantiates fetch functions for int4/uint4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(float4, int4, __args, __asm_op ".s32." __ctype "\t" __asm_op_args, \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, uint4, __args,                                             \
            __asm_op ".u32." __ctype "\t" __asm_op_args, __ASM_OUT("r"),       \
            __asm_args)

// Instantiates __tex_fetch_v4 with regular fetch functions.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)            \
  }

// Same, but for sparse ops. Only available on sm_60+.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static T __run(cudaTextureObject_t __obj, __L(__args));         \
    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif

// Same, but for normalized float ops.
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class T>                                                         \
    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args));    \
    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }

// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants -- normal for __op and normalized for __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,       \
                  __asm_args)                                                  \
  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args);      \
  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)

// Convenience macros which convert a string literal __op into a __Tag.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)  \
  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,        \
                 __asm_args)                                                   \
  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
            __asm_args)

// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. So, we just inherit from the base implementation instead of
// defining a copy.
#define __IMPL_ALIASI(__op, __opn)                                             \
  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
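
// Illustration only (not part of the upstream header): an alias such as
//   __IMPL_ALIAS("__itex1D", "__tex1D_v2");
// simply makes the specialization for the "__itex1D" tag inherit all of the
// __run overloads already generated for "__tex1D_v2".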

// Now we can instantiate everything we need for each specific texture fetch
// variant.
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2", (float __x), "tex.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}];", ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2", (int __x), "tex.1d.v4",
         "s32", "{%0, %1, %2, %3}, [%4, {%5}];", ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");

__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
         (float __x, float __dPdx, float __dPdy), "tex.grad.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");

__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
         (float __x, int __layer), "tex.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");

__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
         (float __x, int __layer, float __dPdx, float __dPdy),
         "tex.grad.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");

__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
         (float __x, int __layer, float __level), "tex.level.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");

__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2", (float __x, float __level),
         "tex.level.1d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5}], %6;",
         ("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");

__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");

__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
           " selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y)));

__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
         "tex.grad.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
          "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");

__IMPL_S3S("__itex2DGrad_sparse",
           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
            "f"(__dPdy->y)));

__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");

__IMPL_S3S("__itex2DLayered_sparse",
           (float __x, float __y, int __layer, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y)));

__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
         (float __x, float __y, int __layer, const float2 *__dPdx,
          const float2 *__dPdy),
         "tex.grad.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");

__IMPL_S3S(
    "__itex2DLayeredGrad_sparse",
    (float __x, float __y, int __layer, const float2 *__dPdx,
     const float2 *__dPdy, unsigned char *__ir),
    "{.reg .pred %%p0;\n\t"
    "tex.grad.a2d.v4",
    "f32",
    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
    "selp.u16 %4, 1, 0, %%p0; }",
    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
     "f"(__dPdy->x), "f"(__dPdy->y)));

__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");

__IMPL_S3S("__itex2DLayeredLod_sparse",
           (float __x, float __y, int __layer, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));

__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");

__IMPL_S3S("__itex2DLod_sparse",
           (float __x, float __y, float __level, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__level)));

// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
#define __IMPL_2DGATHER(variant, instr)                                        \
  __IMPL_SI(__IDV("__tex2Dgather_v2", variant),                                \
            __IDV("__tex2Dgather_rmnf_v2", variant),                           \
            (float __x, float __y, int __comp), instr, "f32",                  \
            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));        \
  __IMPL_ALIASI(__IDV("__itex2Dgather", variant),                              \
                __IDV("__tex2Dgather_v2", variant));                           \
  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant),                         \
              (float __x, float __y, unsigned char *__ir, int __comp),         \
              "{.reg .pred %%p0;\n\t" instr, "f32",                            \
              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"                     \
              "selp.u16 %4, 1, 0, %%p0; }",                                    \
              ("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");
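
// Illustration only (not part of the upstream header): each instantiation
// above produces variant tags rather than the plain op tag, e.g.
// __IMPL_2DGATHER(1, "tld4.g.2d.v4") specializes __tex_fetch_v4 for
// __IDV("__tex2Dgather_v2", 1). The umbrella dispatchers below map a runtime
// __comp value of 1 onto exactly that specialization.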

// Umbrella dispatcher -- calls into the specific 2Dgather variant.
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");

template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
  template <class __T>
  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
                                 float __y, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
  }
};

#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              unsigned char *__ir, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    }
  }
};
#endif // __CUDA_ARCH__

__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");

__IMPL_S3S("__itex3D_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.3d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");

__IMPL_S3S("__itex3DGrad_sparse",
           (float __x, float __y, float __z, const float4 *__dPdx,
            const float4 *__dPdy, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));

__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");

__IMPL_S3S("__itex3DLod_sparse",
           (float __x, float __y, float __z, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));

__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
         (float __x, float __y, float __z), "tex.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");

__IMPL_S3S("__itexCubemap_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.cube.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");

__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");

__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
          "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");

__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
         (float __x, float __y, float __z, int __layer, float __level),
         "tex.level.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");

__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");

// Helper class for extracting a slice of data from the V4 fetch results.
template <class __DestT, class __SrcT> struct __convert {
  template <int __NElements = sizeof(__DestT) /
                              sizeof(typename __TypeInfoT<__DestT>::__base_t)>
  __device__ static __DestT __run(__SrcT __v);
  template <> __device__ static __DestT __run<1>(__SrcT __v) { return {__v.x}; }
  template <> __device__ static __DestT __run<2>(__SrcT __v) {
    return {__v.x, __v.y};
  }
  template <> __device__ static __DestT __run<3>(__SrcT __v) {
    return {__v.x, __v.y, __v.z};
  }
  template <> __device__ static __DestT __run<4>(__SrcT __v) {
    return {__v.x, __v.y, __v.z, __v.w};
  }
};
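
// Illustration only (not part of the upstream header): __convert<float2, float4>
// defaults __NElements to sizeof(float2) / sizeof(float) == 2, so it picks the
// __run<2> specialization and returns {__v.x, __v.y} -- the first two lanes of
// the 4-element fetch result.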

// These are the top-level function overloads the __nv_tex_surf_handler expands
// to. Each overload deals with one of the several ways __nv_tex_surf_handler
// is called by CUDA headers. In the end, each of the overloads does the same
// job -- it figures out which `__tex_fetch_v4::__run` variant should be used
// to fetch texture data and which `__convert::__run` is needed to convert it
// into the appropriate return type.

// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
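
// Illustration only (not part of the upstream header): with __T = float2 this
// overload computes __FetchT = float4, runs the tag's float4 fetch, and then
// __convert<float2, float4>::__run() narrows the result to the two requested
// lanes before storing it through __ptr.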

#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on the C++ level. So,
// we cheat a bit and use inline assembly to do it. It costs us an extra
// register and a move, but that is easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __obj;
  asm("mov.b64 %0, %1; " : "=l"(__obj) : "l"(__handle));
  return __obj;
}

// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Data type and return type are based on ret.
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
                                   __Args... __args) {
  using __FetchT = typename __TypeInfoT<__T>::__fetch_t;
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType, the fetch return type is based on type_dummy.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
#endif // CUDA_VERSION < 12000

} // namespace __cuda_tex
} // namespace

#pragma pop_macro("__ASM_OUT")
#pragma pop_macro("__ASM_OUTP")
#pragma pop_macro("__Args")
#pragma pop_macro("__ID")
#pragma pop_macro("__IDV")
#pragma pop_macro("__IMPL_2DGATHER")
#pragma pop_macro("__IMPL_ALIAS")
#pragma pop_macro("__IMPL_ALIASI")
#pragma pop_macro("__IMPL_F1")
#pragma pop_macro("__IMPL_F3")
#pragma pop_macro("__IMPL_F3N")
#pragma pop_macro("__IMPL_F3S")
#pragma pop_macro("__IMPL_S")
#pragma pop_macro("__IMPL_S3")
#pragma pop_macro("__IMPL_S3I")
#pragma pop_macro("__IMPL_S3N")
#pragma pop_macro("__IMPL_S3NI")
#pragma pop_macro("__IMPL_S3S")
#pragma pop_macro("__IMPL_S3SI")
#pragma pop_macro("__IMPL_SI")
#pragma pop_macro("__L")
#pragma pop_macro("__STRIP_PARENS")

#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__