// libclc (clspv target) — vstore_half / vstorea_half implementations with
// explicit rounding-mode variants.
1 #include <clc/clc.h>
3 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
5 #define ROUND_VEC1(out, in, ROUNDF) out = ROUNDF(in);
6 #define ROUND_VEC2(out, in, ROUNDF) \
7 ROUND_VEC1(out.lo, in.lo, ROUNDF); \
8 ROUND_VEC1(out.hi, in.hi, ROUNDF);
9 #define ROUND_VEC3(out, in, ROUNDF) \
10 ROUND_VEC1(out.s0, in.s0, ROUNDF); \
11 ROUND_VEC1(out.s1, in.s1, ROUNDF); \
12 ROUND_VEC1(out.s2, in.s2, ROUNDF);
13 #define ROUND_VEC4(out, in, ROUNDF) \
14 ROUND_VEC2(out.lo, in.lo, ROUNDF); \
15 ROUND_VEC2(out.hi, in.hi, ROUNDF);
16 #define ROUND_VEC8(out, in, ROUNDF) \
17 ROUND_VEC4(out.lo, in.lo, ROUNDF); \
18 ROUND_VEC4(out.hi, in.hi, ROUNDF);
19 #define ROUND_VEC16(out, in, ROUNDF) \
20 ROUND_VEC8(out.lo, in.lo, ROUNDF); \
21 ROUND_VEC8(out.hi, in.hi, ROUNDF);
/* Defines the vstore_half##SUFFIX and vstorea_half##SUFFIX overloads for one
 * TYPE / VEC_SIZE / address-space combination.  Each overload first rounds
 * the float vector elementwise with ROUNDF, then forwards to the underlying
 * vstore(a)_half_##VEC_SIZE store routine declared here (and implemented by
 * the clspv runtime).  The gitweb extraction had dropped the two closing
 * braces of the function bodies; they are restored below. */
#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, ROUNDF)                             \
  void _CLC_OVERLOAD vstore_half_##VEC_SIZE(TYPE, size_t, AS half *);          \
  _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset,     \
                                                  AS half *mem) {              \
    TYPE rounded_vec;                                                          \
    ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF);                             \
    vstore_half_##VEC_SIZE(rounded_vec, offset, mem);                          \
  }                                                                            \
  void _CLC_OVERLOAD vstorea_half_##VEC_SIZE(TYPE, size_t, AS half *);         \
  _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset,    \
                                                   AS half *mem) {             \
    TYPE rounded_vec;                                                          \
    ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF);                             \
    vstorea_half_##VEC_SIZE(rounded_vec, offset, mem);                         \
  }
39 _CLC_DEF _CLC_OVERLOAD float __clc_rtz(float x) {
40 /* Handle nan corner case */
41 if (isnan(x))
42 return x;
43 /* RTZ does not produce Inf for large numbers */
44 if (fabs(x) > 65504.0f && !isinf(x))
45 return copysign(65504.0f, x);
47 const int exp = (as_uint(x) >> 23 & 0xff) - 127;
48 /* Manage range rounded to +- zero explicitely */
49 if (exp < -24)
50 return copysign(0.0f, x);
52 /* Remove lower 13 bits to make sure the number is rounded down */
53 int mask = 0xffffe000;
54 /* Denormals cannot be flushed, and they use different bit for rounding */
55 if (exp < -14)
56 mask <<= min(-(exp + 14), 10);
58 return as_float(as_uint(x) & mask);
61 _CLC_DEF _CLC_OVERLOAD float __clc_rti(float x) {
62 /* Handle nan corner case */
63 if (isnan(x))
64 return x;
66 const float inf = copysign(INFINITY, x);
67 uint ux = as_uint(x);
69 /* Manage +- infinity explicitely */
70 if (as_float(ux & 0x7fffffff) > 0x1.ffcp+15f) {
71 return inf;
73 /* Manage +- zero explicitely */
74 if ((ux & 0x7fffffff) == 0) {
75 return copysign(0.0f, x);
78 const int exp = (as_uint(x) >> 23 & 0xff) - 127;
79 /* Manage range rounded to smallest half denormal explicitely */
80 if (exp < -24) {
81 return copysign(0x1.0p-24f, x);
84 /* Set lower 13 bits */
85 int mask = (1 << 13) - 1;
86 /* Denormals cannot be flushed, and they use different bit for rounding */
87 if (exp < -14) {
88 mask = (1 << (13 + min(-(exp + 14), 10))) - 1;
91 const float next = nextafter(as_float(ux | mask), inf);
92 return ((ux & mask) == 0) ? as_float(ux) : next;
94 _CLC_DEF _CLC_OVERLOAD float __clc_rtn(float x) {
95 return ((as_uint(x) & 0x80000000) == 0) ? __clc_rtz(x) : __clc_rti(x);
97 _CLC_DEF _CLC_OVERLOAD float __clc_rtp(float x) {
98 return ((as_uint(x) & 0x80000000) == 0) ? __clc_rti(x) : __clc_rtz(x);
100 _CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) {
101 /* Mantisa + implicit bit */
102 const uint mantissa = (as_uint(x) & 0x7fffff) | (1u << 23);
103 const int exp = (as_uint(x) >> 23 & 0xff) - 127;
104 int shift = 13;
105 if (exp < -14) {
106 /* The default assumes lower 13 bits are rounded,
107 * but it might be more for denormals.
108 * Shifting beyond last == 0b, and qr == 00b is not necessary */
109 shift += min(-(exp + 14), 15);
111 int mask = (1 << shift) - 1;
112 const uint grs = mantissa & mask;
113 const uint last = mantissa & (1 << shift);
114 /* IEEE round up rule is: grs > 101b or grs == 100b and last == 1.
115 * exp > 15 should round to inf. */
116 bool roundup = (grs > (1 << (shift - 1))) ||
117 (grs == (1 << (shift - 1)) && last != 0) || (exp > 15);
118 return roundup ? __clc_rti(x) : __clc_rtz(x);
/* Instantiate the default overload (which uses round-to-nearest-even, per
 * the OpenCL spec) plus the four explicitly suffixed rounding-mode variants
 * (_rtz/_rtn/_rtp/_rte) for one type/width/address-space combination. */
#define __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS)                                    \
  __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, __clc_rte)                                \
  __FUNC(SUFFIX##_rtz, VEC_SIZE, TYPE, AS, __clc_rtz)                          \
  __FUNC(SUFFIX##_rtn, VEC_SIZE, TYPE, AS, __clc_rtn)                          \
  __FUNC(SUFFIX##_rtp, VEC_SIZE, TYPE, AS, __clc_rtp)                          \
  __FUNC(SUFFIX##_rte, VEC_SIZE, TYPE, AS, __clc_rte)

/* Indirection so the gentype machinery's FUNC invocations expand through
 * __XFUNC with their arguments fully macro-expanded first. */
#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS)
130 #define __CLC_BODY "vstore_half.inc"
131 #include <clc/math/gentype.inc>
132 #undef __CLC_BODY
133 #undef FUNC
134 #undef __XFUNC
135 #undef __FUNC