3 #pragma OPENCL EXTENSION cl_khr_byte_addressable_store
: enable
/*
 * Element-wise rounding helpers.
 *
 * ROUND_VECn(out, in, ROUNDF) applies ROUNDF to each of the n components of
 * `in` and stores the results in the matching components of `out`.
 *
 *   out    - lvalue receiving the rounded components
 *   in     - value whose components are rounded
 *   ROUNDF - function (or function-like macro) taking one component and
 *            returning the rounded component
 *
 * The wide variants recurse through the .lo/.hi halves; the 3-component
 * variant addresses .s0/.s1/.s2 directly.
 *
 * Every macro expands to a single `do { ... } while (0)` statement so it is
 * safe inside an unbraced if/else. Callers must supply the trailing
 * semicolon themselves.
 */
#define ROUND_VEC1(out, in, ROUNDF) \
  do { \
    (out) = ROUNDF(in); \
  } while (0)

#define ROUND_VEC2(out, in, ROUNDF) \
  do { \
    ROUND_VEC1((out).lo, (in).lo, ROUNDF); \
    ROUND_VEC1((out).hi, (in).hi, ROUNDF); \
  } while (0)

#define ROUND_VEC3(out, in, ROUNDF) \
  do { \
    ROUND_VEC1((out).s0, (in).s0, ROUNDF); \
    ROUND_VEC1((out).s1, (in).s1, ROUNDF); \
    ROUND_VEC1((out).s2, (in).s2, ROUNDF); \
  } while (0)

#define ROUND_VEC4(out, in, ROUNDF) \
  do { \
    ROUND_VEC2((out).lo, (in).lo, ROUNDF); \
    ROUND_VEC2((out).hi, (in).hi, ROUNDF); \
  } while (0)

#define ROUND_VEC8(out, in, ROUNDF) \
  do { \
    ROUND_VEC4((out).lo, (in).lo, ROUNDF); \
    ROUND_VEC4((out).hi, (in).hi, ROUNDF); \
  } while (0)

#define ROUND_VEC16(out, in, ROUNDF) \
  do { \
    ROUND_VEC8((out).lo, (in).lo, ROUNDF); \
    ROUND_VEC8((out).hi, (in).hi, ROUNDF); \
  } while (0)
/*
 * __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, ROUNDF)
 *
 * For one (vector width, address space, rounding function) combination this
 * macro:
 *   - declares the overloads vstore_half_<VEC_SIZE> and
 *     vstorea_half_<VEC_SIZE>, each taking (TYPE, size_t, AS half *);
 *   - defines vstore_half<SUFFIX> and vstorea_half<SUFFIX>, which round `vec`
 *     component-wise into `rounded_vec` via ROUND_VEC<VEC_SIZE> with ROUNDF and
 *     then forward (rounded_vec, offset, mem) to the matching declared overload.
 *
 * NOTE(review): `mem` and `rounded_vec` are used below but their declarations,
 * and the closing braces of both function bodies, are not present in this
 * listing (the embedded numbering skips 26-27, 30, 33-34 and 37). Presumably
 * they are `AS half *mem` and `TYPE rounded_vec;` -- confirm against the
 * complete file.
 */
23 #define __FUNC
(SUFFIX, VEC_SIZE
, TYPE
, AS
, ROUNDF
) \
24 void _CLC_OVERLOAD vstore_half_
##VEC_SIZE
(TYPE, size_t
, AS half
*); \
25 _CLC_OVERLOAD _CLC_DEF void vstore_half
##SUFFIX
(TYPE vec
, size_t offset
, \
28 ROUND_VEC
##VEC_SIZE
(rounded_vec, vec
, ROUNDF
); \
29 vstore_half_
##VEC_SIZE
(rounded_vec, offset
, mem
); \
31 void _CLC_OVERLOAD vstorea_half_
##VEC_SIZE
(TYPE, size_t
, AS half
*); \
32 _CLC_OVERLOAD _CLC_DEF void vstorea_half
##SUFFIX
(TYPE vec
, size_t offset
, \
35 ROUND_VEC
##VEC_SIZE
(rounded_vec, vec
, ROUNDF
); \
36 vstorea_half_
##VEC_SIZE
(rounded_vec, offset
, mem
); \
/*
 * __clc_rtz: pre-round a float toward zero so that the result has no bits
 * below half precision.
 *
 * Visible behaviour:
 *   - finite inputs with |x| > 65504.0f are clamped to +-65504.0f instead of
 *     becoming infinity;
 *   - otherwise the low mantissa bits are cleared with a mask, which is widened
 *     for small exponents.
 *
 * NOTE(review): the NaN check and the conditions guarding the
 * `return copysign(0.0f, x)` and `mask <<= ...` statements are not present in
 * this listing (the embedded numbering skips 41-42, 49 and 55); confirm
 * against the complete file.
 */
39 _CLC_DEF _CLC_OVERLOAD float __clc_rtz
(float x
) {
40 /* Handle nan corner case
*/
43 /* RTZ does not produce Inf for large numbers
*/
44 if
(fabs(x) > 65504.0f
&& !isinf
(x))
45 return copysign
(65504.0f
, x
);
/* Unbiased exponent taken from bits 23..30 of the float encoding. */
47 const int exp
= (as_uint(x) >> 23 & 0xff) -
127;
48 /* Manage range rounded to
+- zero explicitly
*/
50 return copysign
(0.0f
, x
);
52 /* Remove lower
13 bits to make sure the number is rounded down
*/
/* NOTE(review): mask is a signed int holding a pattern with the top bit set. */
53 int mask
= 0xffffe000;
54 /* Denormals cannot be flushed
, and they use different bit for rounding
*/
/* Clears up to 10 further low bits, depending on how far exp is below -14. */
56 mask
<<= min
(-(exp + 14), 10);
58 return as_float
(as_uint(x) & mask
);
/*
 * __clc_rti: pre-round a float away from zero (toward the infinity that has
 * the sign of x) so that the result has no bits below half precision.
 *
 * Visible behaviour:
 *   - a magnitude of exactly zero returns a zero with the sign of x;
 *   - if the bits covered by `mask` are already zero, x is returned unchanged;
 *   - otherwise all masked bits are set and nextafter() steps once toward
 *     `inf`, giving the next value whose masked bits are zero.
 *
 * NOTE(review): the declaration of `ux`, the NaN handling, the body of the
 * `> 0x1.ffcp+15f` branch and the conditions guarding the denormal paths are
 * not present in this listing (the embedded numbering skips 63-65, 67-68,
 * 71-72, 76-77, 80, 82-83, 87 and 89-90). `ux` is presumably as_uint(x) --
 * confirm against the complete file.
 */
61 _CLC_DEF _CLC_OVERLOAD float __clc_rti
(float x
) {
62 /* Handle nan corner case
*/
/* Infinity carrying the sign of x; also the direction passed to nextafter(). */
66 const float inf
= copysign
(INFINITY, x
);
69 /* Manage
+- infinity explicitly
*/
/* 0x1.ffcp+15f equals 65504.0f; the comparison is on |x| (sign bit masked). */
70 if
(as_float(ux & 0x7fffffff) > 0x1.ffcp
+15f
) {
73 /* Manage
+- zero explicitly
*/
74 if
((ux & 0x7fffffff) == 0) {
75 return copysign
(0.0f
, x
);
/* Unbiased exponent taken from bits 23..30 of the float encoding. */
78 const int exp
= (as_uint(x) >> 23 & 0xff) -
127;
79 /* Manage range rounded to smallest half denormal explicitly
*/
81 return copysign
(0x1.0p-24f
, x
);
84 /* Set lower
13 bits
*/
85 int mask
= (1 << 13) -
1;
86 /* Denormals cannot be flushed
, and they use different bit for rounding
*/
/* Widens the mask by up to 10 bits, depending on how far exp is below -14. */
88 mask
= (1 << (13 + min
(-(exp + 14), 10))) -
1;
/* Fill the discarded bits with ones, then step one ulp toward inf. */
91 const float next
= nextafter
(as_float(ux | mask
), inf
);
/* Return x unchanged when no discarded bits are set. */
92 return
((ux & mask
) == 0) ? as_float
(ux) : next
;
94 _CLC_DEF _CLC_OVERLOAD float __clc_rtn
(float x
) {
95 return
((as_uint(x) & 0x80000000) == 0) ? __clc_rtz
(x) : __clc_rti
(x);
97 _CLC_DEF _CLC_OVERLOAD float __clc_rtp
(float x
) {
98 return
((as_uint(x) & 0x80000000) == 0) ? __clc_rti
(x) : __clc_rtz
(x);
/*
 * __clc_rte: round to nearest, ties to even.
 *
 * Inspects the mantissa bits that will be discarded (`grs`) and the lowest
 * bit that will be kept (`last`), then delegates to __clc_rti when the value
 * must round up in magnitude and to __clc_rtz otherwise.
 *
 * NOTE(review): the declaration of `shift` and the condition guarding the
 * `shift += ...` adjustment are not present in this listing (the embedded
 * numbering skips 104-105 and 110). `shift` presumably starts at 13 --
 * confirm against the complete file.
 */
100 _CLC_DEF _CLC_OVERLOAD float __clc_rte
(float x
) {
101 /* Mantissa
+ implicit bit
*/
102 const uint mantissa
= (as_uint(x) & 0x7fffff) |
(1u << 23);
/* Unbiased exponent taken from bits 23..30 of the float encoding. */
103 const int exp
= (as_uint(x) >> 23 & 0xff) -
127;
106 /* The default assumes the lower 13 bits are rounded,
 * but it might be more for denormals.
 * Shifting beyond last == 0b, and qr == 00b is not necessary
 * (wording kept from the original; `qr` is not a name in the visible code).
*/
109 shift
+= min
(-(exp + 14), 15);
/* grs: the bits that will be discarded; last: the lowest bit that is kept. */
111 int mask
= (1 << shift
) -
1;
112 const uint grs
= mantissa
& mask
;
113 const uint last
= mantissa
& (1 << shift
);
114 /* IEEE round-up rule, as implemented below:
 * grs > 100b (more than half), or grs == 100b (exact tie) and last == 1.
 * exp > 15 should round to inf.
*/
116 bool roundup
= (grs > (1 << (shift -
1))) ||
117 (grs == (1 << (shift -
1)) && last
!= 0) ||
(exp > 15);
118 return roundup ? __clc_rti
(x) : __clc_rtz
(x);
/*
 * __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS)
 *
 * Expands __FUNC once per rounding variant for the given vector width, type
 * and address space:
 *   <SUFFIX>      uses __clc_rte
 *   <SUFFIX>_rtz  uses __clc_rtz
 *   <SUFFIX>_rtn  uses __clc_rtn
 *   <SUFFIX>_rtp  uses __clc_rtp
 *   <SUFFIX>_rte  uses __clc_rte
 */
#define __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS)             \
  __FUNC(SUFFIX,       VEC_SIZE, TYPE, AS, __clc_rte)   \
  __FUNC(SUFFIX##_rtz, VEC_SIZE, TYPE, AS, __clc_rtz)   \
  __FUNC(SUFFIX##_rtn, VEC_SIZE, TYPE, AS, __clc_rtn)   \
  __FUNC(SUFFIX##_rtp, VEC_SIZE, TYPE, AS, __clc_rtp)   \
  __FUNC(SUFFIX##_rte, VEC_SIZE, TYPE, AS, __clc_rte)
/*
 * FUNC(SUFFIX, VEC_SIZE, TYPE, AS)
 *
 * Thin forwarding layer over __XFUNC. Because none of the parameters is an
 * operand of ## here, the preprocessor fully macro-expands the arguments
 * before __XFUNC pastes them, so callers may pass macros as arguments.
 */
#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) \
  __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS)
/*
 * Select "vstore_half.inc" as the body file, then include the gentype driver.
 * NOTE(review): presumably <clc/math/gentype.inc> includes __CLC_BODY once per
 * generic floating-point type, and vstore_half.inc invokes FUNC() for each of
 * them -- neither file is visible here; confirm.
 */
130 #define __CLC_BODY
"vstore_half.inc"
131 #include
<clc
/math
/gentype.inc
>