2 * Copyright
(c) 2014 Advanced Micro Devices
, Inc.
4 * Permission is hereby granted
, free of charge
, to any person obtaining a copy
5 * of this software and associated documentation files
(the "Software"), to deal
6 * in the Software without restriction
, including without limitation the rights
7 * to use
, copy
, modify
, merge
, publish
, distribute
, sublicense
, and
/or sell
8 * copies of the Software
, and to permit persons to whom the Software is
9 * furnished to do so
, subject to the following conditions
:
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED
"AS IS", WITHOUT WARRANTY OF ANY KIND
, EXPRESS OR
15 * IMPLIED
, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY
,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM
, DAMAGES OR OTHER
18 * LIABILITY
, WHETHER IN AN ACTION OF CONTRACT
, TORT OR OTHERWISE
, ARISING FROM
,
19 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #include
<clc
/clcmacro.h
>
25 #include
<clc
/math
/clc_floor.h
>
26 #include
<clc
/math
/clc_trunc.h
>
27 #include
<clc
/shared
/clc_max.h
>
28 #include
<math
/clc_remainder.h
>
33 _CLC_DEF _CLC_OVERLOAD float __clc_remquo
(float x
, float y
,
35 x
= __clc_flush_denormal_if_not_supported
(x);
36 y
= __clc_flush_denormal_if_not_supported
(y);
38 int ax
= ux
& EXSIGNBIT_SP32
;
39 float xa
= as_float
(ax);
41 int ex
= ax
>> EXPSHIFTBITS_SP32
;
44 int ay
= uy
& EXSIGNBIT_SP32
;
45 float ya
= as_float
(ay);
47 int ey
= ay
>> EXPSHIFTBITS_SP32
;
49 float xr
= as_float
(0x3f800000 |
(ax & 0x007fffff));
50 float yr
= as_float
(0x3f800000 |
(ay & 0x007fffff));
74 c
= (yr < 2.0f
* xr
) |
((yr == 2.0f
* xr
) & ((q & 0x1) == 0x1));
78 float s
= as_float
(ey << EXPSHIFTBITS_SP32
);
81 int qsgn
= sx
== sy ?
1 : -
1;
82 int quot
= (q & 0x7f) * qsgn
;
85 quot
= c ? qsgn
: quot
;
88 xr
= as_float
(sx ^ as_int
(xr));
90 c
= ax
> PINFBITPATT_SP32 | ay
> PINFBITPATT_SP32 | ax
== PINFBITPATT_SP32 |
93 xr
= c ? as_float
(QNANBITPATT_SP32) : xr
;
99 // remquo signature is special
, we don
't have macro for this
100 #define __VEC_REMQUO
(TYPE, VEC_SIZE
, HALF_VEC_SIZE
) \
101 _CLC_DEF _CLC_OVERLOAD TYPE
##VEC_SIZE __clc_remquo
( \
102 TYPE
##VEC_SIZE x
, TYPE
##VEC_SIZE y
, __private int
##VEC_SIZE
*quo
) { \
103 int
##HALF_VEC_SIZE lo
, hi
; \
104 TYPE
##VEC_SIZE ret
; \
105 ret.lo
= __clc_remquo
(x.lo
, y.lo
, &lo
); \
106 ret.hi
= __clc_remquo
(x.hi
, y.hi
, &hi
); \
112 #define __VEC3_REMQUO
(TYPE) \
113 _CLC_DEF _CLC_OVERLOAD TYPE
##3 __clc_remquo
(TYPE##3 x
, TYPE
##3 y
, \
114 __private int
##3 * quo
) { \
118 ret.s01
= __clc_remquo
(x.s01
, y.s01
, &lo
); \
119 ret.s2
= __clc_remquo
(x.s2
, y.s2
, &hi
); \
124 __VEC_REMQUO
(float, 2, )
126 __VEC_REMQUO
(float, 4, 2)
127 __VEC_REMQUO
(float, 8, 4)
128 __VEC_REMQUO
(float, 16, 8)
131 _CLC_DEF _CLC_OVERLOAD double __clc_remquo
(double x
, double y
,
132 __private int
*pquo
) {
133 ulong ux
= as_ulong
(x);
134 ulong ax
= ux
& ~SIGNBIT_DP64
;
135 ulong xsgn
= ux ^ ax
;
136 double dx
= as_double
(ax);
137 int xexp
= convert_int
(ax >> EXPSHIFTBITS_DP64
);
138 int xexp1
= 11 -
(int)clz
(ax & MANTBITS_DP64
);
139 xexp1
= xexp
< 1 ? xexp1
: xexp
;
141 ulong uy
= as_ulong
(y);
142 ulong ay
= uy
& ~SIGNBIT_DP64
;
143 double dy
= as_double
(ay);
144 int yexp
= convert_int
(ay >> EXPSHIFTBITS_DP64
);
145 int yexp1
= 11 -
(int)clz
(ay & MANTBITS_DP64
);
146 yexp1
= yexp
< 1 ? yexp1
: yexp
;
148 int qsgn
= ((ux ^ uy
) & SIGNBIT_DP64
) == 0UL ?
1 : -
1;
150 // First assume |x|
> |y|
152 // Set ntimes to the number of times we need to do a
153 // partial remainder. If the exponent of x is an exact multiple
154 // of
53 larger than the exponent of y
, and the mantissa of x is
155 // less than the mantissa of y
, ntimes will be one too large
156 // but it doesn
't matter - it just means that we
'll go round
157 // the loop below one extra time.
158 int ntimes
= __clc_max
(0, (xexp1 - yexp1
) / 53);
159 double w
= ldexp
(dy, ntimes
* 53);
160 w
= ntimes
== 0 ? dy
: w
;
161 double scale
= ntimes
== 0 ?
1.0 : 0x1.0p-53
;
163 // Each time round the loop we compute a partial remainder.
164 // This is done by subtracting a large multiple of w
165 // from x each time
, where w is a scaled up version of y.
166 // The subtraction must be performed exactly in quad
167 // precision
, though the result at each stage can
168 // fit exactly in a double precision number.
172 for
(i = 0; i < ntimes; i++) {
173 // Compute integral multiplier
174 t
= __clc_trunc
(dx / w
);
176 // Compute w
* t in quad precision
180 // Subtract w
* t from dx
182 dx
= v
+ (((dx - v
) - p
) - pp
);
184 // If t was one too large
, dx will be negative. Add back one w.
185 dx
+= dx
< 0.0 ? w
: 0.0;
187 // Scale w down by
2^
(-53) for the next iteration
192 // Variable todd says whether the integer t is odd or not
193 t
= __clc_floor
(dx / w
);
200 dx
= v
+ (((dx - v
) - p
) - pp
);
207 // At this point
, dx lies in the range
[0,dy
)
209 // For the remainder function
, we need to adjust dx
210 // so that it lies in the range
(-y/2, y
/2] by carefully
211 // subtracting w
(== dy
== y
) if necessary. The rigmarole
212 // with todd is to get the correct sign of the result
213 // when x
/y lies exactly half way between two integers
,
214 // when we need to choose the even integer.
216 int al
= (2.0
* dx
> w
) |
(todd & (2.0
* dx
== w
));
217 double dxl
= dx -
(al ? w
: 0.0);
219 int ag
= (dx > 0.5 * w
) |
(todd & (dx == 0.5 * w
));
220 double dxg
= dx -
(ag ? w
: 0.0);
222 dx
= dy
< 0x1.0p
+1022 ? dxl
: dxg
;
223 lt
+= dy
< 0x1.0p
+1022 ? al
: ag
;
224 int quo
= ((int)lt
& 0x7f) * qsgn
;
226 double ret
= as_double
(xsgn ^ as_ulong
(dx));
229 // Now handle |x|
== |y|
232 quo
= c ? qsgn
: quo
;
235 // Next
, handle |x|
< |y|
240 c
&= (yexp<1023 & 2.0 * dx
> dy
) |
(dx > 0.5 * dy
);
241 quo
= c ? qsgn
: quo
;
242 // we could use a conversion here instead since qsgn
= +-
1
243 p
= qsgn
== 1 ? -
1.0 : 1.0;
247 // We don
't need anything special for |x|
== 0
252 ret
= c ? as_double
(QNANBITPATT_DP64) : ret
;
255 c
= yexp
> BIASEDEMAX_DP64
;
261 c
= xexp
> BIASEDEMAX_DP64
;
263 ret
= c ? as_double
(QNANBITPATT_DP64) : ret
;
268 __VEC_REMQUO
(double, 2, )
269 __VEC3_REMQUO
(double)
270 __VEC_REMQUO
(double, 4, 2)
271 __VEC_REMQUO
(double, 8, 4)
272 __VEC_REMQUO
(double, 16, 8)
277 #pragma OPENCL EXTENSION cl_khr_fp16
: enable
279 _CLC_OVERLOAD _CLC_DEF half __clc_remquo
(half x
, half y
, __private int
*pquo
) {
280 return
(half)__clc_remquo
((float)x
, (float)y
, pquo
);
282 __VEC_REMQUO
(half, 2, )
284 __VEC_REMQUO
(half, 4, 2)
285 __VEC_REMQUO
(half, 8, 4)
286 __VEC_REMQUO
(half, 16, 8)