/* Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you link this library with other files,
   some of which are compiled with GCC, to produce an executable,
   this library does not by itself cause the resulting executable
   to be covered by the GNU General Public License.
   This exception does not however invalidate any other reasons why
   the executable file might be covered by the GNU General Public License.  */
// Compute an 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
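//
// (Sketch of the scheme below: frcpa supplies an initial approximation
// y = f10 to 1/farg1; each pass forms the residual e = 1 - farg1*y and
// folds e and its powers back into y and into the quotient estimate
// q = farg0*y; the final fma.s0 applies the last correction
// q + (farg0 - farg1*q)*y in the user rounding mode.)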
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fnma.s1 f11 = farg1, f10, f1
(p6)    fma.s1 f12 = farg0, f10, f0
(p6)    fma.s1 f13 = f11, f11, f0
(p6)    fma.s1 f14 = f11, f11, f11
(p6)    fma.s1 f11 = f13, f13, f11
(p6)    fma.s1 f13 = f14, f10, f10
(p6)    fma.s1 f10 = f13, f11, f10
(p6)    fnma.s1 f11 = farg1, f12, farg0
(p6)    fma.s1 f11 = f11, f10, f12
(p6)    fnma.s1 f12 = farg1, f10, f1
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fnma.s1 f12 = farg1, f11, farg0
(p6)    fma.s0 fret0 = f12, f10, f11
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
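//
// (Same reciprocal-refinement idea as __divxf3: f12 holds the residual
// e = 1 - farg1*f10 and is repeatedly squared (f13 = e^2, then e^4),
// which converges quickly enough for double precision; the fma.d and
// fnma.d steps round their results to double.)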
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fmpy.s1 f11 = farg0, f10
(p6)    fnma.s1 f12 = farg1, f10, f1
(p6)    fma.s1 f11 = f12, f11, f11
(p6)    fmpy.s1 f13 = f12, f12
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fma.s1 f11 = f13, f11, f11
(p6)    fmpy.s1 f12 = f13, f13
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fma.d.s1 f11 = f12, f11, f11
(p6)    fma.s1 f10 = f12, f10, f10
(p6)    fnma.d.s1 f8 = farg1, f11, farg0
(p6)    fma.d fret0 = f8, f10, f11
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// farg0 holds the dividend.  farg1 holds the divisor.
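//
// (Single precision needs the least work: f9 holds the residual
// e = 1 - farg1*f10 and is squared twice, the quotient estimate in f8 is
// corrected after each squaring, and fnorm.s.s0 rounds the result to
// float in the user rounding mode.)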
        cmp.eq p7, p0 = r0, r0
        frcpa.s0 f10, p6 = farg0, farg1
(p6)    cmp.ne p7, p0 = r0, r0
        .pred.rel.mutex p6, p7
(p6)    fmpy.s1 f8 = farg0, f10
(p6)    fnma.s1 f9 = farg1, f10, f1
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fmpy.s1 f9 = f9, f9
(p6)    fma.d.s1 f10 = f9, f8, f8
(p6)    fnorm.s.s0 fret0 = f10
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
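//
// (The integer routines below all follow the same plan: move the operands
// into FP registers, convert them to floating point, refine the frcpa
// reciprocal until the quotient is accurate enough for a 64-bit result,
// truncate with fcvt.fx.trunc, and move the result back to a GP register.
// The cmp sets p7 when the divisor is zero so that case can be trapped.)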
        // Transfer inputs to FP registers.
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
(p6)    fma.s1 f10 = f12, f10, f11
        // Round quotient to an integer.
        fcvt.fx.trunc.s1 f10 = f10
        // Transfer result to GP registers.
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
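//
// (The remainder is recovered from the truncated quotient: after the same
// refinement and fcvt.fx.trunc as in __divdi3, the final xma.l computes
// trunc(a/b)*f9 + f14, which equals a - trunc(a/b)*b provided the earlier
// setup leaves the negated divisor in f9 and the dividend in f14.)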
        // Transfer inputs to FP registers.
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        // Convert the inputs to FP, so that they won't be treated as unsigned.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
(p6)    fma.s1 f10 = f12, f10, f11
        fcvt.fx.trunc.s1 f10 = f10
        xma.l f10 = f10, f9, f14
        // Transfer result to GP registers.
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
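//
// (Same scheme as __divdi3, with the FP conversions and the final
// truncation done in their unsigned forms; note fcvt.fxu.trunc below.)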
        // Transfer inputs to FP registers.
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        // Convert the inputs to FP, to avoid FP software-assist faults.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        // 3 Newton-Raphson iterations.
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
(p6)    fma.s1 f10 = f12, f10, f11
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        // Transfer result to GP registers.
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
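//
// (Unsigned counterpart of __moddi3: the quotient is truncated with
// fcvt.fxu.trunc and the same final xma.l step reconstructs the remainder
// as a - trunc(a/b)*b.)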
        // Transfer inputs to FP registers.
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        // Convert the inputs to FP, to avoid FP software assist faults.
        // Compute the reciprocal approximation.
        frcpa.s1 f10, p6 = f8, f9
        // 3 Newton-Raphson iterations.
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f11 = f9, f10, f1
(p6)    fma.s1 f12 = f11, f12, f12
(p6)    fmpy.s1 f13 = f11, f11
(p6)    fma.s1 f10 = f11, f10, f10
(p6)    fma.s1 f11 = f13, f12, f12
(p6)    fma.s1 f10 = f13, f10, f10
(p6)    fnma.s1 f12 = f9, f11, f8
(p6)    fma.s1 f10 = f12, f10, f11
        // Round quotient to an unsigned integer.
        fcvt.fxu.trunc.s1 f10 = f10
        xma.l f10 = f10, f9, f14
        // Transfer result to GP registers.
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
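//
// (A 32-bit quotient needs much less precision, so only two refinement
// passes follow frcpa, which already supplies roughly 8.6 valid bits.
// The second fma reads f11, a constant the routine prepares earlier;
// the sequence then truncates with fcvt.fx.trunc as usual.)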
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        frcpa.s1 f10, p6 = f8, f9
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
(p6)    fma.s1 f10 = f9, f8, f8
        fcvt.fx.trunc.s1 f10 = f10
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
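//
// (Same short quotient computation as __divsi3, followed by the remainder
// reconstruction of __moddi3: the final xma.l folds the truncated quotient
// back into the dividend, with f9 and f13 prepared accordingly earlier.)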
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        frcpa.s1 f10, p6 = f8, f9
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
(p6)    fma.s1 f10 = f10, f12, f12
        fcvt.fx.trunc.s1 f10 = f10
        xma.l f10 = f10, f9, f13
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
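//
// (Same two-pass refinement as __divsi3, with the truncation done in its
// unsigned form, fcvt.fxu.trunc.)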
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        frcpa.s1 f10, p6 = f8, f9
(p6)    fmpy.s1 f8 = f8, f10
(p6)    fnma.s1 f9 = f9, f10, f1
(p6)    fma.s1 f8 = f9, f8, f8
(p6)    fma.s1 f9 = f9, f9, f11
(p6)    fma.s1 f10 = f9, f8, f8
        fcvt.fxu.trunc.s1 f10 = f10
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// alternative.
//
// in0 holds the dividend.  in1 holds the divisor.
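//
// (Unsigned 32-bit remainder: unsigned truncation with fcvt.fxu.trunc,
// then the same xma.l reconstruction used by __modsi3.)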
        // Check divide by zero.
        cmp.ne.unc p0,p7=0,in1
        frcpa.s1 f10, p6 = f8, f9
(p6)    fmpy.s1 f12 = f8, f10
(p6)    fnma.s1 f10 = f9, f10, f1
(p6)    fma.s1 f12 = f10, f12, f12
(p6)    fma.s1 f10 = f10, f10, f11
(p6)    fma.s1 f10 = f10, f12, f12
        fcvt.fxu.trunc.s1 f10 = f10
        xma.l f10 = f10, f9, f13
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".
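//
// ("flushrs" writes every dirty stacked register out to the backing store,
// after which ar.bspstore has caught up with ar.bsp; saving ar.bsp here and
// later writing that value back through ar.bspstore therefore restores a
// consistent register-stack state.)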
// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
        .global __ia64_save_stack_nonlocal
        .proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
        alloc r18 = ar.pfs, 2, 0, 0, 0
        .endp __ia64_save_stack_nonlocal
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//                           void *static_chain);
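//
// (Reverses what __ia64_save_stack_nonlocal recorded: it reinstalls the
// saved stack pointer and register-backing-store state (hence the write to
// ar.bspstore below), puts static_chain into the static-chain register,
// and branches to target_label through rp.)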
        .global __ia64_nonlocal_goto
        .proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
        alloc r20 = ar.pfs, 3, 0, 0, 0
        mov.ret.sptk rp = in0, .L0
        mov ar.bspstore = r16
        .endp __ia64_nonlocal_goto
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.
// void __ia64_restore_stack_nonlocal(void *save_area)
        .global __ia64_restore_stack_nonlocal
        .proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
        alloc r20 = ar.pfs, 4, 0, 0, 0
        mov ar.bspstore = r16
        .endp __ia64_restore_stack_nonlocal
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form:
//
//              +-------------------+ >
//      TRAMP:  | __ia64_trampoline | |
//              +-------------------+ > fake function descriptor
//              | TRAMP+16          | |
//              +-------------------+ >
//              | target descriptor |
//              +-------------------+
//              | static link       |
//              +-------------------+
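//
// (A call through TRAMP therefore lands in __ia64_trampoline with the fake
// descriptor's gp pointing at TRAMP+16; the stub loads the real target
// descriptor and the static link from there, installs them, and branches
// to the target.)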
        .global __ia64_trampoline
        .proc __ia64_trampoline
        .endp __ia64_trampoline
// Thunks for backward compatibility.
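//
// (Each branch below is the body of a thunk that keeps an old TFmode entry
// point alive for previously built binaries by tail-branching to the
// corresponding XFmode implementation.)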
        br.sptk.many __fixxfti
        br.sptk.many __fixunsxfti
        br.sptk.many __floattixf