2006-02-09 Joseph S. Myers <joseph@codesourcery.com>
[glibc-ports.git] / sysdeps / alpha / alphaev6 / addmul_1.s
bloba061fb9edb6599bd641ff7334f056caded9221c7
1 # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 # the result to a second limb vector.
4 # Copyright (C) 2000 Free Software Foundation, Inc.
6 # This file is part of the GNU MP Library.
8 # The GNU MP Library is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU Lesser General Public License as published
10 # by the Free Software Foundation; either version 2.1 of the License, or (at
11 # your option) any later version.
13 # The GNU MP Library is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 # License for more details.
18 # You should have received a copy of the GNU Lesser General Public License
19 # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20 # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21 # MA 02111-1307, USA.
23 # INPUT PARAMETERS
24 # res_ptr $16
25 # s1_ptr $17
26 # size $18
27 # s2_limb $19
29 # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
30 # exactly 3.625 cycles/limb on EV6...
32 # This code was written in close cooperation with ev6 pipeline expert
33 # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
35 # Register usages for unrolled loop:
36 # 0-3 mul's
37 # 4-7 acc's
38 # 8-15 mul results
39 # 20,21 carry's
40 # 22,23 save for stores
42 # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
44 # The stores can issue a cycle late so we have paired no-op's to 'catch'
45 # them, so that further disturbance to the schedule is damped.
47 # We couldn't pair the loads, because the entangled schedule of the
48 # carry's has to happen on one side {0} of the machine. Note, the total
49 # use of U0, and the total use of L0 (after attending to the stores),
50 # which is part of the reason why....
52 # This is a great schedule for the d_cache, a poor schedule for the
53 # b_cache. The lockup on U0 means that any stall can't be recovered
54 # from. Consider a ldq in L1. Say that load gets stalled because it
55 # collides with a fill from the b_cache. On the next cycle, this load
56 # gets priority. It first looks at L0, and goes there. The instruction
57 # we intended for L0 gets to look at L1, which is NOT where we want
58 # it. It either stalls 1, because it can't go in L0, or goes there, and
59 # causes a further instruction to stall.
61 # So for b_cache, we're likely going to want to put one or more cycles
62 # back into the code! And, of course, put in prefetches. For the
63 # accumulator, lds, intent to modify. For the multiplier, you might
64 # want ldq, evict next, if you're not wanting to use it again soon. Use
65 # 256 ahead of present pointer value. At a place where we have an mt
66 # followed by a bookkeeping, put the bookkeeping in upper, and the
67 # prefetch into lower.
69 # Note, the usage of physical registers per cycle is smoothed off, as
70 # much as possible.
72 # Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
73 # like not to have a ldq or stq precede a conditional branch in a
74 # quadpack. The conditional branch moves the retire pointer one cycle
75 # later.
77 # Optimization notes:
78 # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
79 # Reserved regs: $29 $30 $31
80 # Free caller-saves regs in unrolled code: $24 $25 $28
81 # We should swap some of the callee-saves regs for some of the free
82 # caller-saves regs, saving some overhead cycles.
83 # Most importantly, we should write fast code for the 0-7 case.
84 # The code we use there is for the 21164, and runs at 7 cycles/limb
85 # on the 21264. Should not be hard, if we write specialized code for
86 # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
87 # need a jump table indexed by the low 3 bits of the count argument.
# __mpn_addmul_1 -- entry point and the path taken when size < 8.
#
# For each limb read through s1_ptr ($17) the code forms the 128-bit product
# with s2_limb ($19), adds the low half plus the running carry limb to the
# limb at res_ptr ($16), stores the sum back, and carries the high half
# forward.  The final carry limb is left in $0 before every `ret`.
#
# Registers on this path:
#   $2      current s1 limb
#   $3      low product, then the sum that is stored
#   $5      limb loaded from res_ptr, then the carry out of the addition
#   $0, $4  together hold the pending carry limb (high product + carry bits)
#   $18     remaining limb count
#
# Assembler settings: noreorder asks that instructions not be reordered,
# noat that the assembler not use $at on its own.
89 .set noreorder
90 .set noat
91 .text
93 .globl __mpn_addmul_1
94 .ent __mpn_addmul_1
95 __mpn_addmul_1:
# Frame description: base register $30, frame size 0, return address in $26;
# `.prologue 0` marks a prologue that does not set up $gp.
# NOTE(review): the $Large path below lowers $30 by 240, which this
# zero-size .frame does not describe -- confirm whether unwind/debug info
# matters for this routine.
96 .frame $30,0,$26,0
97 .prologue 0
# Dispatch: $1 = (size < 8), unsigned.  $1 == 0 means size >= 8, which is
# handled by the unrolled code at $Large.
99 cmpult $18, 8, $1
100 beq $1, $Large
# First limb: start its multiply and fetch *res_ptr.  There is no test for
# size == 0 before this load; the limb is read and size decremented
# unconditionally.
102 ldq $2, 0($17) # $2 = s1_limb
103 addq $17, 8, $17 # s1_ptr++
104 subq $18, 1, $18 # size--
105 mulq $2, $19, $3 # $3 = prod_low
106 ldq $5, 0($16) # $5 = *res_ptr
107 umulh $2, $19, $0 # $0 = prod_high
108 beq $18, $Lend0b # jump if size was == 1
# Two or more limbs: preload the next s1 limb, then finish limb 0.
# $4 receives the carry out of (*res_ptr + prod_low); prod_high stays in $0.
109 ldq $2, 0($17) # $2 = s1_limb
110 addq $17, 8, $17 # s1_ptr++
111 subq $18, 1, $18 # size--
112 addq $5, $3, $3
113 cmpult $3, $5, $4
114 stq $3, 0($16)
115 addq $16, 8, $16 # res_ptr++
116 beq $18, $Lend0a # jump if size was == 2
# Middle limbs.  Each pass multiplies the limb already in $2, merges $4 into
# $0 to form the carry-in, preloads the following s1 limb, and leaves the new
# high product in $4 and the two carry bits summed in $0 for the next pass.
# The loop runs until the count in $18 reaches zero.
118 .align 3
119 $Loop0: mulq $2, $19, $3 # $3 = prod_low
120 ldq $5, 0($16) # $5 = *res_ptr
121 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
122 subq $18, 1, $18 # size--
123 umulh $2, $19, $4 # $4 = cy_limb
124 ldq $2, 0($17) # $2 = s1_limb
125 addq $17, 8, $17 # s1_ptr++
126 addq $3, $0, $3 # $3 = cy_limb + prod_low
127 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
128 addq $5, $3, $3
129 cmpult $3, $5, $5
130 stq $3, 0($16)
131 addq $16, 8, $16 # res_ptr++
132 addq $5, $0, $0 # combine carries
133 bne $18, $Loop0
# Last limb of the small path: the same arithmetic as one $Loop0 pass, but
# with no further load or pointer update.  The last high product ($4) is
# folded into $0, which is the return value.
134 $Lend0a:
135 mulq $2, $19, $3 # $3 = prod_low
136 ldq $5, 0($16) # $5 = *res_ptr
137 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
138 umulh $2, $19, $4 # $4 = cy_limb
139 addq $3, $0, $3 # $3 = cy_limb + prod_low
140 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
141 addq $5, $3, $3
142 cmpult $3, $5, $5
143 stq $3, 0($16)
144 addq $5, $0, $0 # combine carries
145 addq $4, $0, $0 # cy_limb = prod_high + cy
146 ret $31, ($26), 1
# size == 1: store *res_ptr + prod_low and return prod_high plus the carry
# out of that addition.
147 $Lend0b:
148 addq $5, $3, $3
149 cmpult $3, $5, $5
150 stq $3, 0($16)
151 addq $0, $5, $0
152 ret $31, ($26), 1
# $Large -- reached when size >= 8.
#
# Lowers the stack pointer by 240 bytes and saves $9-$15 (listed as
# callee-saved in the notes at the top of the file) at offsets 8..56; the
# unrolled code uses them for multiply results.  It then handles the
# (size mod 8) leading limbs one at a time, using the same instruction
# sequence as the size < 8 path but counting in $20, so that the unrolled
# code only ever sees whole groups of 8 limbs.
# NOTE(review): only offsets 8..56 of the 240-byte area are written in this
# file; the reason for the larger allocation is not visible here.
154 $Large:
155 lda $30, -240($30)
156 stq $9, 8($30)
157 stq $10, 16($30)
158 stq $11, 24($30)
159 stq $12, 32($30)
160 stq $13, 40($30)
161 stq $14, 48($30)
162 stq $15, 56($30)
# $20 = size mod 8 (limbs handled here), $18 = size / 8 (groups for the
# unrolled code).  $0 is cleared so the carry-in is zero when there are no
# leading limbs and the branch goes straight to $Lunroll.
164 and $18, 7, $20 # count for the first loop, 0-7
165 srl $18, 3, $18 # count for unrolled loop
166 bis $31, $31, $0
167 beq $20, $Lunroll
# First leading limb: start its multiply and fetch *res_ptr.
168 ldq $2, 0($17) # $2 = s1_limb
169 addq $17, 8, $17 # s1_ptr++
170 subq $20, 1, $20 # size--
171 mulq $2, $19, $3 # $3 = prod_low
172 ldq $5, 0($16) # $5 = *res_ptr
173 umulh $2, $19, $0 # $0 = prod_high
174 beq $20, $Lend1b # jump if size was == 1
# Two or more leading limbs: preload the next s1 limb and finish limb 0;
# $4 = carry out of (*res_ptr + prod_low), prod_high stays in $0.
175 ldq $2, 0($17) # $2 = s1_limb
176 addq $17, 8, $17 # s1_ptr++
177 subq $20, 1, $20 # size--
178 addq $5, $3, $3
179 cmpult $3, $5, $4
180 stq $3, 0($16)
181 addq $16, 8, $16 # res_ptr++
182 beq $20, $Lend1a # jump if size was == 2
# Middle leading limbs: same scheme as $Loop0, counting down $20.
184 .align 3
185 $Loop1: mulq $2, $19, $3 # $3 = prod_low
186 ldq $5, 0($16) # $5 = *res_ptr
187 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
188 subq $20, 1, $20 # size--
189 umulh $2, $19, $4 # $4 = cy_limb
190 ldq $2, 0($17) # $2 = s1_limb
191 addq $17, 8, $17 # s1_ptr++
192 addq $3, $0, $3 # $3 = cy_limb + prod_low
193 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
194 addq $5, $3, $3
195 cmpult $3, $5, $5
196 stq $3, 0($16)
197 addq $16, 8, $16 # res_ptr++
198 addq $5, $0, $0 # combine carries
199 bne $20, $Loop1
# Last leading limb.  Unlike $Lend0a this also advances res_ptr and, instead
# of returning, hands the carry limb in $0 to the unrolled code.
201 $Lend1a:
202 mulq $2, $19, $3 # $3 = prod_low
203 ldq $5, 0($16) # $5 = *res_ptr
204 addq $4, $0, $0 # cy_limb = cy_limb + 'cy'
205 umulh $2, $19, $4 # $4 = cy_limb
206 addq $3, $0, $3 # $3 = cy_limb + prod_low
207 cmpult $3, $0, $0 # $0 = carry from (cy_limb + prod_low)
208 addq $5, $3, $3
209 cmpult $3, $5, $5
210 stq $3, 0($16)
211 addq $16, 8, $16 # res_ptr++
212 addq $5, $0, $0 # combine carries
213 addq $4, $0, $0 # cy_limb = prod_high + cy
214 br $31, $Lunroll
# Exactly one leading limb: store it, advance res_ptr, leave the carry limb
# in $0 and fall through to the code that follows.
215 $Lend1b:
216 addq $5, $3, $3
217 cmpult $3, $5, $5
218 stq $3, 0($16)
219 addq $16, 8, $16 # res_ptr++
220 addq $0, $5, $0
# $Lunroll -- software-pipeline startup for the 8-limb unrolled code.
#
# On entry $0 holds the carry limb from the leading limbs (or 0), $18 the
# number of 8-limb groups, and $16/$17 point at the first limb of the first
# group.  Both pointers are biased by -16 so the first loads use
# displacements 16/24/32/40; each pointer is later advanced by 64, after
# which the same group is addressed with displacements -48..8.
#
# Register roles (from the header at the top of the file):
#   $0-$3   s1 limbs being multiplied     $4-$7   limbs loaded from res_ptr
#   $8-$15  multiply results              $20,$21 carry bits
#   $22,$23 sums waiting to be stored
#
# Per-limb arithmetic used here and in the loop:
#   lo     = acc + prod_low        carry1 = lo < prod_low
#   answer = lo + carry_in         carry2 = answer < carry_in
#   next carry_in = prod_high + carry1 + carry2
#
# When this section ends, all 8 s1 limbs of the first group have been
# loaded, limbs 0 and 1 have been stored, and limb 2's answer is in $22.
222 $Lunroll:
223 lda $17, -16($17) # L1 bookkeeping
224 lda $16, -16($16) # L1 bookkeeping
# The incoming carry limb becomes the carry-in ($12) for limb 0.
225 bis $0, $31, $12
227 # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
229 ldq $2, 16($17) # L1
230 ldq $3, 24($17) # L1
# Count this group; the result is tested by the `ble` at the end of the
# startup section.
231 lda $18, -1($18) # L1 bookkeeping
232 ldq $6, 16($16) # L1
233 ldq $7, 24($16) # L1
234 ldq $0, 32($17) # L1
235 mulq $19, $2, $13 # U1
236 ldq $1, 40($17) # L1
237 umulh $19, $2, $14 # U1
238 mulq $19, $3, $15 # U1
239 lda $17, 64($17) # L1 bookkeeping
240 ldq $4, 32($16) # L1
241 ldq $5, 40($16) # L1
242 umulh $19, $3, $8 # U1
243 ldq $2, -16($17) # L1
244 mulq $19, $0, $9 # U1
245 ldq $3, -8($17) # L1
246 umulh $19, $0, $10 # U1
# Limb 0: add the low product and the carry-in ($12) to the accumulator.
247 addq $6, $13, $6 # L0 lo + acc
248 mulq $19, $1, $11 # U1
249 cmpult $6, $13, $20 # L0 lo add => carry
250 lda $16, 64($16) # L1 bookkeeping
251 addq $6, $12, $22 # U0 hi add => answer
252 cmpult $22, $12, $21 # L0 hi add => carry
253 addq $14, $20, $14 # U0 hi mul + carry
254 ldq $6, -16($16) # L1
# Limb 1: $14 (limb 0 high product plus its carries) is the carry-in.
255 addq $7, $15, $23 # L0 lo + acc
256 addq $14, $21, $14 # U0 hi mul + carry
257 ldq $7, -8($16) # L1
# $12 has been consumed as the carry-in above, so it is reused for a product.
258 umulh $19, $1, $12 # U1
259 cmpult $23, $15, $20 # L0 lo add => carry
260 addq $23, $14, $23 # U0 hi add => answer
261 ldq $0, 0($17) # L1
262 mulq $19, $2, $13 # U1
263 cmpult $23, $14, $21 # L0 hi add => carry
264 addq $8, $20, $8 # U0 hi mul + carry
265 ldq $1, 8($17) # L1
266 umulh $19, $2, $14 # U1
267 addq $4, $9, $4 # L0 lo + acc
# Limbs 0 and 1 of the group are complete.
268 stq $22, -48($16) # L0
269 stq $23, -40($16) # L1
270 mulq $19, $3, $15 # U1
271 addq $8, $21, $8 # U0 hi mul + carry
272 cmpult $4, $9, $20 # L0 lo add => carry
273 addq $4, $8, $22 # U0 hi add => answer
# No further group of 8: skip the main loop and drain the pipeline.
274 ble $18, $Lend # U1 bookkeeping
276 # ____ MAIN UNROLLED LOOP ____
# $Loop -- steady state of the software pipeline.
#
# One pass stores 8 limbs: limbs 2..7 of the group whose processing is
# already in flight, then limbs 0 and 1 of the next group.  It also loads all
# 8 s1 limbs of that next group, so the loop is only entered when another
# full group remains ($18 > 0 after the decrement).  The header at the top of
# the file quotes 29 cycles per pass on EV6.
#
# Layout: the instructions are written in groups of four.
# `bis $31, $31, $31` targets the zero register and serves as a no-op filler
# ("mt" = empty slot, "st slosh" = slack after a store pair, per the header).
# The U0/U1/L0/L1 tags in the trailing comments presumably name the intended
# EV6 issue slot for each instruction -- confirm against the 21264
# documentation before relying on them.
#
# The instruction order is the schedule (`.set noreorder` is in effect); do
# not reorder lines here without re-deriving it.
#
# On entry and at the bottom of each pass: $22 = answer for limb 2,
# $8 = its carry-in, $20 = carry from its low add.
277 .align 4
278 $Loop:
279 bis $31, $31, $31 # U1 mt
280 cmpult $22, $8, $21 # L0 hi add => carry
281 addq $10, $20, $10 # U0 hi mul + carry
282 ldq $4, 0($16) # L1
284 bis $31, $31, $31 # U1 mt
285 addq $5, $11, $23 # L0 lo + acc
286 addq $10, $21, $10 # L0 hi mul + carry
287 ldq $5, 8($16) # L1
289 umulh $19, $3, $8 # U1
290 cmpult $23, $11, $20 # L0 lo add => carry
291 addq $23, $10, $23 # U0 hi add => answer
# First s1 loads for the next group begin here.
292 ldq $2, 16($17) # L1
294 mulq $19, $0, $9 # U1
295 cmpult $23, $10, $21 # L0 hi add => carry
296 addq $12, $20, $12 # U0 hi mul + carry
297 ldq $3, 24($17) # L1
299 umulh $19, $0, $10 # U1
300 addq $6, $13, $6 # L0 lo + acc
# Limbs 2 and 3 of the current group are complete.
301 stq $22, -32($16) # L0
302 stq $23, -24($16) # L1
304 bis $31, $31, $31 # L0 st slosh
305 mulq $19, $1, $11 # U1
306 bis $31, $31, $31 # L1 st slosh
307 addq $12, $21, $12 # U0 hi mul + carry
309 cmpult $6, $13, $20 # L0 lo add => carry
310 bis $31, $31, $31 # U1 mt
# Group counter, tested by the `bgt` at the bottom of the loop.
311 lda $18, -1($18) # L1 bookkeeping
312 addq $6, $12, $22 # U0 hi add => answer
314 bis $31, $31, $31 # U1 mt
315 cmpult $22, $12, $21 # L0 hi add => carry
316 addq $14, $20, $14 # U0 hi mul + carry
317 ldq $6, 16($16) # L1
319 bis $31, $31, $31 # U1 mt
320 addq $7, $15, $23 # L0 lo + acc
321 addq $14, $21, $14 # U0 hi mul + carry
322 ldq $7, 24($16) # L1
324 umulh $19, $1, $12 # U1
325 cmpult $23, $15, $20 # L0 lo add => carry
326 addq $23, $14, $23 # U0 hi add => answer
327 ldq $0, 32($17) # L1
329 mulq $19, $2, $13 # U1
330 cmpult $23, $14, $21 # L0 hi add => carry
331 addq $8, $20, $8 # U0 hi mul + carry
332 ldq $1, 40($17) # L1
334 umulh $19, $2, $14 # U1
335 addq $4, $9, $4 # U0 lo + acc
# Limbs 4 and 5 of the current group are complete.
336 stq $22, -16($16) # L0
337 stq $23, -8($16) # L1
339 bis $31, $31, $31 # L0 st slosh
340 mulq $19, $3, $15 # U1
341 bis $31, $31, $31 # L1 st slosh
342 addq $8, $21, $8 # L0 hi mul + carry
344 cmpult $4, $9, $20 # L0 lo add => carry
345 bis $31, $31, $31 # U1 mt
# s1_ptr moves on by one group; later s1 loads use negative/small offsets.
346 lda $17, 64($17) # L1 bookkeeping
347 addq $4, $8, $22 # U0 hi add => answer
349 bis $31, $31, $31 # U1 mt
350 cmpult $22, $8, $21 # L0 hi add => carry
351 addq $10, $20, $10 # U0 hi mul + carry
352 ldq $4, 32($16) # L1
354 bis $31, $31, $31 # U1 mt
355 addq $5, $11, $23 # L0 lo + acc
356 addq $10, $21, $10 # L0 hi mul + carry
357 ldq $5, 40($16) # L1
359 umulh $19, $3, $8 # U1
360 cmpult $23, $11, $20 # L0 lo add => carry
361 addq $23, $10, $23 # U0 hi add => answer
362 ldq $2, -16($17) # L1
364 mulq $19, $0, $9 # U1
365 cmpult $23, $10, $21 # L0 hi add => carry
366 addq $12, $20, $12 # U0 hi mul + carry
367 ldq $3, -8($17) # L1
369 umulh $19, $0, $10 # U1
370 addq $6, $13, $6 # L0 lo + acc
# Limbs 6 and 7 of the current group are complete (stored before res_ptr is
# advanced below).
371 stq $22, 0($16) # L0
372 stq $23, 8($16) # L1
374 bis $31, $31, $31 # L0 st slosh
375 mulq $19, $1, $11 # U1
376 bis $31, $31, $31 # L1 st slosh
377 addq $12, $21, $12 # U0 hi mul + carry
379 cmpult $6, $13, $20 # L0 lo add => carry
380 bis $31, $31, $31 # U1 mt
# res_ptr moves on by one group.
381 lda $16, 64($16) # L1 bookkeeping
382 addq $6, $12, $22 # U0 hi add => answer
384 bis $31, $31, $31 # U1 mt
385 cmpult $22, $12, $21 # L0 hi add => carry
386 addq $14, $20, $14 # U0 hi mul + carry
387 ldq $6, -16($16) # L1
389 bis $31, $31, $31 # U1 mt
390 addq $7, $15, $23 # L0 lo + acc
391 addq $14, $21, $14 # U0 hi mul + carry
392 ldq $7, -8($16) # L1
394 umulh $19, $1, $12 # U1
395 cmpult $23, $15, $20 # L0 lo add => carry
396 addq $23, $14, $23 # U0 hi add => answer
397 ldq $0, 0($17) # L1
399 mulq $19, $2, $13 # U1
400 cmpult $23, $14, $21 # L0 hi add => carry
401 addq $8, $20, $8 # U0 hi mul + carry
402 ldq $1, 8($17) # L1
404 umulh $19, $2, $14 # U1
405 addq $4, $9, $4 # L0 lo + acc
# Limbs 0 and 1 of the next group are complete.
406 stq $22, -48($16) # L0
407 stq $23, -40($16) # L1
409 bis $31, $31, $31 # L0 st slosh
410 mulq $19, $3, $15 # U1
411 bis $31, $31, $31 # L1 st slosh
412 addq $8, $21, $8 # U0 hi mul + carry
414 cmpult $4, $9, $20 # L0 lo add => carry
415 addq $4, $8, $22 # U0 hi add => answer
416 bis $31, $31, $31 # L1 mt
# Another full group remains: go round again; otherwise fall into the drain.
417 bgt $18, $Loop # U1 bookkeeping
419 # ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
# $Lend -- pipeline drain and function epilogue.
#
# Completes limbs 2..7 of the final group.  It follows the first three
# quarters of a $Loop pass with every s1 load, every load of res limbs for a
# following group, all but one no-op filler and the pointer/counter
# bookkeeping removed, so nothing is read beyond the final group.  The last
# high product plus its two carry bits is left in $0 as the return value.
# $9-$15 are then reloaded from the stack slots written at $Large, the
# 240-byte stack adjustment is undone, and control returns through $26.
#
# On entry: $22 = answer for limb 2, $8 = its carry-in, $20 = carry from its
# low add; $0/$1 hold s1 limbs 6 and 7, $3 holds s1 limb 5.
420 $Lend:
421 cmpult $22, $8, $21 # L0 hi add => carry
422 addq $10, $20, $10 # U0 hi mul + carry
# The accumulator limbs for limbs 6 and 7 of this group are loaded here.
423 ldq $4, 0($16) # L1
424 addq $5, $11, $23 # L0 lo + acc
425 addq $10, $21, $10 # L0 hi mul + carry
426 ldq $5, 8($16) # L1
427 umulh $19, $3, $8 # U1
428 cmpult $23, $11, $20 # L0 lo add => carry
429 addq $23, $10, $23 # U0 hi add => answer
430 mulq $19, $0, $9 # U1
431 cmpult $23, $10, $21 # L0 hi add => carry
432 addq $12, $20, $12 # U0 hi mul + carry
433 umulh $19, $0, $10 # U1
434 addq $6, $13, $6 # L0 lo + acc
# Limbs 2 and 3 are complete.
435 stq $22, -32($16) # L0
436 stq $23, -24($16) # L1
437 mulq $19, $1, $11 # U1
438 addq $12, $21, $12 # U0 hi mul + carry
439 cmpult $6, $13, $20 # L0 lo add => carry
440 addq $6, $12, $22 # U0 hi add => answer
441 cmpult $22, $12, $21 # L0 hi add => carry
442 addq $14, $20, $14 # U0 hi mul + carry
443 addq $7, $15, $23 # L0 lo + acc
444 addq $14, $21, $14 # U0 hi mul + carry
# High product of the last limb; it becomes the returned carry limb below.
445 umulh $19, $1, $12 # U1
446 cmpult $23, $15, $20 # L0 lo add => carry
447 addq $23, $14, $23 # U0 hi add => answer
448 cmpult $23, $14, $21 # L0 hi add => carry
449 addq $8, $20, $8 # U0 hi mul + carry
450 addq $4, $9, $4 # U0 lo + acc
# Limbs 4 and 5 are complete.
451 stq $22, -16($16) # L0
452 stq $23, -8($16) # L1
453 bis $31, $31, $31 # L0 st slosh
454 addq $8, $21, $8 # L0 hi mul + carry
455 cmpult $4, $9, $20 # L0 lo add => carry
456 addq $4, $8, $22 # U0 hi add => answer
457 cmpult $22, $8, $21 # L0 hi add => carry
458 addq $10, $20, $10 # U0 hi mul + carry
459 addq $5, $11, $23 # L0 lo + acc
460 addq $10, $21, $10 # L0 hi mul + carry
461 cmpult $23, $11, $20 # L0 lo add => carry
462 addq $23, $10, $23 # U0 hi add => answer
463 cmpult $23, $10, $21 # L0 hi add => carry
464 addq $12, $20, $12 # U0 hi mul + carry
# Limbs 6 and 7 are complete.
465 stq $22, 0($16) # L0
466 stq $23, 8($16) # L1
# Return value: last high product plus both carry bits.
467 addq $12, $21, $0 # U0 hi mul + carry
# Epilogue: restore the registers saved at $Large and release the stack area.
469 ldq $9, 8($30)
470 ldq $10, 16($30)
471 ldq $11, 24($30)
472 ldq $12, 32($30)
473 ldq $13, 40($30)
474 ldq $14, 48($30)
475 ldq $15, 56($30)
476 lda $30, 240($30)
477 ret $31, ($26), 1
479 .end __mpn_addmul_1