1 # Alpha ev6 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 # the result to a second limb vector.
4 # Copyright (C) 2000 Free Software Foundation, Inc.
6 # This file is part of the GNU MP Library.
8 # The GNU MP Library is free software; you can redistribute it and/or modify
9 # it under the terms of the GNU Lesser General Public License as published
10 # by the Free Software Foundation; either version 2.1 of the License, or (at
11 # your option) any later version.
13 # The GNU MP Library is distributed in the hope that it will be useful, but
14 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 # License for more details.
18 # You should have received a copy of the GNU Lesser General Public License
19 # along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20 # the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
29 # This code runs at 42 cycles/limb on EV4, 18 cycles/limb on EV5, and
30 # exactly 3.625 cycles/limb on EV6...
32 # This code was written in close cooperation with ev6 pipeline expert
33 # Steve Root (root@toober.hlo.dec.com). Any errors are tege's fault, though.
35 # Register usages for unrolled loop:
40 # 22,23 save for stores
42 # Sustains 8 mul-adds in 29 cycles in the unrolled inner loop.
44 # The stores can issue a cycle late so we have paired no-op's to 'catch'
45 # them, so that further disturbance to the schedule is damped.
47 # We couldn't pair the loads, because the entangled schedule of the
48 # carries has to happen on one side {0} of the machine. Note, the total
49 # use of U0, and the total use of L0 (after attending to the stores),
50 # which is part of the reason why....
52 # This is a great schedule for the d_cache, a poor schedule for the
53 # b_cache. The lockup on U0 means that any stall can't be recovered
54 # from. Consider a ldq in L1. Say that load gets stalled because it
55 # collides with a fill from the b_cache. On the next cycle, this load
56 # gets priority. It first looks at L0, and goes there. The instruction
57 # we intended for L0 gets to look at L1, which is NOT where we want
58 # it. It either stalls 1, because it can't go in L0, or goes there, and
59 # causes a further instruction to stall.
61 # So for b_cache, we're likely going to want to put one or more cycles
62 # back into the code! And, of course, put in prefetches. For the
63 # accumulator, lds, intent to modify. For the multiplier, you might
64 # want ldq, evict next, if you're not wanting to use it again soon. Use
65 # 256 ahead of present pointer value. At a place where we have an mt
66 # followed by a bookkeeping, put the bookkeeping in upper, and the
67 # prefetch into lower.
69 # Note, the usage of physical registers per cycle is smoothed off, as
72 # Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
73 # like not to have a ldq or stq precede a conditional branch in a
74 # quadpack. The conditional branch moves the retire pointer one cycle
78 # Callee-saves regs: $9 $10 $11 $12 $13 $14 $15 $26 ?$27?
79 # Reserved regs: $29 $30 $31
80 # Free caller-saves regs in unrolled code: $24 $25 $28
81 # We should swap some of the callee-saves regs for some of the free
82 # caller-saves regs, saving some overhead cycles.
83 # Most importantly, we should write fast code for the 0-7 case.
84 # The code we use there is for the 21164, and runs at 7 cycles/limb
85 # on the 21264. Should not be hard, if we write specialized code for
86 # 1-7 limbs (the one for 0 limbs should be straightforward). We then just
87 # need a jump table indexed by the low 3 bits of the count argument.
102 ldq $
2, 0($
17) # $2 = s1_limb
103 addq $
17, 8, $
17 # s1_ptr++
104 subq $
18, 1, $
18 # size--
105 mulq $
2, $
19, $
3 # $3 = prod_low
106 ldq $
5, 0($
16) # $5 = *res_ptr
107 umulh $
2, $
19, $
0 # $0 = prod_high
108 beq $
18, $Lend0b
# jump if size was == 1
109 ldq $
2, 0($
17) # $2 = s1_limb
110 addq $
17, 8, $
17 # s1_ptr++
111 subq $
18, 1, $
18 # size--
115 addq $
16, 8, $
16 # res_ptr++
116 beq $
18, $Lend0a
# jump if size was == 2
119 $Loop0
: mulq $
2, $
19, $
3 # $3 = prod_low
120 ldq $
5, 0($
16) # $5 = *res_ptr
121 addq $
4, $
0, $
0 # cy_limb = cy_limb + 'cy'
122 subq $
18, 1, $
18 # size--
123 umulh $
2, $
19, $
4 # $4 = cy_limb
124 ldq $
2, 0($
17) # $2 = s1_limb
125 addq $
17, 8, $
17 # s1_ptr++
126 addq $
3, $
0, $
3 # $3 = cy_limb + prod_low
127 cmpult $
3, $
0, $
0 # $0 = carry from (cy_limb + prod_low)
131 addq $
16, 8, $
16 # res_ptr++
132 addq $
5, $
0, $
0 # combine carries
135 mulq $
2, $
19, $
3 # $3 = prod_low
136 ldq $
5, 0($
16) # $5 = *res_ptr
137 addq $
4, $
0, $
0 # cy_limb = cy_limb + 'cy'
138 umulh $
2, $
19, $
4 # $4 = cy_limb
139 addq $
3, $
0, $
3 # $3 = cy_limb + prod_low
140 cmpult $
3, $
0, $
0 # $0 = carry from (cy_limb + prod_low)
144 addq $
5, $
0, $
0 # combine carries
145 addq $
4, $
0, $
0 # cy_limb = prod_high + cy
164 and $
18, 7, $
20 # count for the first loop, 0-7
165 srl $
18, 3, $
18 # count for unrolled loop
168 ldq $
2, 0($
17) # $2 = s1_limb
169 addq $
17, 8, $
17 # s1_ptr++
170 subq $
20, 1, $
20 # size--
171 mulq $
2, $
19, $
3 # $3 = prod_low
172 ldq $
5, 0($
16) # $5 = *res_ptr
173 umulh $
2, $
19, $
0 # $0 = prod_high
174 beq $
20, $Lend1b
# jump if size was == 1
175 ldq $
2, 0($
17) # $2 = s1_limb
176 addq $
17, 8, $
17 # s1_ptr++
177 subq $
20, 1, $
20 # size--
181 addq $
16, 8, $
16 # res_ptr++
182 beq $
20, $Lend1a
# jump if size was == 2
185 $Loop1
: mulq $
2, $
19, $
3 # $3 = prod_low
186 ldq $
5, 0($
16) # $5 = *res_ptr
187 addq $
4, $
0, $
0 # cy_limb = cy_limb + 'cy'
188 subq $
20, 1, $
20 # size--
189 umulh $
2, $
19, $
4 # $4 = cy_limb
190 ldq $
2, 0($
17) # $2 = s1_limb
191 addq $
17, 8, $
17 # s1_ptr++
192 addq $
3, $
0, $
3 # $3 = cy_limb + prod_low
193 cmpult $
3, $
0, $
0 # $0 = carry from (cy_limb + prod_low)
197 addq $
16, 8, $
16 # res_ptr++
198 addq $
5, $
0, $
0 # combine carries
202 mulq $
2, $
19, $
3 # $3 = prod_low
203 ldq $
5, 0($
16) # $5 = *res_ptr
204 addq $
4, $
0, $
0 # cy_limb = cy_limb + 'cy'
205 umulh $
2, $
19, $
4 # $4 = cy_limb
206 addq $
3, $
0, $
3 # $3 = cy_limb + prod_low
207 cmpult $
3, $
0, $
0 # $0 = carry from (cy_limb + prod_low)
211 addq $
16, 8, $
16 # res_ptr++
212 addq $
5, $
0, $
0 # combine carries
213 addq $
4, $
0, $
0 # cy_limb = prod_high + cy
219 addq $
16, 8, $
16 # res_ptr++
223 lda $
17, -16($
17) # L1 bookkeeping
224 lda $
16, -16($
16) # L1 bookkeeping
227 # ____ UNROLLED LOOP SOFTWARE PIPELINE STARTUP ____
231 lda $
18, -1($
18) # L1 bookkeeping
235 mulq $
19, $
2, $
13 # U1
237 umulh $
19, $
2, $
14 # U1
238 mulq $
19, $
3, $
15 # U1
239 lda $
17, 64($
17) # L1 bookkeeping
242 umulh $
19, $
3, $
8 # U1
243 ldq $
2, -16($
17) # L1
244 mulq $
19, $
0, $
9 # U1
246 umulh $
19, $
0, $
10 # U1
247 addq $
6, $
13, $
6 # L0 lo + acc
248 mulq $
19, $
1, $
11 # U1
249 cmpult $
6, $
13, $
20 # L0 lo add => carry
250 lda $
16, 64($
16) # L1 bookkeeping
251 addq $
6, $
12, $
22 # U0 hi add => answer
252 cmpult $
22, $
12, $
21 # L0 hi add => carry
253 addq $
14, $
20, $
14 # U0 hi mul + carry
254 ldq $
6, -16($
16) # L1
255 addq $
7, $
15, $
23 # L0 lo + acc
256 addq $
14, $
21, $
14 # U0 hi mul + carry
258 umulh $
19, $
1, $
12 # U1
259 cmpult $
23, $
15, $
20 # L0 lo add => carry
260 addq $
23, $
14, $
23 # U0 hi add => answer
262 mulq $
19, $
2, $
13 # U1
263 cmpult $
23, $
14, $
21 # L0 hi add => carry
264 addq $
8, $
20, $
8 # U0 hi mul + carry
266 umulh $
19, $
2, $
14 # U1
267 addq $
4, $
9, $
4 # L0 lo + acc
268 stq $
22, -48($
16) # L0
269 stq $
23, -40($
16) # L1
270 mulq $
19, $
3, $
15 # U1
271 addq $
8, $
21, $
8 # U0 hi mul + carry
272 cmpult $
4, $
9, $
20 # L0 lo add => carry
273 addq $
4, $
8, $
22 # U0 hi add => answer
274 ble $
18, $Lend
# U1 bookkeeping
276 # ____ MAIN UNROLLED LOOP ____
279 bis $
31, $
31, $
31 # U1 mt
280 cmpult $
22, $
8, $
21 # L0 hi add => carry
281 addq $
10, $
20, $
10 # U0 hi mul + carry
284 bis $
31, $
31, $
31 # U1 mt
285 addq $
5, $
11, $
23 # L0 lo + acc
286 addq $
10, $
21, $
10 # L0 hi mul + carry
289 umulh $
19, $
3, $
8 # U1
290 cmpult $
23, $
11, $
20 # L0 lo add => carry
291 addq $
23, $
10, $
23 # U0 hi add => answer
294 mulq $
19, $
0, $
9 # U1
295 cmpult $
23, $
10, $
21 # L0 hi add => carry
296 addq $
12, $
20, $
12 # U0 hi mul + carry
299 umulh $
19, $
0, $
10 # U1
300 addq $
6, $
13, $
6 # L0 lo + acc
301 stq $
22, -32($
16) # L0
302 stq $
23, -24($
16) # L1
304 bis $
31, $
31, $
31 # L0 st slosh
305 mulq $
19, $
1, $
11 # U1
306 bis $
31, $
31, $
31 # L1 st slosh
307 addq $
12, $
21, $
12 # U0 hi mul + carry
309 cmpult $
6, $
13, $
20 # L0 lo add => carry
310 bis $
31, $
31, $
31 # U1 mt
311 lda $
18, -1($
18) # L1 bookkeeping
312 addq $
6, $
12, $
22 # U0 hi add => answer
314 bis $
31, $
31, $
31 # U1 mt
315 cmpult $
22, $
12, $
21 # L0 hi add => carry
316 addq $
14, $
20, $
14 # U0 hi mul + carry
319 bis $
31, $
31, $
31 # U1 mt
320 addq $
7, $
15, $
23 # L0 lo + acc
321 addq $
14, $
21, $
14 # U0 hi mul + carry
324 umulh $
19, $
1, $
12 # U1
325 cmpult $
23, $
15, $
20 # L0 lo add => carry
326 addq $
23, $
14, $
23 # U0 hi add => answer
329 mulq $
19, $
2, $
13 # U1
330 cmpult $
23, $
14, $
21 # L0 hi add => carry
331 addq $
8, $
20, $
8 # U0 hi mul + carry
334 umulh $
19, $
2, $
14 # U1
335 addq $
4, $
9, $
4 # U0 lo + acc
336 stq $
22, -16($
16) # L0
337 stq $
23, -8($
16) # L1
339 bis $
31, $
31, $
31 # L0 st slosh
340 mulq $
19, $
3, $
15 # U1
341 bis $
31, $
31, $
31 # L1 st slosh
342 addq $
8, $
21, $
8 # L0 hi mul + carry
344 cmpult $
4, $
9, $
20 # L0 lo add => carry
345 bis $
31, $
31, $
31 # U1 mt
346 lda $
17, 64($
17) # L1 bookkeeping
347 addq $
4, $
8, $
22 # U0 hi add => answer
349 bis $
31, $
31, $
31 # U1 mt
350 cmpult $
22, $
8, $
21 # L0 hi add => carry
351 addq $
10, $
20, $
10 # U0 hi mul + carry
354 bis $
31, $
31, $
31 # U1 mt
355 addq $
5, $
11, $
23 # L0 lo + acc
356 addq $
10, $
21, $
10 # L0 hi mul + carry
359 umulh $
19, $
3, $
8 # U1
360 cmpult $
23, $
11, $
20 # L0 lo add => carry
361 addq $
23, $
10, $
23 # U0 hi add => answer
362 ldq $
2, -16($
17) # L1
364 mulq $
19, $
0, $
9 # U1
365 cmpult $
23, $
10, $
21 # L0 hi add => carry
366 addq $
12, $
20, $
12 # U0 hi mul + carry
369 umulh $
19, $
0, $
10 # U1
370 addq $
6, $
13, $
6 # L0 lo + acc
374 bis $
31, $
31, $
31 # L0 st slosh
375 mulq $
19, $
1, $
11 # U1
376 bis $
31, $
31, $
31 # L1 st slosh
377 addq $
12, $
21, $
12 # U0 hi mul + carry
379 cmpult $
6, $
13, $
20 # L0 lo add => carry
380 bis $
31, $
31, $
31 # U1 mt
381 lda $
16, 64($
16) # L1 bookkeeping
382 addq $
6, $
12, $
22 # U0 hi add => answer
384 bis $
31, $
31, $
31 # U1 mt
385 cmpult $
22, $
12, $
21 # L0 hi add => carry
386 addq $
14, $
20, $
14 # U0 hi mul + carry
387 ldq $
6, -16($
16) # L1
389 bis $
31, $
31, $
31 # U1 mt
390 addq $
7, $
15, $
23 # L0 lo + acc
391 addq $
14, $
21, $
14 # U0 hi mul + carry
394 umulh $
19, $
1, $
12 # U1
395 cmpult $
23, $
15, $
20 # L0 lo add => carry
396 addq $
23, $
14, $
23 # U0 hi add => answer
399 mulq $
19, $
2, $
13 # U1
400 cmpult $
23, $
14, $
21 # L0 hi add => carry
401 addq $
8, $
20, $
8 # U0 hi mul + carry
404 umulh $
19, $
2, $
14 # U1
405 addq $
4, $
9, $
4 # L0 lo + acc
406 stq $
22, -48($
16) # L0
407 stq $
23, -40($
16) # L1
409 bis $
31, $
31, $
31 # L0 st slosh
410 mulq $
19, $
3, $
15 # U1
411 bis $
31, $
31, $
31 # L1 st slosh
412 addq $
8, $
21, $
8 # U0 hi mul + carry
414 cmpult $
4, $
9, $
20 # L0 lo add => carry
415 addq $
4, $
8, $
22 # U0 hi add => answer
416 bis $
31, $
31, $
31 # L1 mt
417 bgt $
18, $Loop
# U1 bookkeeping
419 # ____ UNROLLED LOOP SOFTWARE PIPELINE FINISH ____
421 cmpult $
22, $
8, $
21 # L0 hi add => carry
422 addq $
10, $
20, $
10 # U0 hi mul + carry
424 addq $
5, $
11, $
23 # L0 lo + acc
425 addq $
10, $
21, $
10 # L0 hi mul + carry
427 umulh $
19, $
3, $
8 # U1
428 cmpult $
23, $
11, $
20 # L0 lo add => carry
429 addq $
23, $
10, $
23 # U0 hi add => answer
430 mulq $
19, $
0, $
9 # U1
431 cmpult $
23, $
10, $
21 # L0 hi add => carry
432 addq $
12, $
20, $
12 # U0 hi mul + carry
433 umulh $
19, $
0, $
10 # U1
434 addq $
6, $
13, $
6 # L0 lo + acc
435 stq $
22, -32($
16) # L0
436 stq $
23, -24($
16) # L1
437 mulq $
19, $
1, $
11 # U1
438 addq $
12, $
21, $
12 # U0 hi mul + carry
439 cmpult $
6, $
13, $
20 # L0 lo add => carry
440 addq $
6, $
12, $
22 # U0 hi add => answer
441 cmpult $
22, $
12, $
21 # L0 hi add => carry
442 addq $
14, $
20, $
14 # U0 hi mul + carry
443 addq $
7, $
15, $
23 # L0 lo + acc
444 addq $
14, $
21, $
14 # U0 hi mul + carry
445 umulh $
19, $
1, $
12 # U1
446 cmpult $
23, $
15, $
20 # L0 lo add => carry
447 addq $
23, $
14, $
23 # U0 hi add => answer
448 cmpult $
23, $
14, $
21 # L0 hi add => carry
449 addq $
8, $
20, $
8 # U0 hi mul + carry
450 addq $
4, $
9, $
4 # U0 lo + acc
451 stq $
22, -16($
16) # L0
452 stq $
23, -8($
16) # L1
453 bis $
31, $
31, $
31 # L0 st slosh
454 addq $
8, $
21, $
8 # L0 hi mul + carry
455 cmpult $
4, $
9, $
20 # L0 lo add => carry
456 addq $
4, $
8, $
22 # U0 hi add => answer
457 cmpult $
22, $
8, $
21 # L0 hi add => carry
458 addq $
10, $
20, $
10 # U0 hi mul + carry
459 addq $
5, $
11, $
23 # L0 lo + acc
460 addq $
10, $
21, $
10 # L0 hi mul + carry
461 cmpult $
23, $
11, $
20 # L0 lo add => carry
462 addq $
23, $
10, $
23 # U0 hi add => answer
463 cmpult $
23, $
10, $
21 # L0 hi add => carry
464 addq $
12, $
20, $
12 # U0 hi mul + carry
467 addq $
12, $
21, $
0 # U0 hi mul + carry