sysdeps/m88k/m88100/add_n.S

   1 ; mc88100 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
   2 ; sum in a third limb vector.
   3
   4 ; Copyright (C) 1992, 1994 Free Software Foundation, Inc.
   5
   6 ; This file is part of the GNU MP Library.
   7
   8 ; The GNU MP Library is free software; you can redistribute it and/or modify
   9 ; it under the terms of the GNU General Public License as published by
  10 ; the Free Software Foundation; either version 2, or (at your option)
  11 ; any later version.
  12
  13 ; The GNU MP Library is distributed in the hope that it will be useful,
  14 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 ; GNU General Public License for more details.
  17
  18 ; You should have received a copy of the GNU General Public License
  19 ; along with the GNU MP Library; see the file COPYING.  If not, write to
  20 ; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  21
  22
  23 ; INPUT PARAMETERS
  24 ; res_ptr       r2
  25 ; s1_ptr        r3
  26 ; s2_ptr        r4
  27 ; size          r5
  28
  29 ; This code has been optimized to run one instruction per clock, avoiding
  30 ; load stalls and writeback contention.  As a result, the instruction
  31 ; order is not always natural.
  32
  33 ; The speed is approximately 4.3 clocks/limb + 18 clocks/limb-vector.
  34
  35 #include "sysdep.h"
  36
  37 ENTRY (__mpn_add_n)                     ; res[] = s1[] + s2[] over size limbs; carry-out returned in r2
  38         ld      r6,r3,0                 ; read first limb from s1_ptr
  39         extu    r10,r5,4                ; r10 = size / 16 (loop counter)
  40         ld      r7,r4,0                 ; read first limb from s2_ptr
  41 ; Work out where to enter the 16-way unrolled body: skip (-size mod 16) limb slots.
  42         subu.co r5,r0,r5                ; r5 = -size (clear carry as side effect)
  43         mak     r5,r5,4<4>              ; r5 = (-size mod 16) * 16: code bytes to skip
  44         bcnd    eq0,r5,Lzero            ; size is a multiple of 16: start at top of body
  45
  46         or      r12,r0,lo16(Lbase)      ; r12 = low half of the address of Lbase
  47         or.u    r12,r12,hi16(Lbase)     ; or in the high half
  48         addu    r12,r12,r5              ; r12 is address for entering in loop
  49 ; Bias the pointers down by the skipped limbs so the body offsets reach limb 0.
  50         extu    r5,r5,2                 ; divide by 4: 16 code bytes per 4-byte limb
  51         subu    r2,r2,r5                ; adjust res_ptr
  52         subu    r3,r3,r5                ; adjust s1_ptr
  53         subu    r4,r4,r5                ; adjust s2_ptr
  54 ; Entry points alternate between the r6/r7 and r8/r9 pairs, so preload both.
  55         or      r8,r6,r0                ; r8 = first s1 limb
  56
  57         jmp.n   r12                     ; enter body; delay slot below runs first
  58          or     r9,r7,r0                ; (delay slot) r9 = first s2 limb
  59 ; Only the .co/.cio forms write the carry bit here, so pointer and counter updates may sit inside the carry chain.
  60 Loop:   addu    r3,r3,64                ; s1_ptr += 16 limbs
  61         st      r8,r2,60                ; store limb 15 of the previous pass
  62         addu    r4,r4,64                ; s2_ptr += 16 limbs
  63         ld      r6,r3,0                 ; preload limb 0 of s1
  64         addu    r2,r2,64                ; res_ptr += 16 limbs
  65         ld      r7,r4,0                 ; preload limb 0 of s2
  66 Lzero:  subu    r10,r10,1       ; add 0 + 16r limbs (adjust loop counter)
  67 Lbase:  ld      r8,r3,4         ; Lbase = origin of the computed entry address
  68         addu.cio r6,r6,r7       ; limb 0: add with carry-in, set carry-out
  69         ld      r9,r4,4         ; preload limb 1 of s2
  70         st      r6,r2,0         ; store limb 0
  71         ld      r6,r3,8         ; add 15 + 16r limbs
  72         addu.cio r8,r8,r9
  73         ld      r7,r4,8
  74         st      r8,r2,4
  75         ld      r8,r3,12        ; add 14 + 16r limbs
  76         addu.cio r6,r6,r7
  77         ld      r9,r4,12
  78         st      r6,r2,8
  79         ld      r6,r3,16        ; add 13 + 16r limbs
  80         addu.cio r8,r8,r9
  81         ld      r7,r4,16
  82         st      r8,r2,12
  83         ld      r8,r3,20        ; add 12 + 16r limbs
  84         addu.cio r6,r6,r7
  85         ld      r9,r4,20
  86         st      r6,r2,16
  87         ld      r6,r3,24        ; add 11 + 16r limbs
  88         addu.cio r8,r8,r9
  89         ld      r7,r4,24
  90         st      r8,r2,20
  91         ld      r8,r3,28        ; add 10 + 16r limbs
  92         addu.cio r6,r6,r7
  93         ld      r9,r4,28
  94         st      r6,r2,24
  95         ld      r6,r3,32        ; add 9 + 16r limbs
  96         addu.cio r8,r8,r9
  97         ld      r7,r4,32
  98         st      r8,r2,28
  99         ld      r8,r3,36        ; add 8 + 16r limbs
 100         addu.cio r6,r6,r7
 101         ld      r9,r4,36
 102         st      r6,r2,32
 103         ld      r6,r3,40        ; add 7 + 16r limbs
 104         addu.cio r8,r8,r9
 105         ld      r7,r4,40
 106         st      r8,r2,36
 107         ld      r8,r3,44        ; add 6 + 16r limbs
 108         addu.cio r6,r6,r7
 109         ld      r9,r4,44
 110         st      r6,r2,40
 111         ld      r6,r3,48        ; add 5 + 16r limbs
 112         addu.cio r8,r8,r9
 113         ld      r7,r4,48
 114         st      r8,r2,44
 115         ld      r8,r3,52        ; add 4 + 16r limbs
 116         addu.cio r6,r6,r7
 117         ld      r9,r4,52
 118         st      r6,r2,48
 119         ld      r6,r3,56        ; add 3 + 16r limbs
 120         addu.cio r8,r8,r9
 121         ld      r7,r4,56
 122         st      r8,r2,52
 123         ld      r8,r3,60        ; add 2 + 16r limbs
 124         addu.cio r6,r6,r7
 125         ld      r9,r4,60
 126         st      r6,r2,56
 127         bcnd.n  ne0,r10,Loop    ; add 1 + 16r limbs
 128          addu.cio r8,r8,r9      ; (delay slot) sum limb 15; stored at Loop or below
 129
 130         st      r8,r2,60                ; store most significant limb
 131
 132         jmp.n    r1                     ; return to caller (address in r1)
 133          addu.ci r2,r0,r0               ; return carry-out from most sign. limb