usr/src/cmd/sgs/rtld.4.x/umultiply.s

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License, Version 1.0 only
   6  * (the "License").  You may not use this file except in compliance
   7  * with the License.
   8  *
   9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  10  * or http://www.opensolaris.org/os/licensing.
  11  * See the License for the specific language governing permissions
  12  * and limitations under the License.
  13  *
  14  * When distributing Covered Code, include this CDDL HEADER in each
  15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  16  * If applicable, add the following below this CDDL HEADER, with the
  17  * fields enclosed by brackets "[]" replaced with your own identifying
  18  * information: Portions Copyright [yyyy] [name of copyright owner]
  19  *
  20  * CDDL HEADER END
  21  */
  22 /*
  23  *      .seg    "data"
  24  *      .asciz  "Copyr 1987 Sun Micro"
  25  *      .align  4
  26  */
  27         .seg    "text"
  28
  29 #ident  "%Z%%M% %I%     %E% SMI"
  30
  31 !       Copyright (c) 1987 by Sun Microsystems, Inc.
  32
  33
  34 #include <sys/asm_linkage.h>
  35
  36 /*
  37  * procedure to perform a 32 by 32 unsigned integer multiply.
  38  * pass the multiplier into %o0, and the multiplicand into %o1
  39  * the least significant 32 bits of the result will be returned in %o0,
  40  * and the most significant in %o1
  41  *
  42  * Most unsigned integer multiplies involve small numbers, so it is
  43  * worthwhile to optimize for short multiplies at the expense of long
  44  * multiplies.  This code checks the size of the multiplier, and has
  45  * special cases for the following:
  46  *
  47  *      4 or fewer bit multipliers:     19 or 21 instruction cycles
  48  *      8 or fewer bit multipliers:     26 or 28 instruction cycles
  49  *      12 or fewer bit multipliers:    34 or 36 instruction cycles
  50  *      16 or fewer bit multipliers:    42 or 44 instruction cycles
  51  *
  52  * Long multipliers require 58 or 60 instruction cycles.
  53  *
  54  * This code indicates that overflow has occurred, by leaving the Z condition
  55  * code clear. The following call sequence would be used if you wish to
  56  * deal with overflow:
  57  *
  58  *              call    .umul
  59  *              nop             ( or set up last parameter here )
  60  *              bnz     overflow_code   (or tnz to overflow handler)
  61  */
  62
  63 !       RTENTRY(.umul)
  64         .global .umul
  65 .umul:
             !
             ! Entry point: picks the shortest multiply sequence that covers
             ! the multiplier, falling through to the full 33-step sequence.
             !
             ! Register roles in this routine:
             !   %o0  multiplier on entry, low word of the product on return
             !   %o1  multiplicand on entry, high word of the product on return
             !   %o4  partial product built up by the mulscc steps
             !   %o5  0xffff0000 mask used for the 16-bit test
             !   %y   receives the multiplier; read back once the steps are done
             !
  66         wr      %o0, %y                 ! multiplier to Y register
  67
  68         andncc  %o0, 0xf, %o4           ! mask out lower 4 bits; if branch
  69                                         ! taken, %o4, N and V have been cleared
  70
  71         be      umul_4bit               ! 4-bit multiplier
  72         sethi   %hi(0xffff0000), %o5    ! mask for 16-bit case; have to
  73                                         ! wait 3 instructions after wr
  74                                         ! before %y has stabilized anyway
  75
             !
             ! The delay-slot mulscc of each annulled branch (be,a) below is
             ! executed only if the branch is taken; the target label then
             ! carries on with the second step.
             !
  76         andncc  %o0, 0xff, %o4
  77         be,a    umul_8bit               ! 8-bit multiplier
  78         mulscc  %o4, %o1, %o4           ! first iteration of 9
  79
  80         andncc  %o0, 0xfff, %o4
  81         be,a    umul_12bit              ! 12-bit multiplier
  82         mulscc  %o4, %o1, %o4           ! first iteration of 13
  83
  84         andcc   %o0, %o5, %o4           ! any of the upper 16 bits set?
  85         be,a    umul_16bit              ! 16-bit multiplier
  86         mulscc  %o4, %o1, %o4           ! first iteration of 17
  87
  88         andcc   %g0, %g0, %o4           ! zero the partial product
  89                                         ! and clear N and V conditions
  90         !
  91         ! long multiply
  92         !
  93         mulscc  %o4, %o1, %o4           ! first iteration of 33
  94         mulscc  %o4, %o1, %o4
  95         mulscc  %o4, %o1, %o4
  96         mulscc  %o4, %o1, %o4
  97         mulscc  %o4, %o1, %o4
  98         mulscc  %o4, %o1, %o4
  99         mulscc  %o4, %o1, %o4
 100         mulscc  %o4, %o1, %o4
 101         mulscc  %o4, %o1, %o4
 102         mulscc  %o4, %o1, %o4
 103         mulscc  %o4, %o1, %o4
 104         mulscc  %o4, %o1, %o4
 105         mulscc  %o4, %o1, %o4
 106         mulscc  %o4, %o1, %o4
 107         mulscc  %o4, %o1, %o4
 108         mulscc  %o4, %o1, %o4
 109         mulscc  %o4, %o1, %o4
 110         mulscc  %o4, %o1, %o4
 111         mulscc  %o4, %o1, %o4
 112         mulscc  %o4, %o1, %o4
 113         mulscc  %o4, %o1, %o4
 114         mulscc  %o4, %o1, %o4
 115         mulscc  %o4, %o1, %o4
 116         mulscc  %o4, %o1, %o4
 117         mulscc  %o4, %o1, %o4
 118         mulscc  %o4, %o1, %o4
 119         mulscc  %o4, %o1, %o4
 120         mulscc  %o4, %o1, %o4
 121         mulscc  %o4, %o1, %o4
 122         mulscc  %o4, %o1, %o4
 123         mulscc  %o4, %o1, %o4
 124         mulscc  %o4, %o1, %o4           ! 32nd iteration
 125         mulscc  %o4, %g0, %o4           ! last iteration only shifts
 126         !
 127         ! For unsigned multiplies, a pure shift-and-add approach yields the
 128         ! correct result.  Signed multiplies introduce complications.
 129         !
 130         ! With 32-bit twos-complement numbers, -x can be represented as
 131         !
 132         !       ((2 - (x/(2**32))) mod 2) * 2**32.
 133         !
 134         ! To simplify the equations, the radix point can be moved to just
 135         ! to the left of the sign bit.  So:
 136         !
 137         !        x *  y = (xy) mod 2
 138         !       -x *  y = (2 - x) mod 2 * y = (2y - xy) mod 2
 139         !        x * -y = x * (2 - y) mod 2 = (2x - xy) mod 2
 140         !       -x * -y = (2 - x) * (2 - y) = (4 - 2x - 2y + xy) mod 2
 141         !
 142         ! Because of the way the shift into the partial product is calculated
 143         ! (N xor V), the extra term is automagically removed for negative
 144         ! multiplicands, so no adjustment is necessary.
 145         !
 146         ! But for unsigned multiplies, the high-order bit of the multiplicand
 147         ! is incorrectly treated as a sign bit.  For unsigned multiplies where
 148         ! the high-order bit of the multiplicand is one, the result is
 149         !
 150         !       xy - y * (2**32)
 151         !
 152         ! we fix that here
 153         !
 154         tst     %o1                     ! multiplicand high bit set?
 155         bge     1f                      ! no: skip the correction
 156         nop                             ! (branch delay slot)
 157
 158         add     %o4, %o0, %o4           ! add (2**32) * %o0; bits 63-32
 159                                         ! of the product are in %o4
 160         !
 161         ! The multiply hasn't overflowed if the high-order bits are 0
 162         !
 163         ! if you are not interested in detecting overflow,
 164         ! replace the following code with:
 165         !
 166         !       1:
 167         !               rd      %y, %o0
 168         !               retl
 169         !               mov     %o4, %o1
 170         !
 171 1:
 172         rd      %y, %o0                 ! low word of the product
 173         retl                            ! leaf routine return
 174         addcc   %o4, %g0, %o1           ! return high-order bits and set Z if
 175                                         ! high order bits are 0
 176         !
 177         ! 4-bit multiply
 178         !
 179 umul_4bit:
             !
             ! Reached through the plain (non-annulled) "be" in .umul, so none
             ! of the multiply steps has been executed yet.  With only 5 steps
             ! instead of 33, the %o4:%y pair ends up holding the product 28
             ! bits further left than in the long case; the shifts at the end
             ! move it back into place.
             !
 180         mulscc  %o4, %o1, %o4           ! first iteration of 5
 181         mulscc  %o4, %o1, %o4
 182         mulscc  %o4, %o1, %o4
 183         mulscc  %o4, %o1, %o4           ! 4th iteration
 184         mulscc  %o4, %g0, %o4           ! last iteration only shifts
 185
 186         rd      %y, %o5                 ! low-order product bits
 187         !
 188         ! The following code adds (2**32) * %o0 to the product if the
 189         ! multiplicand had its high bit set (see 32-bit case for explanation)
 190         !
 191         tst     %o1                     ! multiplicand high bit set?
 192         bge     2f                      ! no: skip the correction
 193         sra     %o4, 28, %o1            ! right shift high bits by 28 bits
             !                                 (delay slot: runs on both paths)
 194
 195         add     %o1, %o0, %o1           ! high word += multiplier
 196         !
 197         ! The multiply hasn't overflowed if high-order bits are 0
 198         !
 199         ! if you are not interested in detecting overflow,
 200         ! replace the following code with:
 201         !
 202         !       2:
 203         !               sll     %o4, 4, %o0
 204         !               srl     %o5, 28, %o5
 205         !               retl
 206         !               or      %o5, %o0, %o0
 207         !
 208 2:
 209         sll     %o4, 4, %o0             ! left shift middle bits by 4 bits
 210         srl     %o5, 28, %o5            ! right shift low bits by 28 bits
 211         or      %o5, %o0, %o0           ! merge for true product
 212         retl                            ! leaf routine return
 213         tst     %o1                     ! set Z if high order bits are 0
 214         !
 215         ! 8-bit multiply
 216         !
 217 umul_8bit:
             !
             ! Reached through the annulled "be,a" in .umul, whose delay slot
             ! has already executed the first multiply step.  With only 9 steps
             ! instead of 33, the %o4:%y pair ends up holding the product 24
             ! bits further left than in the long case; the shifts at the end
             ! move it back into place.
             !
 218         mulscc  %o4, %o1, %o4           ! second iteration of 9
 219         mulscc  %o4, %o1, %o4
 220         mulscc  %o4, %o1, %o4
 221         mulscc  %o4, %o1, %o4
 222         mulscc  %o4, %o1, %o4
 223         mulscc  %o4, %o1, %o4
 224         mulscc  %o4, %o1, %o4           ! 8th iteration
 225         mulscc  %o4, %g0, %o4           ! last iteration only shifts
 226
 227         rd      %y, %o5                 ! low-order product bits
 228         !
 229         ! The following code adds (2**32) * %o0 to the product if the
 230         ! multiplicand had its high bit set (see 32-bit case for explanation)
 231         !
 232         tst     %o1                     ! multiplicand high bit set?
 233         bge     3f                      ! no: skip the correction
 234         sra     %o4, 24, %o1            ! right shift high bits by 24 bits
             !                                 (delay slot: runs on both paths)
 235
 236         add     %o1, %o0, %o1           ! high word += multiplier
 237         !
 238         ! The multiply hasn't overflowed if high-order bits are 0
 239         !
 240         ! if you are not interested in detecting overflow,
 241         ! replace the following code with:
 242         !
 243         !       3:
 244         !               sll     %o4, 8, %o0
 245         !               srl     %o5, 24, %o5
 246         !               retl
 247         !               or      %o5, %o0, %o0
 248         !
 249 3:
 250         sll     %o4, 8, %o0             ! left shift middle bits by 8 bits
 251         srl     %o5, 24, %o5            ! right shift low bits by 24 bits
 252         or      %o5, %o0, %o0           ! merge for true product
 253         retl                            ! leaf routine return
 254         tst     %o1                     ! set Z if high order bits are 0
 255         !
 256         ! 12-bit multiply
 257         !
 258 umul_12bit:
             !
             ! Reached through the annulled "be,a" in .umul, whose delay slot
             ! has already executed the first multiply step.  With only 13
             ! steps instead of 33, the %o4:%y pair ends up holding the product
             ! 20 bits further left than in the long case; the shifts at the
             ! end move it back into place.
             !
 259         mulscc  %o4, %o1, %o4           ! second iteration of 13
 260         mulscc  %o4, %o1, %o4
 261         mulscc  %o4, %o1, %o4
 262         mulscc  %o4, %o1, %o4
 263         mulscc  %o4, %o1, %o4
 264         mulscc  %o4, %o1, %o4
 265         mulscc  %o4, %o1, %o4
 266         mulscc  %o4, %o1, %o4
 267         mulscc  %o4, %o1, %o4
 268         mulscc  %o4, %o1, %o4
 269         mulscc  %o4, %o1, %o4           ! 12th iteration
 270         mulscc  %o4, %g0, %o4           ! last iteration only shifts
 271
 272         rd      %y, %o5                 ! low-order product bits
 273         !
 274         ! The following code adds (2**32) * %o0 to the product if the
 275         ! multiplicand had its high bit set (see 32-bit case for explanation)
 276         !
 277         tst     %o1                     ! multiplicand high bit set?
 278         bge     4f                      ! no: skip the correction
 279         sra     %o4, 20, %o1            ! right shift high bits by 20 bits
             !                                 (delay slot: runs on both paths)
 280
 281         add     %o1, %o0, %o1           ! high word += multiplier
 282         !
 283         ! The multiply hasn't overflowed if high-order bits are 0
 284         !
 285         ! if you are not interested in detecting overflow,
 286         ! replace the following code with:
 287         !
 288         !       4:
 289         !               sll     %o4, 12, %o0
 290         !               srl     %o5, 20, %o5
 291         !               retl
 292         !               or      %o5, %o0, %o0
 293         !
 294 4:
 295         sll     %o4, 12, %o0            ! left shift middle bits by 12 bits
 296         srl     %o5, 20, %o5            ! right shift low bits by 20 bits
 297         or      %o5, %o0, %o0           ! merge for true product
 298         retl                            ! leaf routine return
 299         tst     %o1                     ! set Z if high order bits are 0
 300         !
 301         ! 16-bit multiply
 302         !
 303 umul_16bit:
             !
             ! Reached through the annulled "be,a" in .umul, whose delay slot
             ! has already executed the first multiply step.  With only 17
             ! steps instead of 33, the %o4:%y pair ends up holding the product
             ! 16 bits further left than in the long case; the shifts at the
             ! end move it back into place.
             !
 304         mulscc  %o4, %o1, %o4           ! second iteration of 17
 305         mulscc  %o4, %o1, %o4
 306         mulscc  %o4, %o1, %o4
 307         mulscc  %o4, %o1, %o4
 308         mulscc  %o4, %o1, %o4
 309         mulscc  %o4, %o1, %o4
 310         mulscc  %o4, %o1, %o4
 311         mulscc  %o4, %o1, %o4
 312         mulscc  %o4, %o1, %o4
 313         mulscc  %o4, %o1, %o4
 314         mulscc  %o4, %o1, %o4
 315         mulscc  %o4, %o1, %o4
 316         mulscc  %o4, %o1, %o4
 317         mulscc  %o4, %o1, %o4
 318         mulscc  %o4, %o1, %o4           ! 16th iteration
 319         mulscc  %o4, %g0, %o4           ! last iteration only shifts
 320
 321         rd      %y, %o5                 ! low-order product bits
 322         !
 323         ! The following code adds (2**32) * %o0 to the product if the
 324         ! multiplicand had its high bit set (see 32-bit case for explanation)
 325         !
 326         tst     %o1                     ! multiplicand high bit set?
 327         bge     5f                      ! no: skip the correction
 328         sra     %o4, 16, %o1            ! right shift high bits by 16 bits
             !                                 (delay slot: runs on both paths)
 329
 330         add     %o1, %o0, %o1           ! high word += multiplier
 331         !
 332         ! The multiply hasn't overflowed if high-order bits are 0
 333         !
 334         ! if you are not interested in detecting overflow,
 335         ! replace the following code with:
 336         !
 337         !       5:
 338         !               sll     %o4, 16, %o0
 339         !               srl     %o5, 16, %o5
 340         !               retl
 341         !               or      %o5, %o0, %o0
 342         !
 343 5:
 344         sll     %o4, 16, %o0            ! left shift middle bits by 16 bits
 345         srl     %o5, 16, %o5            ! right shift low bits by 16 bits
 346         or      %o5, %o0, %o0           ! merge for true product
 347         retl                            ! leaf routine return
 348         tst     %o1                     ! set Z if high order bits are 0