8354 sync regcomp(3C) with upstream (fix make catalog)
[unleashed/tickless.git] / usr / src / cmd / sgs / rtld.4.x / umultiply.s
blob2547339a7e71645dee2c1fd25347c5e000ad7734
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
7 * with the License.
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
20 * CDDL HEADER END
23 * .seg "data"
24 * .asciz "Copyr 1987 Sun Micro"
25 * .align 4
27 .seg "text"
29 #ident "%Z%%M% %I% %E% SMI"
31 ! Copyright (c) 1987 by Sun Microsystems, Inc.
34 #include <sys/asm_linkage.h>
37 * procedure to perform a 32 by 32 unsigned integer multiply.
38 * pass the multiplier into %o0, and the multiplicand into %o1
39 * the least significant 32 bits of the result will be returned in %o0,
40 * and the most significant in %o1
42 * Most unsigned integer multiplies involve small numbers, so it is
43 * worthwhile to optimize for short multiplies at the expense of long
44 * multiplies. This code checks the size of the multiplier, and has
45 * special cases for the following:
47 * 4 or fewer bit multipliers: 19 or 21 instruction cycles
48 * 8 or fewer bit multipliers: 26 or 28 instruction cycles
49 * 12 or fewer bit multipliers: 34 or 36 instruction cycles
50 * 16 or fewer bit multipliers: 42 or 44 instruction cycles
52 * Long multipliers require 58 or 60 instruction cycles:
54 * This code indicates that overflow has occurred, by leaving the Z condition
55 * code clear. The following call sequence would be used if you wish to
56 * deal with overflow:
58 * call .umul
59 * nop ( or set up last parameter here )
60 * bnz overflow_code (or tnz to overflow handler)
63 ! RTENTRY(.umul)
64 .global .umul
65 .umul: ! in: %o0 = multiplier, %o1 = multiplicand; out: %o0 = low word, %o1 = high word
66 wr %o0, %y ! multiplier to Y register
68 andncc %o0, 0xf, %o4 ! mask out lower 4 bits; if branch
69 ! taken, %o4, N and V have been cleared
71 be umul_4bit ! 4-bit multiplier
72 sethi %hi(0xffff0000), %o5 ! mask for 16-bit case; have to
73 ! wait 3 instructions after wr
74 ! before %y has stabilized anyway
76 andncc %o0, 0xff, %o4 ! zero result <=> multiplier fits in 8 bits
77 be,a umul_8bit ! 8-bit multiplier
78 mulscc %o4, %o1, %o4 ! first iteration of 9
80 andncc %o0, 0xfff, %o4 ! zero result <=> multiplier fits in 12 bits
81 be,a umul_12bit ! 12-bit multiplier
82 mulscc %o4, %o1, %o4 ! first iteration of 13
84 andcc %o0, %o5, %o4 ! zero result <=> multiplier fits in 16 bits
85 be,a umul_16bit ! 16-bit multiplier
86 mulscc %o4, %o1, %o4 ! first iteration of 17
88 andcc %g0, %g0, %o4 ! zero the partial product
89 ! and clear N and V conditions
91 ! long multiply
93 mulscc %o4, %o1, %o4 ! first iteration of 33
94 mulscc %o4, %o1, %o4
95 mulscc %o4, %o1, %o4
96 mulscc %o4, %o1, %o4
97 mulscc %o4, %o1, %o4
98 mulscc %o4, %o1, %o4
99 mulscc %o4, %o1, %o4
100 mulscc %o4, %o1, %o4
101 mulscc %o4, %o1, %o4
102 mulscc %o4, %o1, %o4
103 mulscc %o4, %o1, %o4
104 mulscc %o4, %o1, %o4
105 mulscc %o4, %o1, %o4
106 mulscc %o4, %o1, %o4
107 mulscc %o4, %o1, %o4
108 mulscc %o4, %o1, %o4
109 mulscc %o4, %o1, %o4
110 mulscc %o4, %o1, %o4
111 mulscc %o4, %o1, %o4
112 mulscc %o4, %o1, %o4
113 mulscc %o4, %o1, %o4
114 mulscc %o4, %o1, %o4
115 mulscc %o4, %o1, %o4
116 mulscc %o4, %o1, %o4
117 mulscc %o4, %o1, %o4
118 mulscc %o4, %o1, %o4
119 mulscc %o4, %o1, %o4
120 mulscc %o4, %o1, %o4
121 mulscc %o4, %o1, %o4
122 mulscc %o4, %o1, %o4
123 mulscc %o4, %o1, %o4
124 mulscc %o4, %o1, %o4 ! 32nd iteration
125 mulscc %o4, %g0, %o4 ! last iteration only shifts
127 ! For unsigned multiplies, a pure shift-and-add approach yields the
128 ! correct result. Signed multiplies introduce complications.
130 ! With 32-bit twos-complement numbers, -x can be represented as
132 ! ((2 - (x/(2**32))) mod 2) * 2**32.
134 ! To simplify the equations, the radix point can be moved to just
135 ! to the left of the sign bit. So:
137 ! x * y = (xy) mod 2
138 ! -x * y = (2 - x) mod 2 * y = (2y - xy) mod 2
139 ! x * -y = x * (2 - y) mod 2 = (2x - xy) mod 2
140 ! -x * -y = (2 - x) * (2 - y) = (4 - 2x - 2y + xy) mod 2
142 ! Because of the way the shift into the partial product is calculated
143 ! (N xor V), the extra term is automagically removed for negative
144 ! multiplicands, so no adjustment is necessary.
146 ! But for unsigned multiplies, the high-order bit of the multiplicand
147 ! is incorrectly treated as a sign bit. For unsigned multiplies where
148 ! the high-order bit of the multiplicand is one, the result is
150 ! xy - y * (2**32)
152 ! we fix that here
154 tst %o1 ! N set <=> multiplicand has its high bit set
155 bge 1f ! high bit clear: no correction needed
156 nop ! delay slot; the add below must run only when the branch is not taken
158 add %o4, %o0, %o4 ! add (2**32) * %o0; bits 63-32
159 ! of the product are in %o4
161 ! The multiply hasn't overflowed if the high-order bits are 0
163 ! if you are not interested in detecting overflow,
164 ! replace the following code with:
166 ! 1:
167 ! rd %y, %o0
168 ! retl
169 ! mov %o4, %o1
171 1:
172 rd %y, %o0 ! low 32 bits of the product
173 retl ! leaf routine return
174 addcc %o4, %g0, %o1 ! return high-order bits and set Z if
175 ! high order bits are 0
177 ! 4-bit multiply
179 umul_4bit:
180 mulscc %o4, %o1, %o4 ! first iteration of 5
181 mulscc %o4, %o1, %o4
182 mulscc %o4, %o1, %o4
183 mulscc %o4, %o1, %o4 ! 4th iteration
184 mulscc %o4, %g0, %o4 ! last iteration only shifts
186 rd %y, %o5 ! low product bits are in the top 4 bits of %y
188 ! The following code adds (2**32) * %o0 to the product if the
189 ! multiplicand had its high bit set (see 32-bit case for explanation)
191 tst %o1 ! N set <=> multiplicand has its high bit set
192 bge 2f ! high bit clear: skip the correction
193 sra %o4, 28, %o1 ! right shift high bits by 28 bits (delay slot: runs on both paths)
195 add %o1, %o0, %o1 ! must read %o0 before the sll below overwrites it
197 ! The multiply hasn't overflowed if high-order bits are 0
199 ! if you are not interested in detecting overflow,
200 ! replace the following code with:
202 ! 2:
203 ! sll %o4, 4, %o0
204 ! srl %o5, 28, %o5
205 ! retl
206 ! or %o5, %o0, %o0
208 2:
209 sll %o4, 4, %o0 ! left shift middle bits by 4 bits
210 srl %o5, 28, %o5 ! right shift low bits by 28 bits
211 or %o5, %o0, %o0 ! merge for true product
212 retl ! leaf routine return
213 tst %o1 ! set Z if high order bits are 0
215 ! 8-bit multiply
217 umul_8bit: ! entered with the first mulscc already done in the caller's annulled delay slot
218 mulscc %o4, %o1, %o4 ! second iteration of 9
219 mulscc %o4, %o1, %o4
220 mulscc %o4, %o1, %o4
221 mulscc %o4, %o1, %o4
222 mulscc %o4, %o1, %o4
223 mulscc %o4, %o1, %o4
224 mulscc %o4, %o1, %o4 ! 8th iteration
225 mulscc %o4, %g0, %o4 ! last iteration only shifts
227 rd %y, %o5 ! low product bits are in the top 8 bits of %y
229 ! The following code adds (2**32) * %o0 to the product if the
230 ! multiplicand had its high bit set (see 32-bit case for explanation)
232 tst %o1 ! N set <=> multiplicand has its high bit set
233 bge 3f ! high bit clear: skip the correction
234 sra %o4, 24, %o1 ! right shift high bits by 24 bits (delay slot: runs on both paths)
236 add %o1, %o0, %o1 ! must read %o0 before the sll below overwrites it
238 ! The multiply hasn't overflowed if high-order bits are 0
240 ! if you are not interested in detecting overflow,
241 ! replace the following code with:
243 ! 3:
244 ! sll %o4, 8, %o0
245 ! srl %o5, 24, %o5
246 ! retl
247 ! or %o5, %o0, %o0
249 3:
250 sll %o4, 8, %o0 ! left shift middle bits by 8 bits
251 srl %o5, 24, %o5 ! right shift low bits by 24 bits
252 or %o5, %o0, %o0 ! merge for true product
253 retl ! leaf routine return
254 tst %o1 ! set Z if high order bits are 0
256 ! 12-bit multiply
258 umul_12bit: ! entered with the first mulscc already done in the caller's annulled delay slot
259 mulscc %o4, %o1, %o4 ! second iteration of 13
260 mulscc %o4, %o1, %o4
261 mulscc %o4, %o1, %o4
262 mulscc %o4, %o1, %o4
263 mulscc %o4, %o1, %o4
264 mulscc %o4, %o1, %o4
265 mulscc %o4, %o1, %o4
266 mulscc %o4, %o1, %o4
267 mulscc %o4, %o1, %o4
268 mulscc %o4, %o1, %o4
269 mulscc %o4, %o1, %o4 ! 12th iteration
270 mulscc %o4, %g0, %o4 ! last iteration only shifts
272 rd %y, %o5 ! low product bits are in the top 12 bits of %y
274 ! The following code adds (2**32) * %o0 to the product if the
275 ! multiplicand had its high bit set (see 32-bit case for explanation)
277 tst %o1 ! N set <=> multiplicand has its high bit set
278 bge 4f ! high bit clear: skip the correction
279 sra %o4, 20, %o1 ! right shift high bits by 20 bits (delay slot: runs on both paths)
281 add %o1, %o0, %o1 ! must read %o0 before the sll below overwrites it
283 ! The multiply hasn't overflowed if high-order bits are 0
285 ! if you are not interested in detecting overflow,
286 ! replace the following code with:
288 ! 4:
289 ! sll %o4, 12, %o0
290 ! srl %o5, 20, %o5
291 ! retl
292 ! or %o5, %o0, %o0
294 4:
295 sll %o4, 12, %o0 ! left shift middle bits by 12 bits
296 srl %o5, 20, %o5 ! right shift low bits by 20 bits
297 or %o5, %o0, %o0 ! merge for true product
298 retl ! leaf routine return
299 tst %o1 ! set Z if high order bits are 0
301 ! 16-bit multiply
303 umul_16bit: ! entered with the first mulscc already done in the caller's annulled delay slot
304 mulscc %o4, %o1, %o4 ! second iteration of 17
305 mulscc %o4, %o1, %o4
306 mulscc %o4, %o1, %o4
307 mulscc %o4, %o1, %o4
308 mulscc %o4, %o1, %o4
309 mulscc %o4, %o1, %o4
310 mulscc %o4, %o1, %o4
311 mulscc %o4, %o1, %o4
312 mulscc %o4, %o1, %o4
313 mulscc %o4, %o1, %o4
314 mulscc %o4, %o1, %o4
315 mulscc %o4, %o1, %o4
316 mulscc %o4, %o1, %o4
317 mulscc %o4, %o1, %o4
318 mulscc %o4, %o1, %o4 ! 16th iteration
319 mulscc %o4, %g0, %o4 ! last iteration only shifts
321 rd %y, %o5 ! low product bits are in the top 16 bits of %y
323 ! The following code adds (2**32) * %o0 to the product if the
324 ! multiplicand had its high bit set (see 32-bit case for explanation)
326 tst %o1 ! N set <=> multiplicand has its high bit set
327 bge 5f ! high bit clear: skip the correction
328 sra %o4, 16, %o1 ! right shift high bits by 16 bits (delay slot: runs on both paths)
330 add %o1, %o0, %o1 ! must read %o0 before the sll below overwrites it
332 ! The multiply hasn't overflowed if high-order bits are 0
334 ! if you are not interested in detecting overflow,
335 ! replace the following code with:
337 ! 5:
338 ! sll %o4, 16, %o0
339 ! srl %o5, 16, %o5
340 ! retl
341 ! or %o5, %o0, %o0
343 5:
344 sll %o4, 16, %o0 ! left shift middle bits by 16 bits
345 srl %o5, 16, %o5 ! right shift low bits by 16 bits
346 or %o5, %o0, %o0 ! merge for true product
347 retl ! leaf routine return
348 tst %o1 ! set Z if high order bits are 0