1 ;! HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store
2 ;! the result in a second limb vector.
4 ;! Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc.
6 ;! This file is part of the GNU MP Library.
8 ;! The GNU MP Library is free software; you can redistribute it and/or modify
9 ;! it under the terms of the GNU Lesser General Public License as published by
10 ;! the Free Software Foundation; either version 2.1 of the License, or (at your
11 ;! option) any later version.
13 ;! The GNU MP Library is distributed in the hope that it will be useful, but
14 ;! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 ;! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 ;! License for more details.
18 ;! You should have received a copy of the GNU Lesser General Public License
19 ;! along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20 ;! the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21 ;! MA 02111-1307, USA.
30 ;! This runs at 9 cycles/limb on a PA7000. With the used instructions, it can
31 ;! not become faster due to data cache contention after a store. On the
32 ;! PA7100 it runs at 7 cycles/limb, and that can not be improved either, since
33 ;! only the xmpyu does not need the integer pipeline, so the only dual-issue
34 ;! we will get are addc+xmpyu. Unrolling would not help either CPU.
36 ;! We could use fldds to read two limbs at a time from the S1 array, and that
37 ;! could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
38 ;! PA7100, respectively. We don't do that since it does not seem worth the
39 ;! (alignment) troubles...
41 ;! At least the PA7100 is rumored to be able to deal with cache-misses
42 ;! without stalling instruction issue. If this is true, and the cache is
43 ;! actually also lockup-free, we should use a deeper software pipeline, and
44 ;! load from S1 very early; (The loads and stores to -12(sp) will surely be
51 .callinfo frame=64,no_calls
56 stw %r23,-16(%r30) ;! move s2_limb ...
57 addib,= -1,%r24,L$just_one_limb
58 fldws -16(%r30),%fr4 ;! ... into fr4
59 add %r0,%r0,%r0 ;! clear carry
64 ldw -12(%r30),%r20 ;! least significant limb in product
79 addib,<> -1,%r24,L$loop