@ crypto/external/bsd/openssl/lib/libcrypto/arch/arm/armv4-mont.S
#include "arm_arch.h"
#include "arm_asm.h"

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-bn_mul_mont
#endif
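
@ bn_mul_mont is the integer-only Montgomery multiplication entry point.
@ The C prototype is not spelled out in this file; the standard OpenSSL
@ one is assumed here:
@
@	bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
@	            const BN_ULONG *np, const BN_ULONG *n0, int num)
@
@ so that r0=rp, r1=ap, r2=bp, r3=np, [sp]=n0 and [sp,#4]=num on entry.
@ When __ARM_MAX_ARCH__>=7, num is a multiple of 8 and OPENSSL_armcap_P
@ reports NEON, control is handed off to bn_mul8x_mont_neon below.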
.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
bn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	r0,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt
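
@ Frame layout after the prologue, from the final sp upwards: tp[0..num]
@ scratch, the ten saved registers, the {rp,bp} pair pushed on entry, and
@ the caller's n0/num stack arguments above that (some of these slots are
@ reused to park values between passes, see the #13*4..#15*4 offsets).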

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	r0,r0,lsl#2		@ rescale r0 for byte count
	sub	sp,sp,r0		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	r0,r0,#4		@ "num=num-1"
	add	r4,r2,r0		@ &bp[num-1]

	add	r0,sp,r0		@ r0 to point at &tp[num-1]
	ldr	r8,[r0,#14*4]		@ &n0
	ldr	r2,[r2]			@ bp[0]
	ldr	r5,[r1],#4		@ ap[0],ap++
	ldr	r6,[r3],#4		@ np[0],np++
	ldr	r8,[r8]			@ *n0
	str	r4,[r0,#15*4]		@ save &bp[num]

	umull	r10,r11,r5,r2		@ ap[0]*bp[0]
	str	r8,[r0,#14*4]		@ save n0 value
	mul	r8,r10,r8		@ "tp[0]"*n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"t[0]"
	mov	r4,sp
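
@ First pass (i=0): r8 now holds m = lo(tp[0]*n0).  The loop below forms
@ tp[] = ap[]*bp[0] + m*np[]; r10/r11 chain the ap products, r12/r14 the
@ np products, and the low word of their sum is stored to tp[j-1] each
@ round, i.e. the result is kept shifted down by one word as required by
@ Montgomery reduction.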
.L1st:
	ldr	r5,[r1],#4		@ ap[j],ap++
	mov	r10,r11
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[0]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.L1st

	adds	r12,r12,r11
	ldr	r4,[r0,#13*4]		@ restore bp
	mov	r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=
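
@ Outer loop (i=1..num-1): for each further word bp[i], ap/np are rewound,
@ m = lo((tp[0]+ap[0]*bp[i])*n0) is recomputed, and the inner loop adds
@ ap[]*bp[i] + m*np[] into tp[], again storing the result one word down.
@ The current bp pointer, the n0 value and &bp[num] are parked in the
@ argument slots above tp (offsets #13*4..#15*4 from r0).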
.Louter:
	sub	r7,r0,sp		@ "original" r0-1 value
	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
	ldr	r2,[r4,#4]!		@ *(++bp)
	sub	r3,r3,r7		@ "rewind" np to &np[1]
	ldr	r5,[r1,#-4]		@ ap[0]
	ldr	r10,[sp]		@ tp[0]
	ldr	r6,[r3,#-4]		@ np[0]
	ldr	r7,[sp,#4]		@ tp[1]

	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[0]*bp[i]+tp[0]
	str	r4,[r0,#13*4]		@ save bp
	mul	r8,r10,r8
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"tp[0]"
	mov	r4,sp

.Linner:
	ldr	r5,[r1],#4		@ ap[j],ap++
	adds	r10,r11,r7		@ +=tp[j]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[i]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adc	r11,r11,#0
	ldr	r7,[r4,#8]		@ tp[j+1]
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.Linner

	adds	r12,r12,r11
	mov	r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	adc	r14,r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adds	r12,r12,r7
	ldr	r7,[r0,#15*4]		@ restore &bp[num]
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=

	cmp	r4,r7
	bne	.Louter
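
@ All words of bp have been consumed; tp[0..num] holds the Montgomery
@ product with a possible extra top word in tp[num].  Below, np is
@ subtracted once with borrow while writing to rp, and the final borrow
@ selects whether the copy loop reads back tp (underflow) or the freshly
@ written rp; the copy also zaps tp as it goes.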

	ldr	r2,[r0,#12*4]		@ pull rp
	add	r0,r0,#4		@ r0 to point at &tp[num]
	sub	r5,r0,sp		@ "original" num value
	mov	r4,sp			@ "rewind" r4
	mov	r1,r4			@ "borrow" r1
	sub	r3,r3,r5		@ "rewind" r3 to &np[0]

	subs	r7,r7,r7		@ "clear" carry flag
.Lsub:	ldr	r7,[r4],#4
	ldr	r6,[r3],#4
	sbcs	r7,r7,r6		@ tp[j]-np[j]
	str	r7,[r2],#4		@ rp[j]=
	teq	r4,r0			@ preserve carry
	bne	.Lsub
	sbcs	r14,r14,#0		@ upmost carry
	mov	r4,sp			@ "rewind" r4
	sub	r2,r2,r5		@ "rewind" r2

	and	r1,r4,r14
	bic	r3,r2,r14
	orr	r1,r1,r3		@ ap=borrow?tp:rp

.Lcopy:	ldr	r7,[r1],#4		@ copy or in-place refresh
	str	sp,[r4],#4		@ zap tp
	str	r7,[r2],#4
	cmp	r4,r0
	bne	.Lcopy

	add	sp,r0,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	RET				@ .word 0xe12fff1e
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
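
@ bn_mul8x_mont_neon serves the case dispatched to above: NEON present
@ and num a multiple of 8.  As a reading of the code below: each bp[i]
@ word and each reduction factor are widened into two 16-bit digits
@ (vzip.16 against a zeroed register), so the vmull.u32/vmlal.u32
@ products are at most 48 bits wide and can be accumulated in 64-bit
@ lanes without intermediate carry propagation; carries are only
@ resolved in the .LNEON_tail code.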
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4-r11}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load rest of parameter block

	sub		r7,sp,#16
	vld1.32		{d28[0]}, [r2,:32]!
	sub		r7,r7,r5,lsl#4
	vld1.32		{d0-d3},  [r1]!		@ can't specify :32 :-(
	and		r7,r7,#-64
	vld1.32		{d30[0]}, [r4,:32]
	mov		sp,r7			@ alloca
	veor		d8,d8,d8
	subs		r8,r5,#8
	vzip.16		d28,d8

	vmull.u32	q6,d28,d0[0]
	vmull.u32	q7,d28,d0[1]
	vmull.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmull.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor		d8,d8,d8
	vmul.u32	d29,d10,d30

	vmull.u32	q10,d28,d2[0]
	 vld1.32	{d4-d7}, [r3]!
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vzip.16		d29,d8
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	q6,d29,d4[0]
	sub		r9,r5,#1
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov		q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov		q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov		q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov		q8,q9
	vmov		q9,q10
	vshr.u64	d10,d10,#16
	vmov		q10,q11
	vmov		q11,q12
	vadd.u64	d10,d10,d11
	vmov		q12,q13
	veor		q13,q13
	vshr.u64	d10,d10,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32		{d28[0]}, [r2,:32]!
	veor		d8,d8,d8
	vzip.16		d28,d8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmlal.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor		d8,d8,d8
	subs		r9,r9,#1
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16		d29,d8
	vmlal.u32	q13,d28,d3[1]

	vmlal.u32	q6,d29,d4[0]
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov		q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov		q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov		q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov		q8,q9
	vmov		q9,q10
	vshr.u64	d10,d10,#16
	vmov		q10,q11
	vmov		q11,q12
	vadd.u64	d10,d10,d11
	vmov		q12,q13
	veor		q13,q13
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer8

	vadd.u64	d12,d12,d10
	mov		r7,sp
	vshr.u64	d10,d12,#16
	mov		r8,r5
	vadd.u64	d13,d13,d10
	add		r6,sp,#16
	vshr.u64	d10,d13,#16
	vzip.16		d12,d13

	b	.LNEON_tail2
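
@ General case (num > 8): .LNEON_1st makes the first pass over ap/np for
@ bp[0], streaming the 64-bit accumulators to the scratch area allocated
@ above (r7 is the store pointer, r8 counts down eight words at a time).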

.align	4
.LNEON_1st:
	vmlal.u32	q6,d29,d4[0]
	 vld1.32	{d0-d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	subs		r8,r8,#8
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	 vld1.32	{d4-d5}, [r3]!
	vmlal.u32	q11,d29,d6[1]
	 vst1.64	{q6-q7}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vmlal.u32	q13,d29,d7[1]
	 vst1.64	{q8-q9}, [r7,:256]!

	vmull.u32	q6,d28,d0[0]
	 vld1.32	{d6-d7}, [r3]!
	vmull.u32	q7,d28,d0[1]
	 vst1.64	{q10-q11}, [r7,:256]!
	vmull.u32	q8,d28,d1[0]
	vmull.u32	q9,d28,d1[1]
	 vst1.64	{q12-q13}, [r7,:256]!

	vmull.u32	q10,d28,d2[0]
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st

	vmlal.u32	q6,d29,d4[0]
	add		r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub		r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	 vld1.64	{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	sub		r9,r5,#1

	vmlal.u32	q10,d29,d6[0]
	vst1.64		{q6-q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	vshr.u64	d10,d10,#16
	 vld1.64	{q6},       [r6, :128]!
	vmlal.u32	q12,d29,d7[0]
	vst1.64		{q8-q9}, [r7,:256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64		{q10-q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	veor		q4,q4,q4
	vst1.64		{q12-q13}, [r7,:256]!
	 vld1.64	{q7-q8}, [r6, :256]!
	vst1.64		{q4},          [r7,:128]
	vshr.u64	d10,d10,#16

	b		.LNEON_outer
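
@ .LNEON_outer/.LNEON_inner repeat the same pattern for bp[1..num-1]:
@ the previous pass's accumulators are reloaded through r6 while the
@ updated ones are written back through r7, eight words per inner
@ iteration, with the reduction factor recomputed from n0 (d30) at the
@ top of each outer iteration; r9 counts the remaining outer passes.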

.align	4
.LNEON_outer:
	vld1.32		{d28[0]}, [r2,:32]!
	sub		r3,r3,r5,lsl#2		@ rewind r3
	vld1.32		{d0-d3},  [r1]!
	veor		d8,d8,d8
	mov		r7,sp
	vzip.16		d28,d8
	sub		r8,r5,#8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	 vld1.64	{q9-q10},[r6,:256]!
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	 vld1.64	{q11-q12},[r6,:256]!
	vmlal.u32	q9,d28,d1[1]

	vshl.i64	d10,d13,#16
	veor		d8,d8,d8
	vadd.u64	d10,d10,d12
	 vld1.64	{q13},[r6,:128]!
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	 vld1.32	{d4-d7}, [r3]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16		d29,d8
	vmlal.u32	q13,d28,d3[1]

.LNEON_inner:
	vmlal.u32	q6,d29,d4[0]
	 vld1.32	{d0-d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	 subs		r8,r8,#8
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]
	vst1.64		{q6-q7}, [r7,:256]!

	vmlal.u32	q10,d29,d6[0]
	 vld1.64	{q6},       [r6, :128]!
	vmlal.u32	q11,d29,d6[1]
	vst1.64		{q8-q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	 vld1.64	{q7-q8}, [r6, :256]!
	vmlal.u32	q13,d29,d7[1]
	vst1.64		{q10-q11}, [r7,:256]!

	vmlal.u32	q6,d28,d0[0]
	 vld1.64	{q9-q10}, [r6, :256]!
	vmlal.u32	q7,d28,d0[1]
	vst1.64		{q12-q13}, [r7,:256]!
	vmlal.u32	q8,d28,d1[0]
	 vld1.64	{q11-q12}, [r6, :256]!
	vmlal.u32	q9,d28,d1[1]
	 vld1.32	{d4-d7}, [r3]!

	vmlal.u32	q10,d28,d2[0]
	 vld1.64	{q13},       [r6, :128]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vmlal.u32	q13,d28,d3[1]

	bne	.LNEON_inner

	vmlal.u32	q6,d29,d4[0]
	add		r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub		r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	 vld1.64	{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	subs		r9,r9,#1

	vmlal.u32	q10,d29,d6[0]
	vst1.64		{q6-q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	 vld1.64	{q6},       [r6, :128]!
	vshr.u64	d10,d10,#16
	vst1.64		{q8-q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	 vld1.64	{q7-q8}, [r6, :256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64		{q10-q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	vst1.64		{q12-q13}, [r7,:256]!
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer
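
@ .LNEON_tail/.LNEON_tail2: the accumulators still hold the product as
@ 16-bit digits spread over 64-bit lanes.  The chain below folds the
@ running carry (d10) into each lane, shifts it on 16 bits at a time,
@ re-packs digit pairs with vzip.16 and stores the recovered 32-bit
@ words, eight of them per trip around .LNEON_tail.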

	mov		r7,sp
	mov		r8,r5

.LNEON_tail:
	vadd.u64	d12,d12,d10
	vld1.64		{q9-q10}, [r6, :256]!
	vshr.u64	d10,d12,#16
	vadd.u64	d13,d13,d10
	vld1.64		{q11-q12}, [r6, :256]!
	vshr.u64	d10,d13,#16
	vld1.64		{q13},       [r6, :128]!
	vzip.16		d12,d13

.LNEON_tail2:
	vadd.u64	d14,d14,d10
	vst1.32		{d12[0]}, [r7, :32]!
	vshr.u64	d10,d14,#16
	vadd.u64	d15,d15,d10
	vshr.u64	d10,d15,#16
	vzip.16		d14,d15

	vadd.u64	d16,d16,d10
	vst1.32		{d14[0]}, [r7, :32]!
	vshr.u64	d10,d16,#16
	vadd.u64	d17,d17,d10
	vshr.u64	d10,d17,#16
	vzip.16		d16,d17

	vadd.u64	d18,d18,d10
	vst1.32		{d16[0]}, [r7, :32]!
	vshr.u64	d10,d18,#16
	vadd.u64	d19,d19,d10
	vshr.u64	d10,d19,#16
	vzip.16		d18,d19

	vadd.u64	d20,d20,d10
	vst1.32		{d18[0]}, [r7, :32]!
	vshr.u64	d10,d20,#16
	vadd.u64	d21,d21,d10
	vshr.u64	d10,d21,#16
	vzip.16		d20,d21

	vadd.u64	d22,d22,d10
	vst1.32		{d20[0]}, [r7, :32]!
	vshr.u64	d10,d22,#16
	vadd.u64	d23,d23,d10
	vshr.u64	d10,d23,#16
	vzip.16		d22,d23

	vadd.u64	d24,d24,d10
	vst1.32		{d22[0]}, [r7, :32]!
	vshr.u64	d10,d24,#16
	vadd.u64	d25,d25,d10
	vld1.64		{q6}, [r6, :128]!
	vshr.u64	d10,d25,#16
	vzip.16		d24,d25

	vadd.u64	d26,d26,d10
	vst1.32		{d24[0]}, [r7, :32]!
	vshr.u64	d10,d26,#16
	vadd.u64	d27,d27,d10
	vld1.64		{q7-q8},	[r6, :256]!
	vshr.u64	d10,d27,#16
	vzip.16		d26,d27
	subs		r8,r8,#8
	vst1.32		{d26[0]}, [r7, :32]!

	bne	.LNEON_tail
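
@ Final reduction: the top-most carry is stored after the converted
@ result, then .LNEON_sub subtracts the modulus from that result four
@ words at a time, writing the difference straight to rp.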

	vst1.32	{d10[0]}, [r7, :32]		@ top-most bit
	sub	r3,r3,r5,lsl#2			@ rewind r3
	subs	r1,sp,#0			@ clear carry flag
	add	r2,sp,r5,lsl#2

.LNEON_sub:
	ldmia	r1!, {r4-r7}
	ldmia	r3!, {r8-r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	r1,r2				@ preserves carry
	stmia	r0!, {r8-r11}
	bne	.LNEON_sub

	ldr	r10, [r1]			@ load top-most bit
	veor	q0,q0,q0
	sub	r11,r2,sp			@ this is num*4
	veor	q1,q1,q1
	mov	r1,sp
	sub	r0,r0,r11			@ rewind r0
	mov	r3,r2				@ second 3/4th of frame
	sbcs	r10,r10,#0			@ result is carry flag
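
@ The borrow from the subtraction, folded with the saved top-most bit,
@ drives the copy: on underflow (carry clear) the unreduced words from
@ the scratch area overwrite rp, otherwise rp is left as subtracted;
@ either way the scratch area is wiped with zeros (q0/q1) as it is read.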

.LNEON_copy_n_zap:
	ldmia	r1!, {r4-r7}
	ldmia	r0,  {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [r3,:256]!		@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [r3,:256]!		@ wipe
	movcc	r11,r7
	ldmia	r1, {r4-r7}
	stmia	r0!, {r8-r11}
	sub	r1,r1,#16
	ldmia	r0, {r8-r11}
	movcc	r8, r4
	vst1.64	{q0-q1}, [r1,:256]!		@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0-q1}, [r3,:256]!		@ wipe
	movcc	r11,r7
	teq	r1,r2				@ preserves carry
	stmia	r0!, {r8-r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r11}
	RET					@ .word 0xe12fff1e
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif