@ third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S
#if defined(__arm__)
#include <openssl/arm_arch.h>
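
@ Annotation (editorial; not part of the CRYPTOGAMS-generated source):
@ bn_mul_mont below implements word-serial Montgomery multiplication,
@
@	rp[] = ap[] * bp[] * R^-1 mod np[],   R = 2^(32*num),
@
@ where n0 = -np[0]^-1 mod 2^32 is the precomputed Montgomery constant.
@ Judging from the loads at the top of the function, arguments follow
@ the usual OpenSSL convention: rp, ap, bp, np in r0-r3, with &n0 and
@ num passed on the stack.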

.text
.code	32

#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lbn_mul_mont
#endif

.globl	bn_mul_mont
.hidden	bn_mul_mont
.type	bn_mul_mont,%function

.align	5
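@ Annotation: the prologue spills {r0,r2} (rp and bp) so they can be
@ reloaded after the CPU-capability probe.  When built with
@ __ARM_MAX_ARCH__>=7, a num that is a multiple of 8 together with the
@ NEON bit of OPENSSL_armcap_P diverts the call to bn_mul8x_mont_neon.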
bn_mul_mont:
.Lbn_mul_mont:
	ldr	ip,[sp,#4]		@ load num
	stmdb	sp!,{r0,r2}		@ sp points at argument block
#if __ARM_MAX_ARCH__>=7
	tst	ip,#7
	bne	.Lialu
	adr	r0,bn_mul_mont
	ldr	r2,.LOPENSSL_armcap
	ldr	r0,[r0,r2]
#ifdef	__APPLE__
	ldr	r0,[r0]
#endif
	tst	r0,#1			@ NEON available?
	ldmia	sp, {r0,r2}
	beq	.Lialu
	add	sp,sp,#8
	b	bn_mul8x_mont_neon
.align	4
.Lialu:
#endif
	cmp	ip,#2
	mov	r0,ip			@ load num
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers

	mov	r0,r0,lsl#2		@ rescale r0 for byte count
	sub	sp,sp,r0		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	r0,r0,#4		@ "num=num-1"
	add	r4,r2,r0		@ &bp[num-1]

	add	r0,sp,r0		@ r0 to point at &tp[num-1]
	ldr	r8,[r0,#14*4]		@ &n0
	ldr	r2,[r2]			@ bp[0]
	ldr	r5,[r1],#4		@ ap[0],ap++
	ldr	r6,[r3],#4		@ np[0],np++
	ldr	r8,[r8]			@ *n0
	str	r4,[r0,#15*4]		@ save &bp[num]

	umull	r10,r11,r5,r2		@ ap[0]*bp[0]
	str	r8,[r0,#14*4]		@ save n0 value
	mul	r8,r10,r8		@ "tp[0]"*n0
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"t[0]"
	mov	r4,sp
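
@ Annotation: the first pass computes tp[] = ap[]*bp[0] + m*np[], where
@ m (r8 below) = tp[0]*n0 mod 2^32 is chosen so that the lowest word of
@ the sum is zero and can be dropped -- the Montgomery reduction step.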

.L1st:
	ldr	r5,[r1],#4		@ ap[j],ap++
	mov	r10,r11
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[0]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.L1st

	adds	r12,r12,r11
	ldr	r4,[r0,#13*4]		@ restore bp
	mov	r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=
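
@ Annotation: each outer iteration folds the next multiplier word bp[i]
@ into tp[] and performs one more reduction step, so tp[] stays at
@ num+1 words throughout.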

.Louter:
	sub	r7,r0,sp		@ "original" r0-1 value
	sub	r1,r1,r7		@ "rewind" ap to &ap[1]
	ldr	r2,[r4,#4]!		@ *(++bp)
	sub	r3,r3,r7		@ "rewind" np to &np[1]
	ldr	r5,[r1,#-4]		@ ap[0]
	ldr	r10,[sp]		@ tp[0]
	ldr	r6,[r3,#-4]		@ np[0]
	ldr	r7,[sp,#4]		@ tp[1]

	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[0]*bp[i]+tp[0]
	str	r4,[r0,#13*4]		@ save bp
	mul	r8,r10,r8
	mov	r12,#0
	umlal	r10,r12,r6,r8		@ np[0]*n0+"tp[0]"
	mov	r4,sp
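
@ Annotation: same structure as .L1st, with the previous tp[j] (kept one
@ step ahead in r7) accumulated into the running sum.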

.Linner:
	ldr	r5,[r1],#4		@ ap[j],ap++
	adds	r10,r11,r7		@ +=tp[j]
	ldr	r6,[r3],#4		@ np[j],np++
	mov	r11,#0
	umlal	r10,r11,r5,r2		@ ap[j]*bp[i]
	mov	r14,#0
	umlal	r12,r14,r6,r8		@ np[j]*n0
	adc	r11,r11,#0
	ldr	r7,[r4,#8]		@ tp[j+1]
	adds	r12,r12,r10
	str	r12,[r4],#4		@ tp[j-1]=,tp++
	adc	r12,r14,#0
	cmp	r4,r0
	bne	.Linner

	adds	r12,r12,r11
	mov	r14,#0
	ldr	r4,[r0,#13*4]		@ restore bp
	adc	r14,r14,#0
	ldr	r8,[r0,#14*4]		@ restore n0
	adds	r12,r12,r7
	ldr	r7,[r0,#15*4]		@ restore &bp[num]
	adc	r14,r14,#0
	str	r12,[r0]		@ tp[num-1]=
	str	r14,[r0,#4]		@ tp[num]=

	cmp	r4,r7
	bne	.Louter
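
@ Annotation: after the last outer pass the result in tp[] is below
@ 2*np[], so one conditional subtraction of np[] brings it into
@ [0, np).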

	ldr	r2,[r0,#12*4]		@ pull rp
	add	r0,r0,#4		@ r0 to point at &tp[num]
	sub	r5,r0,sp		@ "original" num value
	mov	r4,sp			@ "rewind" r4
	mov	r1,r4			@ "borrow" r1
	sub	r3,r3,r5		@ "rewind" r3 to &np[0]

	subs	r7,r7,r7		@ "clear" carry flag
.Lsub:	ldr	r7,[r4],#4
	ldr	r6,[r3],#4
	sbcs	r7,r7,r6		@ tp[j]-np[j]
	str	r7,[r2],#4		@ rp[j]=
	teq	r4,r0			@ preserve carry
	bne	.Lsub
	sbcs	r14,r14,#0		@ upmost carry
	mov	r4,sp			@ "rewind" r4
	sub	r2,r2,r5		@ "rewind" r2

	and	r1,r4,r14
	bic	r3,r2,r14
	orr	r1,r1,r3		@ ap=borrow?tp:rp
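
@ Annotation: r14 is an all-ones mask when the subtraction borrowed
@ (tp < np) and zero otherwise; the and/bic/orr sequence picks tp or rp
@ without a branch, and the copy loop below also wipes the tp scratch
@ area on the stack.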

.Lcopy:	ldr	r7,[r1],#4		@ copy or in-place refresh
	str	sp,[r4],#4		@ zap tp
	str	r7,[r2],#4
	cmp	r4,r0
	bne	.Lcopy

	add	sp,r0,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:
#if __ARM_ARCH__>=5
	bx	lr			@ .word 0xe12fff1e
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	bn_mul_mont,.-bn_mul_mont
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon
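
@ Annotation: bn_mul8x_mont_neon handles num that is a multiple of 8.
@ The multiplier word (and the derived reduction factor) is widened to
@ 16-bit digits via vzip.16 with a zeroed register, so each
@ vmull/vmlal.u32 lane accumulates 16x32-bit partial products in a
@ 64-bit lane without overflowing; carries are resolved 16 bits at a
@ time in the .LNEON_tail code.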

.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	mov	ip,sp
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}		@ ABI specification says so
	ldmia	ip,{r4,r5}		@ load rest of parameter block

	sub	r7,sp,#16
	vld1.32	{d28[0]}, [r2,:32]!
	sub	r7,r7,r5,lsl#4
	vld1.32	{d0,d1,d2,d3}, [r1]!		@ can't specify :32 :-(
	and	r7,r7,#-64
	vld1.32	{d30[0]}, [r4,:32]
	mov	sp,r7			@ alloca
	veor	d8,d8,d8
	subs	r8,r5,#8
	vzip.16	d28,d8

	vmull.u32	q6,d28,d0[0]
	vmull.u32	q7,d28,d0[1]
	vmull.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmull.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor	d8,d8,d8
	vmul.u32	d29,d10,d30

	vmull.u32	q10,d28,d2[0]
	vld1.32	{d4,d5,d6,d7}, [r3]!
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vzip.16	d29,d8
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st

	@ special case for num=8, everything is in register bank...

	vmlal.u32	q6,d29,d4[0]
	sub	r9,r5,#1
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov	q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov	q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov	q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov	q8,q9
	vmov	q9,q10
	vshr.u64	d10,d10,#16
	vmov	q10,q11
	vmov	q11,q12
	vadd.u64	d10,d10,d11
	vmov	q12,q13
	veor	q13,q13
	vshr.u64	d10,d10,#16

	b	.LNEON_outer8

.align	4
.LNEON_outer8:
	vld1.32	{d28[0]}, [r2,:32]!
	veor	d8,d8,d8
	vzip.16	d28,d8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	vshl.i64	d10,d13,#16
	vmlal.u32	q9,d28,d1[1]

	vadd.u64	d10,d10,d12
	veor	d8,d8,d8
	subs	r9,r9,#1
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16	d29,d8
	vmlal.u32	q13,d28,d3[1]

	vmlal.u32	q6,d29,d4[0]
	vmlal.u32	q7,d29,d4[1]
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vmov	q5,q6
	vmlal.u32	q11,d29,d6[1]
	vmov	q6,q7
	vmlal.u32	q12,d29,d7[0]
	vmov	q7,q8
	vmlal.u32	q13,d29,d7[1]
	vmov	q8,q9
	vmov	q9,q10
	vshr.u64	d10,d10,#16
	vmov	q10,q11
	vmov	q11,q12
	vadd.u64	d10,d10,d11
	vmov	q12,q13
	veor	q13,q13
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer8

	vadd.u64	d12,d12,d10
	mov	r7,sp
	vshr.u64	d10,d12,#16
	mov	r8,r5
	vadd.u64	d13,d13,d10
	add	r6,sp,#16
	vshr.u64	d10,d13,#16
	vzip.16	d12,d13

	b	.LNEON_tail2

.align	4
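@ Annotation: the general first pass; partial 64-bit column sums are
@ spilled to the stack frame through r7 as q6-q13 fill up, eight
@ columns per iteration.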
.LNEON_1st:
	vmlal.u32	q6,d29,d4[0]
	vld1.32	{d0,d1,d2,d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	subs	r8,r8,#8
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]

	vmlal.u32	q10,d29,d6[0]
	vld1.32	{d4,d5}, [r3]!
	vmlal.u32	q11,d29,d6[1]
	vst1.64	{q6,q7}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vmlal.u32	q13,d29,d7[1]
	vst1.64	{q8,q9}, [r7,:256]!

	vmull.u32	q6,d28,d0[0]
	vld1.32	{d6,d7}, [r3]!
	vmull.u32	q7,d28,d0[1]
	vst1.64	{q10,q11}, [r7,:256]!
	vmull.u32	q8,d28,d1[0]
	vmull.u32	q9,d28,d1[1]
	vst1.64	{q12,q13}, [r7,:256]!

	vmull.u32	q10,d28,d2[0]
	vmull.u32	q11,d28,d2[1]
	vmull.u32	q12,d28,d3[0]
	vmull.u32	q13,d28,d3[1]

	bne	.LNEON_1st

	vmlal.u32	q6,d29,d4[0]
	add	r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub	r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	vld1.64	{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	sub	r9,r5,#1

	vmlal.u32	q10,d29,d6[0]
	vst1.64	{q6,q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	vshr.u64	d10,d10,#16
	vld1.64	{q6}, [r6,:128]!
	vmlal.u32	q12,d29,d7[0]
	vst1.64	{q8,q9}, [r7,:256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64	{q10,q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	veor	q4,q4,q4
	vst1.64	{q12,q13}, [r7,:256]!
	vld1.64	{q7,q8}, [r6,:256]!
	vst1.64	{q4}, [r7,:128]
	vshr.u64	d10,d10,#16

	b	.LNEON_outer

.align	4
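@ Annotation: each .LNEON_outer pass reloads the previous column sums
@ from the frame through r6 while updated sums are written back
@ through r7, keeping the two pointers one row apart.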
.LNEON_outer:
	vld1.32	{d28[0]}, [r2,:32]!
	sub	r3,r3,r5,lsl#2		@ rewind r3
	vld1.32	{d0,d1,d2,d3}, [r1]!
	veor	d8,d8,d8
	mov	r7,sp
	vzip.16	d28,d8
	sub	r8,r5,#8
	vadd.u64	d12,d12,d10

	vmlal.u32	q6,d28,d0[0]
	vld1.64	{q9,q10},[r6,:256]!
	vmlal.u32	q7,d28,d0[1]
	vmlal.u32	q8,d28,d1[0]
	vld1.64	{q11,q12},[r6,:256]!
	vmlal.u32	q9,d28,d1[1]

	vshl.i64	d10,d13,#16
	veor	d8,d8,d8
	vadd.u64	d10,d10,d12
	vld1.64	{q13},[r6,:128]!
	vmul.u32	d29,d10,d30

	vmlal.u32	q10,d28,d2[0]
	vld1.32	{d4,d5,d6,d7}, [r3]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vzip.16	d29,d8
	vmlal.u32	q13,d28,d3[1]

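@ Annotation: the inner loop accumulates both ap[]*bp[i] (via d28, the
@ widened bp word) and the reduction term m*np[] (via d29, derived from
@ n0) into the running column sums in the same pass.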
.LNEON_inner:
	vmlal.u32	q6,d29,d4[0]
	vld1.32	{d0,d1,d2,d3}, [r1]!
	vmlal.u32	q7,d29,d4[1]
	subs	r8,r8,#8
	vmlal.u32	q8,d29,d5[0]
	vmlal.u32	q9,d29,d5[1]
	vst1.64	{q6,q7}, [r7,:256]!

	vmlal.u32	q10,d29,d6[0]
	vld1.64	{q6}, [r6,:128]!
	vmlal.u32	q11,d29,d6[1]
	vst1.64	{q8,q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vld1.64	{q7,q8}, [r6,:256]!
	vmlal.u32	q13,d29,d7[1]
	vst1.64	{q10,q11}, [r7,:256]!

	vmlal.u32	q6,d28,d0[0]
	vld1.64	{q9,q10}, [r6,:256]!
	vmlal.u32	q7,d28,d0[1]
	vst1.64	{q12,q13}, [r7,:256]!
	vmlal.u32	q8,d28,d1[0]
	vld1.64	{q11,q12}, [r6,:256]!
	vmlal.u32	q9,d28,d1[1]
	vld1.32	{d4,d5,d6,d7}, [r3]!

	vmlal.u32	q10,d28,d2[0]
	vld1.64	{q13}, [r6,:128]!
	vmlal.u32	q11,d28,d2[1]
	vmlal.u32	q12,d28,d3[0]
	vmlal.u32	q13,d28,d3[1]

	bne	.LNEON_inner

	vmlal.u32	q6,d29,d4[0]
	add	r6,sp,#16
	vmlal.u32	q7,d29,d4[1]
	sub	r1,r1,r5,lsl#2		@ rewind r1
	vmlal.u32	q8,d29,d5[0]
	vld1.64	{q5}, [sp,:128]
	vmlal.u32	q9,d29,d5[1]
	subs	r9,r9,#1

	vmlal.u32	q10,d29,d6[0]
	vst1.64	{q6,q7}, [r7,:256]!
	vmlal.u32	q11,d29,d6[1]
	vld1.64	{q6}, [r6,:128]!
	vshr.u64	d10,d10,#16
	vst1.64	{q8,q9}, [r7,:256]!
	vmlal.u32	q12,d29,d7[0]
	vld1.64	{q7,q8}, [r6,:256]!
	vmlal.u32	q13,d29,d7[1]

	vst1.64	{q10,q11}, [r7,:256]!
	vadd.u64	d10,d10,d11
	vst1.64	{q12,q13}, [r7,:256]!
	vshr.u64	d10,d10,#16

	bne	.LNEON_outer

	mov	r7,sp
	mov	r8,r5

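@ Annotation: each 64-bit lane now holds a column sum in 16-bit-digit
@ radix; the tail ripples carries by repeatedly adding the previous
@ carry, shifting right by 16, and vzip-ing digit pairs back into
@ 32-bit result words stored through r7.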
.LNEON_tail:
	vadd.u64	d12,d12,d10
	vld1.64	{q9,q10}, [r6,:256]!
	vshr.u64	d10,d12,#16
	vadd.u64	d13,d13,d10
	vld1.64	{q11,q12}, [r6,:256]!
	vshr.u64	d10,d13,#16
	vld1.64	{q13}, [r6,:128]!
	vzip.16	d12,d13

.LNEON_tail2:
	vadd.u64	d14,d14,d10
	vst1.32	{d12[0]}, [r7,:32]!
	vshr.u64	d10,d14,#16
	vadd.u64	d15,d15,d10
	vshr.u64	d10,d15,#16
	vzip.16	d14,d15

	vadd.u64	d16,d16,d10
	vst1.32	{d14[0]}, [r7,:32]!
	vshr.u64	d10,d16,#16
	vadd.u64	d17,d17,d10
	vshr.u64	d10,d17,#16
	vzip.16	d16,d17

	vadd.u64	d18,d18,d10
	vst1.32	{d16[0]}, [r7,:32]!
	vshr.u64	d10,d18,#16
	vadd.u64	d19,d19,d10
	vshr.u64	d10,d19,#16
	vzip.16	d18,d19

	vadd.u64	d20,d20,d10
	vst1.32	{d18[0]}, [r7,:32]!
	vshr.u64	d10,d20,#16
	vadd.u64	d21,d21,d10
	vshr.u64	d10,d21,#16
	vzip.16	d20,d21

	vadd.u64	d22,d22,d10
	vst1.32	{d20[0]}, [r7,:32]!
	vshr.u64	d10,d22,#16
	vadd.u64	d23,d23,d10
	vshr.u64	d10,d23,#16
	vzip.16	d22,d23

	vadd.u64	d24,d24,d10
	vst1.32	{d22[0]}, [r7,:32]!
	vshr.u64	d10,d24,#16
	vadd.u64	d25,d25,d10
	vld1.64	{q6}, [r6,:128]!
	vshr.u64	d10,d25,#16
	vzip.16	d24,d25

	vadd.u64	d26,d26,d10
	vst1.32	{d24[0]}, [r7,:32]!
	vshr.u64	d10,d26,#16
	vadd.u64	d27,d27,d10
	vld1.64	{q7,q8}, [r6,:256]!
	vshr.u64	d10,d27,#16
	vzip.16	d26,d27
	subs	r8,r8,#8
	vst1.32	{d26[0]}, [r7,:32]!

	bne	.LNEON_tail

	vst1.32	{d10[0]}, [r7,:32]		@ top-most bit
	sub	r3,r3,r5,lsl#2			@ rewind r3
	subs	r1,sp,#0			@ clear carry flag
	add	r2,sp,r5,lsl#2

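@ Annotation: scalar epilogue, mirroring the integer path -- subtract
@ np[] from the result in the frame, four words at a time, with r2
@ marking the end of the num-word region.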
.LNEON_sub:
	ldmia	r1!, {r4,r5,r6,r7}
	ldmia	r3!, {r8,r9,r10,r11}
	sbcs	r8, r4,r8
	sbcs	r9, r5,r9
	sbcs	r10,r6,r10
	sbcs	r11,r7,r11
	teq	r1,r2				@ preserves carry
	stmia	r0!, {r8,r9,r10,r11}
	bne	.LNEON_sub

	ldr	r10, [r1]			@ load top-most bit
	veor	q0,q0,q0
	sub	r11,r2,sp			@ this is num*4
	veor	q1,q1,q1
	mov	r1,sp
	sub	r0,r0,r11			@ rewind r0
	mov	r3,r2				@ second 3/4th of frame
	sbcs	r10,r10,#0			@ result is carry flag

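@ Annotation: carry set means the subtraction did not borrow, so the
@ reduced words already written to rp[] are kept (the movcc's do
@ nothing); carry clear selects the unreduced words from the frame
@ instead, and the q0/q1 zeros wipe the frame either way.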
.LNEON_copy_n_zap:
	ldmia	r1!, {r4,r5,r6,r7}
	ldmia	r0,  {r8,r9,r10,r11}
	movcc	r8, r4
	vst1.64	{q0,q1}, [r3,:256]!		@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0,q1}, [r3,:256]!		@ wipe
	movcc	r11,r7
	ldmia	r1, {r4,r5,r6,r7}
	stmia	r0!, {r8,r9,r10,r11}
	sub	r1,r1,#16
	ldmia	r0, {r8,r9,r10,r11}
	movcc	r8, r4
	vst1.64	{q0,q1}, [r1,:256]!		@ wipe
	movcc	r9, r5
	movcc	r10,r6
	vst1.64	{q0,q1}, [r3,:256]!		@ wipe
	movcc	r11,r7
	teq	r1,r2				@ preserves carry
	stmia	r0!, {r8,r9,r10,r11}
	bne	.LNEON_copy_n_zap

	sub	sp,ip,#96
	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr				@ .word 0xe12fff1e
.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
.byte	77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
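@ Annotation: the bytes above spell "Montgomery multiplication for
@ ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.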
.align	2
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
#endif