1 #! /usr/bin/env perl
2 # $OpenBSD: ecp_nistz256-armv4.pl,v 1.1 2016/11/04 17:33:19 miod Exp $
4 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
6 # Licensed under the OpenSSL license (the "License"). You may not use
7 # this file except in compliance with the License. You can obtain a copy
8 # in the file LICENSE in the source distribution or at
9 # https://www.openssl.org/source/license.html
12 # ====================================================================
13 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
14 # project. The module is, however, dual licensed under OpenSSL and
15 # CRYPTOGAMS licenses depending on where you obtain it. For further
16 # details see http://www.openssl.org/~appro/cryptogams/.
17 # ====================================================================
19 # ECP_NISTZ256 module for ARMv4.
21 # October 2014.
23 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
24 # http://eprint.iacr.org/2013/816. In the process of adaptation
25 # original .c module was made 32-bit savvy in order to make this
26 # implementation possible.
28 # with/without -DECP_NISTZ256_ASM
29 # Cortex-A8 +53-170%
30 # Cortex-A9 +76-205%
31 # Cortex-A15 +100-316%
32 # Snapdragon S4 +66-187%
34 # Ranges denote minimum and maximum improvement coefficients depending
35 # on benchmark. Lower coefficients are for ECDSA sign, server-side
36 # operation. Keep in mind that +200% means 3x improvement.
38 $flavour = shift;
39 if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
40 else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
42 if ($flavour && $flavour ne "void") {
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
46 die "can't locate arm-xlate.pl";
48 open STDOUT,"| \"$^X\" $xlate $flavour $output";
49 } else {
50 open STDOUT,">$output";
51 }
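# Standalone invocation is, for instance, something like
#	perl ecp_nistz256-armv4.pl linux32 ecp_nistz256-armv4.S
# where a recognized flavour pipes the result through arm-xlate.pl, while
# "void" (or no flavour at all) makes the branch above write the generated
# assembly straight to the named output file.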
53 $code.=<<___;
54 #include "arm_arch.h"
56 .text
57 #if defined(__thumb2__)
58 .syntax unified
59 .thumb
60 #else
61 .code 32
62 #endif
63 ___
65 $code.=<<___;
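@ The plain (non-Montgomery) constant 1; mul_mont computes a*b*2^-256 mod p,
@ so multiplying by this value converts a number out of Montgomery form,
@ which is how ecp_nistz256_from_mont below reuses .Lecp_nistz256_mul_mont.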
66 .Lone:
67 .long 1,0,0,0,0,0,0,0
68 .align 6
69 ___
71 ########################################################################
72 # common register layout, note that $t2 is link register, so that if
73 # internal subroutine uses $t2, then it has to offload lr...
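# (__ecp_nistz256_add below does exactly that: "str lr,[sp,#-4]!" on entry,
# "ldr lr,[sp],#4" before falling through to .Lreduce_by_sub)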
75 ($r_ptr,$a_ptr,$b_ptr,$ff,$a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,$t1,$t2)=
76 map("r$_",(0..12,14));
77 ($t0,$t3)=($ff,$a_ptr);
79 $code.=<<___;
80 @ void ecp_nistz256_from_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
81 .globl ecp_nistz256_from_mont
82 .type ecp_nistz256_from_mont,%function
83 ecp_nistz256_from_mont:
84 adr $b_ptr,.Lone
85 b .Lecp_nistz256_mul_mont
86 .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
88 @ void ecp_nistz256_mul_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
89 .globl ecp_nistz256_mul_by_2
90 .type ecp_nistz256_mul_by_2,%function
91 .align 4
92 ecp_nistz256_mul_by_2:
93 stmdb sp!,{r4-r12,lr}
94 bl __ecp_nistz256_mul_by_2
95 #if __ARM_ARCH__>=5 || !defined(__thumb__)
96 ldmia sp!,{r4-r12,pc}
97 #else
98 ldmia sp!,{r4-r12,lr}
99 bx lr @ interoperable with Thumb ISA:-)
100 #endif
101 .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
103 .type __ecp_nistz256_mul_by_2,%function
104 .align 4
105 __ecp_nistz256_mul_by_2:
106 ldr $a0,[$a_ptr,#0]
107 ldr $a1,[$a_ptr,#4]
108 ldr $a2,[$a_ptr,#8]
109 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7], i.e. add with itself
110 ldr $a3,[$a_ptr,#12]
111 adcs $a1,$a1,$a1
112 ldr $a4,[$a_ptr,#16]
113 adcs $a2,$a2,$a2
114 ldr $a5,[$a_ptr,#20]
115 adcs $a3,$a3,$a3
116 ldr $a6,[$a_ptr,#24]
117 adcs $a4,$a4,$a4
118 ldr $a7,[$a_ptr,#28]
119 adcs $a5,$a5,$a5
120 adcs $a6,$a6,$a6
121 mov $ff,#0
122 adcs $a7,$a7,$a7
123 adc $ff,$ff,#0
125 b .Lreduce_by_sub
126 .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
128 @ void ecp_nistz256_add(BN_ULONG r0[8],const BN_ULONG r1[8],
129 @ const BN_ULONG r2[8]);
130 .globl ecp_nistz256_add
131 .type ecp_nistz256_add,%function
132 .align 4
133 ecp_nistz256_add:
134 stmdb sp!,{r4-r12,lr}
135 bl __ecp_nistz256_add
136 #if __ARM_ARCH__>=5 || !defined(__thumb__)
137 ldmia sp!,{r4-r12,pc}
138 #else
139 ldmia sp!,{r4-r12,lr}
140 bx lr @ interoperable with Thumb ISA:-)
141 #endif
142 .size ecp_nistz256_add,.-ecp_nistz256_add
144 .type __ecp_nistz256_add,%function
145 .align 4
146 __ecp_nistz256_add:
147 str lr,[sp,#-4]! @ push lr
149 ldr $a0,[$a_ptr,#0]
150 ldr $a1,[$a_ptr,#4]
151 ldr $a2,[$a_ptr,#8]
152 ldr $a3,[$a_ptr,#12]
153 ldr $a4,[$a_ptr,#16]
154 ldr $t0,[$b_ptr,#0]
155 ldr $a5,[$a_ptr,#20]
156 ldr $t1,[$b_ptr,#4]
157 ldr $a6,[$a_ptr,#24]
158 ldr $t2,[$b_ptr,#8]
159 ldr $a7,[$a_ptr,#28]
160 ldr $t3,[$b_ptr,#12]
161 adds $a0,$a0,$t0
162 ldr $t0,[$b_ptr,#16]
163 adcs $a1,$a1,$t1
164 ldr $t1,[$b_ptr,#20]
165 adcs $a2,$a2,$t2
166 ldr $t2,[$b_ptr,#24]
167 adcs $a3,$a3,$t3
168 ldr $t3,[$b_ptr,#28]
169 adcs $a4,$a4,$t0
170 adcs $a5,$a5,$t1
171 adcs $a6,$a6,$t2
172 mov $ff,#0
173 adcs $a7,$a7,$t3
174 adc $ff,$ff,#0
175 ldr lr,[sp],#4 @ pop lr
177 .Lreduce_by_sub:
179 @ if a+b >= modulus, subtract modulus.
181 @ But since comparison implies subtraction, we subtract
182 @ modulus and then add it back if subtraction borrowed.
184 subs $a0,$a0,#-1
185 sbcs $a1,$a1,#-1
186 sbcs $a2,$a2,#-1
187 sbcs $a3,$a3,#0
188 sbcs $a4,$a4,#0
189 sbcs $a5,$a5,#0
190 sbcs $a6,$a6,#1
191 sbcs $a7,$a7,#-1
192 sbc $ff,$ff,#0
194 @ Note that because mod has special form, i.e. consists of
195 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
196 @ using value of borrow as a whole or extracting single bit.
197 @ Follow $ff register...
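@ Concretely, p as 32-bit little-endian words is ffffffff ffffffff
@ ffffffff 00000000 00000000 00000000 00000001 ffffffff, so with $ff
@ holding 0 or ffffffff the three low words and the top word get $ff
@ itself, word 6 gets $ff,lsr#31 (0 or 1) and words 3-5 get 0.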
199 adds $a0,$a0,$ff @ add synthesized modulus
200 adcs $a1,$a1,$ff
201 str $a0,[$r_ptr,#0]
202 adcs $a2,$a2,$ff
203 str $a1,[$r_ptr,#4]
204 adcs $a3,$a3,#0
205 str $a2,[$r_ptr,#8]
206 adcs $a4,$a4,#0
207 str $a3,[$r_ptr,#12]
208 adcs $a5,$a5,#0
209 str $a4,[$r_ptr,#16]
210 adcs $a6,$a6,$ff,lsr#31
211 str $a5,[$r_ptr,#20]
212 adcs $a7,$a7,$ff
213 str $a6,[$r_ptr,#24]
214 str $a7,[$r_ptr,#28]
216 mov pc,lr
217 .size __ecp_nistz256_add,.-__ecp_nistz256_add
219 @ void ecp_nistz256_mul_by_3(BN_ULONG r0[8],const BN_ULONG r1[8]);
220 .globl ecp_nistz256_mul_by_3
221 .type ecp_nistz256_mul_by_3,%function
222 .align 4
223 ecp_nistz256_mul_by_3:
224 stmdb sp!,{r4-r12,lr}
225 bl __ecp_nistz256_mul_by_3
226 #if __ARM_ARCH__>=5 || !defined(__thumb__)
227 ldmia sp!,{r4-r12,pc}
228 #else
229 ldmia sp!,{r4-r12,lr}
230 bx lr @ interoperable with Thumb ISA:-)
231 #endif
232 .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
234 .type __ecp_nistz256_mul_by_3,%function
235 .align 4
236 __ecp_nistz256_mul_by_3:
237 str lr,[sp,#-4]! @ push lr
239 @ As multiplication by 3 is performed as 2*n+n, below are inline
240 @ copies of __ecp_nistz256_mul_by_2 and __ecp_nistz256_add, see
241 @ corresponding subroutines for details.
243 ldr $a0,[$a_ptr,#0]
244 ldr $a1,[$a_ptr,#4]
245 ldr $a2,[$a_ptr,#8]
246 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
247 ldr $a3,[$a_ptr,#12]
248 adcs $a1,$a1,$a1
249 ldr $a4,[$a_ptr,#16]
250 adcs $a2,$a2,$a2
251 ldr $a5,[$a_ptr,#20]
252 adcs $a3,$a3,$a3
253 ldr $a6,[$a_ptr,#24]
254 adcs $a4,$a4,$a4
255 ldr $a7,[$a_ptr,#28]
256 adcs $a5,$a5,$a5
257 adcs $a6,$a6,$a6
258 mov $ff,#0
259 adcs $a7,$a7,$a7
260 adc $ff,$ff,#0
262 subs $a0,$a0,#-1 @ .Lreduce_by_sub but without stores
263 sbcs $a1,$a1,#-1
264 sbcs $a2,$a2,#-1
265 sbcs $a3,$a3,#0
266 sbcs $a4,$a4,#0
267 sbcs $a5,$a5,#0
268 sbcs $a6,$a6,#1
269 sbcs $a7,$a7,#-1
270 sbc $ff,$ff,#0
272 adds $a0,$a0,$ff @ add synthesized modulus
273 adcs $a1,$a1,$ff
274 adcs $a2,$a2,$ff
275 adcs $a3,$a3,#0
276 adcs $a4,$a4,#0
277 ldr $b_ptr,[$a_ptr,#0]
278 adcs $a5,$a5,#0
279 ldr $t1,[$a_ptr,#4]
280 adcs $a6,$a6,$ff,lsr#31
281 ldr $t2,[$a_ptr,#8]
282 adc $a7,$a7,$ff
284 ldr $t0,[$a_ptr,#12]
285 adds $a0,$a0,$b_ptr @ 2*a[0:7]+=a[0:7]
286 ldr $b_ptr,[$a_ptr,#16]
287 adcs $a1,$a1,$t1
288 ldr $t1,[$a_ptr,#20]
289 adcs $a2,$a2,$t2
290 ldr $t2,[$a_ptr,#24]
291 adcs $a3,$a3,$t0
292 ldr $t3,[$a_ptr,#28]
293 adcs $a4,$a4,$b_ptr
294 adcs $a5,$a5,$t1
295 adcs $a6,$a6,$t2
296 mov $ff,#0
297 adcs $a7,$a7,$t3
298 adc $ff,$ff,#0
299 ldr lr,[sp],#4 @ pop lr
301 b .Lreduce_by_sub
302 .size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
304 @ void ecp_nistz256_div_by_2(BN_ULONG r0[8],const BN_ULONG r1[8]);
305 .globl ecp_nistz256_div_by_2
306 .type ecp_nistz256_div_by_2,%function
307 .align 4
308 ecp_nistz256_div_by_2:
309 stmdb sp!,{r4-r12,lr}
310 bl __ecp_nistz256_div_by_2
311 #if __ARM_ARCH__>=5 || !defined(__thumb__)
312 ldmia sp!,{r4-r12,pc}
313 #else
314 ldmia sp!,{r4-r12,lr}
315 bx lr @ interoperable with Thumb ISA:-)
316 #endif
317 .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
319 .type __ecp_nistz256_div_by_2,%function
320 .align 4
321 __ecp_nistz256_div_by_2:
322 @ ret = (a is odd ? a+mod : a) >> 1
324 ldr $a0,[$a_ptr,#0]
325 ldr $a1,[$a_ptr,#4]
326 ldr $a2,[$a_ptr,#8]
327 mov $ff,$a0,lsl#31 @ place least significant bit to most
328 @ significant position, now arithmetic
329 @ right shift by 31 will produce -1 or
330 @ 0, while logical right shift 1 or 0,
331 @ this is how modulus is conditionally
332 @ synthesized in this case...
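@ Concretely: $ff,asr#31 is ffffffff when a[0] is odd and 0 when it
@ is even, so the chain below adds either p or nothing before the
@ shift; the final carry lands in $b_ptr and is folded back in as
@ bit 255 of the result.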
333 ldr $a3,[$a_ptr,#12]
334 adds $a0,$a0,$ff,asr#31
335 ldr $a4,[$a_ptr,#16]
336 adcs $a1,$a1,$ff,asr#31
337 ldr $a5,[$a_ptr,#20]
338 adcs $a2,$a2,$ff,asr#31
339 ldr $a6,[$a_ptr,#24]
340 adcs $a3,$a3,#0
341 ldr $a7,[$a_ptr,#28]
342 adcs $a4,$a4,#0
343 mov $a0,$a0,lsr#1 @ a[0:7]>>=1, we can start early
344 @ because it doesn't affect flags
345 adcs $a5,$a5,#0
346 orr $a0,$a0,$a1,lsl#31
347 adcs $a6,$a6,$ff,lsr#31
348 mov $b_ptr,#0
349 adcs $a7,$a7,$ff,asr#31
350 mov $a1,$a1,lsr#1
351 adc $b_ptr,$b_ptr,#0 @ top-most carry bit from addition
353 orr $a1,$a1,$a2,lsl#31
354 mov $a2,$a2,lsr#1
355 str $a0,[$r_ptr,#0]
356 orr $a2,$a2,$a3,lsl#31
357 mov $a3,$a3,lsr#1
358 str $a1,[$r_ptr,#4]
359 orr $a3,$a3,$a4,lsl#31
360 mov $a4,$a4,lsr#1
361 str $a2,[$r_ptr,#8]
362 orr $a4,$a4,$a5,lsl#31
363 mov $a5,$a5,lsr#1
364 str $a3,[$r_ptr,#12]
365 orr $a5,$a5,$a6,lsl#31
366 mov $a6,$a6,lsr#1
367 str $a4,[$r_ptr,#16]
368 orr $a6,$a6,$a7,lsl#31
369 mov $a7,$a7,lsr#1
370 str $a5,[$r_ptr,#20]
371 orr $a7,$a7,$b_ptr,lsl#31 @ don't forget the top-most carry bit
372 str $a6,[$r_ptr,#24]
373 str $a7,[$r_ptr,#28]
375 mov pc,lr
376 .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
378 @ void ecp_nistz256_sub(BN_ULONG r0[8],const BN_ULONG r1[8],
379 @ const BN_ULONG r2[8]);
380 .globl ecp_nistz256_sub
381 .type ecp_nistz256_sub,%function
382 .align 4
383 ecp_nistz256_sub:
384 stmdb sp!,{r4-r12,lr}
385 bl __ecp_nistz256_sub
386 #if __ARM_ARCH__>=5 || !defined(__thumb__)
387 ldmia sp!,{r4-r12,pc}
388 #else
389 ldmia sp!,{r4-r12,lr}
390 bx lr @ interoperable with Thumb ISA:-)
391 #endif
392 .size ecp_nistz256_sub,.-ecp_nistz256_sub
394 .type __ecp_nistz256_sub,%function
395 .align 4
396 __ecp_nistz256_sub:
397 str lr,[sp,#-4]! @ push lr
399 ldr $a0,[$a_ptr,#0]
400 ldr $a1,[$a_ptr,#4]
401 ldr $a2,[$a_ptr,#8]
402 ldr $a3,[$a_ptr,#12]
403 ldr $a4,[$a_ptr,#16]
404 ldr $t0,[$b_ptr,#0]
405 ldr $a5,[$a_ptr,#20]
406 ldr $t1,[$b_ptr,#4]
407 ldr $a6,[$a_ptr,#24]
408 ldr $t2,[$b_ptr,#8]
409 ldr $a7,[$a_ptr,#28]
410 ldr $t3,[$b_ptr,#12]
411 subs $a0,$a0,$t0
412 ldr $t0,[$b_ptr,#16]
413 sbcs $a1,$a1,$t1
414 ldr $t1,[$b_ptr,#20]
415 sbcs $a2,$a2,$t2
416 ldr $t2,[$b_ptr,#24]
417 sbcs $a3,$a3,$t3
418 ldr $t3,[$b_ptr,#28]
419 sbcs $a4,$a4,$t0
420 sbcs $a5,$a5,$t1
421 sbcs $a6,$a6,$t2
422 sbcs $a7,$a7,$t3
423 sbc $ff,$ff,$ff @ broadcast borrow bit
424 ldr lr,[sp],#4 @ pop lr
426 .Lreduce_by_add:
428 @ if a-b borrows, add modulus.
430 @ Note that because mod has special form, i.e. consists of
431 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
432 @ broadcasting borrow bit to a register, $ff, and using it as
433 @ a whole or extracting single bit.
435 adds $a0,$a0,$ff @ add synthesized modulus
436 adcs $a1,$a1,$ff
437 str $a0,[$r_ptr,#0]
438 adcs $a2,$a2,$ff
439 str $a1,[$r_ptr,#4]
440 adcs $a3,$a3,#0
441 str $a2,[$r_ptr,#8]
442 adcs $a4,$a4,#0
443 str $a3,[$r_ptr,#12]
444 adcs $a5,$a5,#0
445 str $a4,[$r_ptr,#16]
446 adcs $a6,$a6,$ff,lsr#31
447 str $a5,[$r_ptr,#20]
448 adcs $a7,$a7,$ff
449 str $a6,[$r_ptr,#24]
450 str $a7,[$r_ptr,#28]
452 mov pc,lr
453 .size __ecp_nistz256_sub,.-__ecp_nistz256_sub
455 @ void ecp_nistz256_neg(BN_ULONG r0[8],const BN_ULONG r1[8]);
456 .globl ecp_nistz256_neg
457 .type ecp_nistz256_neg,%function
458 .align 4
459 ecp_nistz256_neg:
460 stmdb sp!,{r4-r12,lr}
461 bl __ecp_nistz256_neg
462 #if __ARM_ARCH__>=5 || !defined(__thumb__)
463 ldmia sp!,{r4-r12,pc}
464 #else
465 ldmia sp!,{r4-r12,lr}
466 bx lr @ interoperable with Thumb ISA:-)
467 #endif
468 .size ecp_nistz256_neg,.-ecp_nistz256_neg
470 .type __ecp_nistz256_neg,%function
471 .align 4
472 __ecp_nistz256_neg:
473 ldr $a0,[$a_ptr,#0]
474 eor $ff,$ff,$ff
475 ldr $a1,[$a_ptr,#4]
476 ldr $a2,[$a_ptr,#8]
477 subs $a0,$ff,$a0
478 ldr $a3,[$a_ptr,#12]
479 sbcs $a1,$ff,$a1
480 ldr $a4,[$a_ptr,#16]
481 sbcs $a2,$ff,$a2
482 ldr $a5,[$a_ptr,#20]
483 sbcs $a3,$ff,$a3
484 ldr $a6,[$a_ptr,#24]
485 sbcs $a4,$ff,$a4
486 ldr $a7,[$a_ptr,#28]
487 sbcs $a5,$ff,$a5
488 sbcs $a6,$ff,$a6
489 sbcs $a7,$ff,$a7
490 sbc $ff,$ff,$ff
492 b .Lreduce_by_add
493 .size __ecp_nistz256_neg,.-__ecp_nistz256_neg
494 ___
495 {
496 my @acc=map("r$_",(3..11));
497 my ($t0,$t1,$bj,$t2,$t3)=map("r$_",(0,1,2,12,14));
499 $code.=<<___;
500 @ void ecp_nistz256_sqr_mont(BN_ULONG r0[8],const BN_ULONG r1[8]);
501 .globl ecp_nistz256_sqr_mont
502 .type ecp_nistz256_sqr_mont,%function
503 .align 4
504 ecp_nistz256_sqr_mont:
505 mov $b_ptr,$a_ptr
506 b .Lecp_nistz256_mul_mont
507 .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
509 @ void ecp_nistz256_mul_mont(BN_ULONG r0[8],const BN_ULONG r1[8],
510 @ const BN_ULONG r2[8]);
511 .globl ecp_nistz256_mul_mont
512 .type ecp_nistz256_mul_mont,%function
513 .align 4
514 ecp_nistz256_mul_mont:
515 .Lecp_nistz256_mul_mont:
516 stmdb sp!,{r4-r12,lr}
517 bl __ecp_nistz256_mul_mont
518 #if __ARM_ARCH__>=5 || !defined(__thumb__)
519 ldmia sp!,{r4-r12,pc}
520 #else
521 ldmia sp!,{r4-r12,lr}
522 bx lr @ interoperable with Thumb ISA:-)
523 #endif
524 .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
526 .type __ecp_nistz256_mul_mont,%function
527 .align 4
528 __ecp_nistz256_mul_mont:
529 stmdb sp!,{r0-r2,lr} @ make a copy of arguments too
531 ldr $bj,[$b_ptr,#0] @ b[0]
532 ldmia $a_ptr,{@acc[1]-@acc[8]}
534 umull @acc[0],$t3,@acc[1],$bj @ r[0]=a[0]*b[0]
535 stmdb sp!,{$acc[1]-@acc[8]} @ copy a[0-7] to stack, so
536 @ that it can be addressed
537 @ without spending register
538 @ on address
539 umull @acc[1],$t0,@acc[2],$bj @ r[1]=a[1]*b[0]
540 umull @acc[2],$t1,@acc[3],$bj
541 adds @acc[1],@acc[1],$t3 @ accumulate high part of mult
542 umull @acc[3],$t2,@acc[4],$bj
543 adcs @acc[2],@acc[2],$t0
544 umull @acc[4],$t3,@acc[5],$bj
545 adcs @acc[3],@acc[3],$t1
546 umull @acc[5],$t0,@acc[6],$bj
547 adcs @acc[4],@acc[4],$t2
548 umull @acc[6],$t1,@acc[7],$bj
549 adcs @acc[5],@acc[5],$t3
550 umull @acc[7],$t2,@acc[8],$bj
551 adcs @acc[6],@acc[6],$t0
552 adcs @acc[7],@acc[7],$t1
553 eor $t3,$t3,$t3 @ first overflow bit is zero
554 adc @acc[8],$t2,#0
555 ___
556 for(my $i=1;$i<8;$i++) {
557 my $t4=@acc[0];
559 # Reduction iteration is normally performed by accumulating
560 # result of multiplication of modulus by "magic" digit [and
561 # omitting least significant word, which is guaranteed to
562 # be 0], but thanks to special form of modulus and "magic"
563 # digit being equal to least significant word, it can be
564 # performed with additions and subtractions alone. Indeed:
566 # ffff.0001.0000.0000.0000.ffff.ffff.ffff
567 # * abcd
568 # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
570 # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
571 # rewrite above as:
573 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
574 # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
575 # - abcd.0000.0000.0000.0000.0000.0000.abcd
577 # or marking redundant operations:
579 # xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
580 # + abcd.0000.abcd.0000.0000.abcd.----.----.----
581 # - abcd.----.----.----.----.----.----.----
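# (a disabled Math::BigInt sanity check of this identity sits right after
# this loop, see below)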
583 $code.=<<___;
584 @ multiplication-less reduction $i
585 adds @acc[3],@acc[3],@acc[0] @ r[3]+=r[0]
586 ldr $bj,[sp,#40] @ restore b_ptr
587 adcs @acc[4],@acc[4],#0 @ r[4]+=0
588 adcs @acc[5],@acc[5],#0 @ r[5]+=0
589 adcs @acc[6],@acc[6],@acc[0] @ r[6]+=r[0]
590 ldr $t1,[sp,#0] @ load a[0]
591 adcs @acc[7],@acc[7],#0 @ r[7]+=0
592 ldr $bj,[$bj,#4*$i] @ load b[i]
593 adcs @acc[8],@acc[8],@acc[0] @ r[8]+=r[0]
594 eor $t0,$t0,$t0
595 adc $t3,$t3,#0 @ overflow bit
596 subs @acc[7],@acc[7],@acc[0] @ r[7]-=r[0]
597 ldr $t2,[sp,#4] @ a[1]
598 sbcs @acc[8],@acc[8],#0 @ r[8]-=0
599 umlal @acc[1],$t0,$t1,$bj @ "r[0]"+=a[0]*b[i]
600 eor $t1,$t1,$t1
601 sbc @acc[0],$t3,#0 @ overflow bit, keep in mind
602 @ that net result is
603 @ addition of a value which
604 @ makes underflow impossible
606 ldr $t3,[sp,#8] @ a[2]
607 umlal @acc[2],$t1,$t2,$bj @ "r[1]"+=a[1]*b[i]
608 str @acc[0],[sp,#36] @ temporarily offload overflow
609 eor $t2,$t2,$t2
610 ldr $t4,[sp,#12] @ a[3], $t4 is alias @acc[0]
611 umlal @acc[3],$t2,$t3,$bj @ "r[2]"+=a[2]*b[i]
612 eor $t3,$t3,$t3
613 adds @acc[2],@acc[2],$t0 @ accumulate high part of mult
614 ldr $t0,[sp,#16] @ a[4]
615 umlal @acc[4],$t3,$t4,$bj @ "r[3]"+=a[3]*b[i]
616 eor $t4,$t4,$t4
617 adcs @acc[3],@acc[3],$t1
618 ldr $t1,[sp,#20] @ a[5]
619 umlal @acc[5],$t4,$t0,$bj @ "r[4]"+=a[4]*b[i]
620 eor $t0,$t0,$t0
621 adcs @acc[4],@acc[4],$t2
622 ldr $t2,[sp,#24] @ a[6]
623 umlal @acc[6],$t0,$t1,$bj @ "r[5]"+=a[5]*b[i]
624 eor $t1,$t1,$t1
625 adcs @acc[5],@acc[5],$t3
626 ldr $t3,[sp,#28] @ a[7]
627 umlal @acc[7],$t1,$t2,$bj @ "r[6]"+=a[6]*b[i]
628 eor $t2,$t2,$t2
629 adcs @acc[6],@acc[6],$t4
630 ldr @acc[0],[sp,#36] @ restore overflow bit
631 umlal @acc[8],$t2,$t3,$bj @ "r[7]"+=a[7]*b[i]
632 eor $t3,$t3,$t3
633 adcs @acc[7],@acc[7],$t0
634 adcs @acc[8],@acc[8],$t1
635 adcs @acc[0],$acc[0],$t2
636 adc $t3,$t3,#0 @ new overflow bit
637 ___
638 push(@acc,shift(@acc)); # rotate registers, so that
639 # "r[i]" becomes r[i]
641 $code.=<<___;
642 @ last multiplication-less reduction
643 adds @acc[3],@acc[3],@acc[0]
644 ldr $r_ptr,[sp,#32] @ restore r_ptr
645 adcs @acc[4],@acc[4],#0
646 adcs @acc[5],@acc[5],#0
647 adcs @acc[6],@acc[6],@acc[0]
648 adcs @acc[7],@acc[7],#0
649 adcs @acc[8],@acc[8],@acc[0]
650 adc $t3,$t3,#0
651 subs @acc[7],@acc[7],@acc[0]
652 sbcs @acc[8],@acc[8],#0
653 sbc @acc[0],$t3,#0 @ overflow bit
655 @ Final step is "if result > mod, subtract mod", but we do it
656 @ "other way around", namely subtract modulus from result
657 @ and if it borrowed, add modulus back.
659 adds @acc[1],@acc[1],#1 @ subs @acc[1],@acc[1],#-1
660 adcs @acc[2],@acc[2],#0 @ sbcs @acc[2],@acc[2],#-1
661 adcs @acc[3],@acc[3],#0 @ sbcs @acc[3],@acc[3],#-1
662 sbcs @acc[4],@acc[4],#0
663 sbcs @acc[5],@acc[5],#0
664 sbcs @acc[6],@acc[6],#0
665 sbcs @acc[7],@acc[7],#1
666 adcs @acc[8],@acc[8],#0 @ sbcs @acc[8],@acc[8],#-1
667 ldr lr,[sp,#44] @ restore lr
668 sbc @acc[0],@acc[0],#0 @ broadcast borrow bit
669 add sp,sp,#48
671 @ Note that because mod has special form, i.e. consists of
672 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
673 @ broadcasting borrow bit to a register, @acc[0], and using it as
674 @ a whole or extracting single bit.
676 adds @acc[1],@acc[1],@acc[0] @ add modulus or zero
677 adcs @acc[2],@acc[2],@acc[0]
678 str @acc[1],[$r_ptr,#0]
679 adcs @acc[3],@acc[3],@acc[0]
680 str @acc[2],[$r_ptr,#4]
681 adcs @acc[4],@acc[4],#0
682 str @acc[3],[$r_ptr,#8]
683 adcs @acc[5],@acc[5],#0
684 str @acc[4],[$r_ptr,#12]
685 adcs @acc[6],@acc[6],#0
686 str @acc[5],[$r_ptr,#16]
687 adcs @acc[7],@acc[7],@acc[0],lsr#31
688 str @acc[6],[$r_ptr,#20]
689 adc @acc[8],@acc[8],@acc[0]
690 str @acc[7],[$r_ptr,#24]
691 str @acc[8],[$r_ptr,#28]
693 mov pc,lr
694 .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
695 ___
696 }
698 {
699 my ($out,$inp,$index,$mask)=map("r$_",(0..3));
700 $code.=<<___;
701 @ void ecp_nistz256_select_w5(P256_POINT *r0,const void *r1,
702 @ int r2);
703 .globl ecp_nistz256_select_w5
704 .type ecp_nistz256_select_w5,%function
705 .align 5
706 ecp_nistz256_select_w5:
707 stmdb sp!,{r4-r11}
709 cmp $index,#0
710 mov $mask,#0
711 #ifdef __thumb2__
712 itt ne
713 #endif
714 subne $index,$index,#1
715 movne $mask,#-1
716 add $inp,$inp,$index,lsl#2
718 ldr r4,[$inp,#64*0]
719 ldr r5,[$inp,#64*1]
720 ldr r6,[$inp,#64*2]
721 and r4,r4,$mask
722 ldr r7,[$inp,#64*3]
723 and r5,r5,$mask
724 ldr r8,[$inp,#64*4]
725 and r6,r6,$mask
726 ldr r9,[$inp,#64*5]
727 and r7,r7,$mask
728 ldr r10,[$inp,#64*6]
729 and r8,r8,$mask
730 ldr r11,[$inp,#64*7]
731 add $inp,$inp,#64*8
732 and r9,r9,$mask
733 and r10,r10,$mask
734 and r11,r11,$mask
735 stmia $out!,{r4-r11} @ X
737 ldr r4,[$inp,#64*0]
738 ldr r5,[$inp,#64*1]
739 ldr r6,[$inp,#64*2]
740 and r4,r4,$mask
741 ldr r7,[$inp,#64*3]
742 and r5,r5,$mask
743 ldr r8,[$inp,#64*4]
744 and r6,r6,$mask
745 ldr r9,[$inp,#64*5]
746 and r7,r7,$mask
747 ldr r10,[$inp,#64*6]
748 and r8,r8,$mask
749 ldr r11,[$inp,#64*7]
750 add $inp,$inp,#64*8
751 and r9,r9,$mask
752 and r10,r10,$mask
753 and r11,r11,$mask
754 stmia $out!,{r4-r11} @ Y
756 ldr r4,[$inp,#64*0]
757 ldr r5,[$inp,#64*1]
758 ldr r6,[$inp,#64*2]
759 and r4,r4,$mask
760 ldr r7,[$inp,#64*3]
761 and r5,r5,$mask
762 ldr r8,[$inp,#64*4]
763 and r6,r6,$mask
764 ldr r9,[$inp,#64*5]
765 and r7,r7,$mask
766 ldr r10,[$inp,#64*6]
767 and r8,r8,$mask
768 ldr r11,[$inp,#64*7]
769 and r9,r9,$mask
770 and r10,r10,$mask
771 and r11,r11,$mask
772 stmia $out,{r4-r11} @ Z
774 ldmia sp!,{r4-r11}
775 #if __ARM_ARCH__>=5 || defined(__thumb__)
776 bx lr
777 #else
778 mov pc,lr
779 #endif
780 .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
782 @ void ecp_nistz256_select_w7(P256_POINT_AFFINE *r0,const void *r1,
783 @ int r2);
784 .globl ecp_nistz256_select_w7
785 .type ecp_nistz256_select_w7,%function
786 .align 5
787 ecp_nistz256_select_w7:
788 stmdb sp!,{r4-r7}
790 cmp $index,#0
791 mov $mask,#0
792 #ifdef __thumb2__
793 itt ne
794 #endif
795 subne $index,$index,#1
796 movne $mask,#-1
797 add $inp,$inp,$index
798 mov $index,#64/4
800 .Loop_select_w7:
801 ldrb r4,[$inp,#64*0]
802 subs $index,$index,#1
803 ldrb r5,[$inp,#64*1]
804 ldrb r6,[$inp,#64*2]
805 ldrb r7,[$inp,#64*3]
806 add $inp,$inp,#64*4
807 orr r4,r4,r5,lsl#8
808 orr r4,r4,r6,lsl#16
809 orr r4,r4,r7,lsl#24
810 and r4,r4,$mask
811 str r4,[$out],#4
812 bne .Loop_select_w7
814 ldmia sp!,{r4-r7}
815 #if __ARM_ARCH__>=5 || defined(__thumb__)
816 bx lr
817 #else
818 mov pc,lr
819 #endif
820 .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
821 ___
822 }
823 if (0) {
824 # In comparison to integer-only equivalent of below subroutine:
826 # Cortex-A8 +10%
827 # Cortex-A9 -10%
828 # Snapdragon S4 +5%
830 # As not all time is spent in multiplication, overall impact is deemed
831 # too low to care about.
833 my ($A0,$A1,$A2,$A3,$Bi,$zero,$temp)=map("d$_",(0..7));
834 my $mask="q4";
835 my $mult="q5";
836 my @AxB=map("q$_",(8..15));
838 my ($rptr,$aptr,$bptr,$toutptr)=map("r$_",(0..3));
840 $code.=<<___;
841 #if __ARM_ARCH__>=7
842 .fpu neon
844 .globl ecp_nistz256_mul_mont_neon
845 .type ecp_nistz256_mul_mont_neon,%function
846 .align 5
847 ecp_nistz256_mul_mont_neon:
848 mov ip,sp
849 stmdb sp!,{r4-r9}
850 vstmdb sp!,{q4-q5} @ ABI specification says so
852 sub $toutptr,sp,#40
853 vld1.32 {${Bi}[0]},[$bptr,:32]!
854 veor $zero,$zero,$zero
855 vld1.32 {$A0-$A3}, [$aptr] @ can't specify :32 :-(
856 vzip.16 $Bi,$zero
857 mov sp,$toutptr @ alloca
858 vmov.i64 $mask,#0xffff
860 vmull.u32 @AxB[0],$Bi,${A0}[0]
861 vmull.u32 @AxB[1],$Bi,${A0}[1]
862 vmull.u32 @AxB[2],$Bi,${A1}[0]
863 vmull.u32 @AxB[3],$Bi,${A1}[1]
864 vshr.u64 $temp,@AxB[0]#lo,#16
865 vmull.u32 @AxB[4],$Bi,${A2}[0]
866 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
867 vmull.u32 @AxB[5],$Bi,${A2}[1]
868 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 32 bits of a[0]*b[0]
869 vmull.u32 @AxB[6],$Bi,${A3}[0]
870 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
871 vmull.u32 @AxB[7],$Bi,${A3}[1]
872 ___
873 for($i=1;$i<8;$i++) {
874 $code.=<<___;
875 vld1.32 {${Bi}[0]},[$bptr,:32]!
876 veor $zero,$zero,$zero
877 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ reduction
878 vshl.u64 $mult,@AxB[0],#32
879 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
880 vsub.u64 $mult,$mult,@AxB[0]
881 vzip.16 $Bi,$zero
882 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
883 vadd.u64 @AxB[7],@AxB[7],$mult
884 ___
885 push(@AxB,shift(@AxB));
886 $code.=<<___;
887 vmlal.u32 @AxB[0],$Bi,${A0}[0]
888 vmlal.u32 @AxB[1],$Bi,${A0}[1]
889 vmlal.u32 @AxB[2],$Bi,${A1}[0]
890 vmlal.u32 @AxB[3],$Bi,${A1}[1]
891 vshr.u64 $temp,@AxB[0]#lo,#16
892 vmlal.u32 @AxB[4],$Bi,${A2}[0]
893 vadd.u64 @AxB[0]#hi,@AxB[0]#hi,$temp
894 vmlal.u32 @AxB[5],$Bi,${A2}[1]
895 vshr.u64 $temp,@AxB[0]#hi,#16 @ upper 33 bits of a[0]*b[i]+t[0]
896 vmlal.u32 @AxB[6],$Bi,${A3}[0]
897 vand.u64 @AxB[0],@AxB[0],$mask @ lower 32 bits of a[0]*b[0]
898 vmull.u32 @AxB[7],$Bi,${A3}[1]
899 ___
900 }
901 $code.=<<___;
902 vadd.u64 @AxB[1]#lo,@AxB[1]#lo,$temp @ last reduction
903 vshl.u64 $mult,@AxB[0],#32
904 vadd.u64 @AxB[3],@AxB[3],@AxB[0]
905 vsub.u64 $mult,$mult,@AxB[0]
906 vadd.u64 @AxB[6],@AxB[6],@AxB[0]
907 vadd.u64 @AxB[7],@AxB[7],$mult
909 vshr.u64 $temp,@AxB[1]#lo,#16 @ convert
910 vadd.u64 @AxB[1]#hi,@AxB[1]#hi,$temp
911 vshr.u64 $temp,@AxB[1]#hi,#16
912 vzip.16 @AxB[1]#lo,@AxB[1]#hi
913 ___
914 foreach (2..7) {
915 $code.=<<___;
916 vadd.u64 @AxB[$_]#lo,@AxB[$_]#lo,$temp
917 vst1.32 {@AxB[$_-1]#lo[0]},[$toutptr,:32]!
918 vshr.u64 $temp,@AxB[$_]#lo,#16
919 vadd.u64 @AxB[$_]#hi,@AxB[$_]#hi,$temp
920 vshr.u64 $temp,@AxB[$_]#hi,#16
921 vzip.16 @AxB[$_]#lo,@AxB[$_]#hi
922 ___
923 }
924 $code.=<<___;
925 vst1.32 {@AxB[7]#lo[0]},[$toutptr,:32]!
926 vst1.32 {$temp},[$toutptr] @ upper 33 bits
928 ldr r1,[sp,#0]
929 ldr r2,[sp,#4]
930 ldr r3,[sp,#8]
931 subs r1,r1,#-1
932 ldr r4,[sp,#12]
933 sbcs r2,r2,#-1
934 ldr r5,[sp,#16]
935 sbcs r3,r3,#-1
936 ldr r6,[sp,#20]
937 sbcs r4,r4,#0
938 ldr r7,[sp,#24]
939 sbcs r5,r5,#0
940 ldr r8,[sp,#28]
941 sbcs r6,r6,#0
942 ldr r9,[sp,#32] @ top-most bit
943 sbcs r7,r7,#1
944 sub sp,ip,#40+16
945 sbcs r8,r8,#-1
946 sbc r9,r9,#0
947 vldmia sp!,{q4-q5}
949 adds r1,r1,r9
950 adcs r2,r2,r9
951 str r1,[$rptr,#0]
952 adcs r3,r3,r9
953 str r2,[$rptr,#4]
954 adcs r4,r4,#0
955 str r3,[$rptr,#8]
956 adcs r5,r5,#0
957 str r4,[$rptr,#12]
958 adcs r6,r6,#0
959 str r5,[$rptr,#16]
960 adcs r7,r7,r9,lsr#31
961 str r6,[$rptr,#20]
962 adcs r8,r8,r9
963 str r7,[$rptr,#24]
964 str r8,[$rptr,#28]
966 ldmia sp!,{r4-r9}
967 bx lr
968 .size ecp_nistz256_mul_mont_neon,.-ecp_nistz256_mul_mont_neon
969 #endif
970 ___
971 }
973 {{{
974 ########################################################################
975 # Below $aN assignment matches order in which 256-bit result appears in
976 # register bank at return from __ecp_nistz256_mul_mont, so that we can
977 # skip over reloading it from memory. This means that below functions
978 # use custom calling sequence accepting 256-bit input in registers,
979 # output pointer in r0, $r_ptr, and optional pointer in r2, $b_ptr.
981 # See their "normal" counterparts for insights on calculations.
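# For example, in ecp_nistz256_point_double below, p256_sqr_mont(res_x, M) is
# followed directly by __ecp_nistz256_sub_from, which picks res_x up from
# these registers instead of reloading it from memory.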
983 my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7,
984 $t0,$t1,$t2,$t3)=map("r$_",(11,3..10,12,14,1));
985 my $ff=$b_ptr;
987 $code.=<<___;
988 .type __ecp_nistz256_sub_from,%function
989 .align 5
990 __ecp_nistz256_sub_from:
991 str lr,[sp,#-4]! @ push lr
993 ldr $t0,[$b_ptr,#0]
994 ldr $t1,[$b_ptr,#4]
995 ldr $t2,[$b_ptr,#8]
996 ldr $t3,[$b_ptr,#12]
997 subs $a0,$a0,$t0
998 ldr $t0,[$b_ptr,#16]
999 sbcs $a1,$a1,$t1
1000 ldr $t1,[$b_ptr,#20]
1001 sbcs $a2,$a2,$t2
1002 ldr $t2,[$b_ptr,#24]
1003 sbcs $a3,$a3,$t3
1004 ldr $t3,[$b_ptr,#28]
1005 sbcs $a4,$a4,$t0
1006 sbcs $a5,$a5,$t1
1007 sbcs $a6,$a6,$t2
1008 sbcs $a7,$a7,$t3
1009 sbc $ff,$ff,$ff @ broadcast borrow bit
1010 ldr lr,[sp],#4 @ pop lr
1012 adds $a0,$a0,$ff @ add synthesized modulus
1013 adcs $a1,$a1,$ff
1014 str $a0,[$r_ptr,#0]
1015 adcs $a2,$a2,$ff
1016 str $a1,[$r_ptr,#4]
1017 adcs $a3,$a3,#0
1018 str $a2,[$r_ptr,#8]
1019 adcs $a4,$a4,#0
1020 str $a3,[$r_ptr,#12]
1021 adcs $a5,$a5,#0
1022 str $a4,[$r_ptr,#16]
1023 adcs $a6,$a6,$ff,lsr#31
1024 str $a5,[$r_ptr,#20]
1025 adcs $a7,$a7,$ff
1026 str $a6,[$r_ptr,#24]
1027 str $a7,[$r_ptr,#28]
1029 mov pc,lr
1030 .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
1032 .type __ecp_nistz256_sub_morf,%function
1033 .align 5
1034 __ecp_nistz256_sub_morf:
1035 str lr,[sp,#-4]! @ push lr
1037 ldr $t0,[$b_ptr,#0]
1038 ldr $t1,[$b_ptr,#4]
1039 ldr $t2,[$b_ptr,#8]
1040 ldr $t3,[$b_ptr,#12]
1041 subs $a0,$t0,$a0
1042 ldr $t0,[$b_ptr,#16]
1043 sbcs $a1,$t1,$a1
1044 ldr $t1,[$b_ptr,#20]
1045 sbcs $a2,$t2,$a2
1046 ldr $t2,[$b_ptr,#24]
1047 sbcs $a3,$t3,$a3
1048 ldr $t3,[$b_ptr,#28]
1049 sbcs $a4,$t0,$a4
1050 sbcs $a5,$t1,$a5
1051 sbcs $a6,$t2,$a6
1052 sbcs $a7,$t3,$a7
1053 sbc $ff,$ff,$ff @ broadcast borrow bit
1054 ldr lr,[sp],#4 @ pop lr
1056 adds $a0,$a0,$ff @ add synthesized modulus
1057 adcs $a1,$a1,$ff
1058 str $a0,[$r_ptr,#0]
1059 adcs $a2,$a2,$ff
1060 str $a1,[$r_ptr,#4]
1061 adcs $a3,$a3,#0
1062 str $a2,[$r_ptr,#8]
1063 adcs $a4,$a4,#0
1064 str $a3,[$r_ptr,#12]
1065 adcs $a5,$a5,#0
1066 str $a4,[$r_ptr,#16]
1067 adcs $a6,$a6,$ff,lsr#31
1068 str $a5,[$r_ptr,#20]
1069 adcs $a7,$a7,$ff
1070 str $a6,[$r_ptr,#24]
1071 str $a7,[$r_ptr,#28]
1073 mov pc,lr
1074 .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
1076 .type __ecp_nistz256_add_self,%function
1077 .align 4
1078 __ecp_nistz256_add_self:
1079 adds $a0,$a0,$a0 @ a[0:7]+=a[0:7]
1080 adcs $a1,$a1,$a1
1081 adcs $a2,$a2,$a2
1082 adcs $a3,$a3,$a3
1083 adcs $a4,$a4,$a4
1084 adcs $a5,$a5,$a5
1085 adcs $a6,$a6,$a6
1086 mov $ff,#0
1087 adcs $a7,$a7,$a7
1088 adc $ff,$ff,#0
1090 @ if a+b >= modulus, subtract modulus.
1092 @ But since comparison implies subtraction, we subtract
1093 @ modulus and then add it back if subtraction borrowed.
1095 subs $a0,$a0,#-1
1096 sbcs $a1,$a1,#-1
1097 sbcs $a2,$a2,#-1
1098 sbcs $a3,$a3,#0
1099 sbcs $a4,$a4,#0
1100 sbcs $a5,$a5,#0
1101 sbcs $a6,$a6,#1
1102 sbcs $a7,$a7,#-1
1103 sbc $ff,$ff,#0
1105 @ Note that because mod has special form, i.e. consists of
1106 @ 0xffffffff, 1 and 0s, we can conditionally synthesize it by
1107 @ using value of borrow as a whole or extracting single bit.
1108 @ Follow $ff register...
1110 adds $a0,$a0,$ff @ add synthesized modulus
1111 adcs $a1,$a1,$ff
1112 str $a0,[$r_ptr,#0]
1113 adcs $a2,$a2,$ff
1114 str $a1,[$r_ptr,#4]
1115 adcs $a3,$a3,#0
1116 str $a2,[$r_ptr,#8]
1117 adcs $a4,$a4,#0
1118 str $a3,[$r_ptr,#12]
1119 adcs $a5,$a5,#0
1120 str $a4,[$r_ptr,#16]
1121 adcs $a6,$a6,$ff,lsr#31
1122 str $a5,[$r_ptr,#20]
1123 adcs $a7,$a7,$ff
1124 str $a6,[$r_ptr,#24]
1125 str $a7,[$r_ptr,#28]
1127 mov pc,lr
1128 .size __ecp_nistz256_add_self,.-__ecp_nistz256_add_self
1129 ___
1132 ########################################################################
1133 # following subroutines are "literal" implementation of those found in
1134 # ecp_nistz256.c
1136 ########################################################################
1137 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
1139 {
1140 my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1141 # above map() describes stack layout with 5 temporary
1142 # 256-bit vectors on top. Then note that we push
1143 # starting from r0, which means that we have copy of
1144 # input arguments just below these temporary vectors.
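# Concretely: the saved r0 (output pointer) ends up at [sp,#32*5] and the
# saved r1 (input pointer) at [sp,#32*5+4], which is where the code below
# reloads them from.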
1146 $code.=<<___;
1147 .globl ecp_nistz256_point_double
1148 .type ecp_nistz256_point_double,%function
1149 .align 5
1150 ecp_nistz256_point_double:
1151 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1152 sub sp,sp,#32*5
1154 .Lpoint_double_shortcut:
1155 add r3,sp,#$in_x
1156 ldmia $a_ptr!,{r4-r11} @ copy in_x
1157 stmia r3,{r4-r11}
1159 add $r_ptr,sp,#$S
1160 bl __ecp_nistz256_mul_by_2 @ p256_mul_by_2(S, in_y);
1162 add $b_ptr,$a_ptr,#32
1163 add $a_ptr,$a_ptr,#32
1164 add $r_ptr,sp,#$Zsqr
1165 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Zsqr, in_z);
1167 add $a_ptr,sp,#$S
1168 add $b_ptr,sp,#$S
1169 add $r_ptr,sp,#$S
1170 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(S, S);
1172 ldr $b_ptr,[sp,#32*5+4]
1173 add $a_ptr,$b_ptr,#32
1174 add $b_ptr,$b_ptr,#64
1175 add $r_ptr,sp,#$tmp0
1176 bl __ecp_nistz256_mul_mont @ p256_mul_mont(tmp0, in_z, in_y);
1178 ldr $r_ptr,[sp,#32*5]
1179 add $r_ptr,$r_ptr,#64
1180 bl __ecp_nistz256_add_self @ p256_mul_by_2(res_z, tmp0);
1182 add $a_ptr,sp,#$in_x
1183 add $b_ptr,sp,#$Zsqr
1184 add $r_ptr,sp,#$M
1185 bl __ecp_nistz256_add @ p256_add(M, in_x, Zsqr);
1187 add $a_ptr,sp,#$in_x
1188 add $b_ptr,sp,#$Zsqr
1189 add $r_ptr,sp,#$Zsqr
1190 bl __ecp_nistz256_sub @ p256_sub(Zsqr, in_x, Zsqr);
1192 add $a_ptr,sp,#$S
1193 add $b_ptr,sp,#$S
1194 add $r_ptr,sp,#$tmp0
1195 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(tmp0, S);
1197 add $a_ptr,sp,#$Zsqr
1198 add $b_ptr,sp,#$M
1199 add $r_ptr,sp,#$M
1200 bl __ecp_nistz256_mul_mont @ p256_mul_mont(M, M, Zsqr);
1202 ldr $r_ptr,[sp,#32*5]
1203 add $a_ptr,sp,#$tmp0
1204 add $r_ptr,$r_ptr,#32
1205 bl __ecp_nistz256_div_by_2 @ p256_div_by_2(res_y, tmp0);
1207 add $a_ptr,sp,#$M
1208 add $r_ptr,sp,#$M
1209 bl __ecp_nistz256_mul_by_3 @ p256_mul_by_3(M, M);
1211 add $a_ptr,sp,#$in_x
1212 add $b_ptr,sp,#$S
1213 add $r_ptr,sp,#$S
1214 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, in_x);
1216 add $r_ptr,sp,#$tmp0
1217 bl __ecp_nistz256_add_self @ p256_mul_by_2(tmp0, S);
1219 ldr $r_ptr,[sp,#32*5]
1220 add $a_ptr,sp,#$M
1221 add $b_ptr,sp,#$M
1222 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(res_x, M);
1224 add $b_ptr,sp,#$tmp0
1225 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, tmp0);
1227 add $b_ptr,sp,#$S
1228 add $r_ptr,sp,#$S
1229 bl __ecp_nistz256_sub_morf @ p256_sub(S, S, res_x);
1231 add $a_ptr,sp,#$M
1232 add $b_ptr,sp,#$S
1233 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S, S, M);
1235 ldr $r_ptr,[sp,#32*5]
1236 add $b_ptr,$r_ptr,#32
1237 add $r_ptr,$r_ptr,#32
1238 bl __ecp_nistz256_sub_from @ p256_sub(res_y, S, res_y);
1240 add sp,sp,#32*5+16 @ +16 means "skip even over saved r0-r3"
1241 #if __ARM_ARCH__>=5 || !defined(__thumb__)
1242 ldmia sp!,{r4-r12,pc}
1243 #else
1244 ldmia sp!,{r4-r12,lr}
1245 bx lr @ interoperable with Thumb ISA:-)
1246 #endif
1247 .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
1248 ___
1249 }
1251 ########################################################################
1252 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
1253 # const P256_POINT *in2);
1254 {
1255 my ($res_x,$res_y,$res_z,
1256 $in1_x,$in1_y,$in1_z,
1257 $in2_x,$in2_y,$in2_z,
1258 $H,$Hsqr,$R,$Rsqr,$Hcub,
1259 $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
1260 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1261 # above map() describes stack layout with 18 temporary
1262 # 256-bit vectors on top. Then note that we push
1263 # starting from r0, which means that we have copy of
1264 # input arguments just below these temporary vectors.
1265 # We use three of them for !in1infty, !in2infty and
1266 # result of check for zero.
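# They live in the extra 16 bytes just below the saved registers: !in1infty
# at [sp,#32*18+4], !in2infty at [sp,#32*18+8] and the is-zero result at
# [sp,#32*18+12]; the saved r0 and r1 then sit at [sp,#32*18+16] and
# [sp,#32*18+20].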
1268 $code.=<<___;
1269 .globl ecp_nistz256_point_add
1270 .type ecp_nistz256_point_add,%function
1271 .align 5
1272 ecp_nistz256_point_add:
1273 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1274 sub sp,sp,#32*18+16
1276 ldmia $b_ptr!,{r4-r11} @ copy in2_x
1277 add r3,sp,#$in2_x
1278 stmia r3!,{r4-r11}
1279 ldmia $b_ptr!,{r4-r11} @ copy in2_y
1280 stmia r3!,{r4-r11}
1281 ldmia $b_ptr,{r4-r11} @ copy in2_z
1282 orr r12,r4,r5
1283 orr r12,r12,r6
1284 orr r12,r12,r7
1285 orr r12,r12,r8
1286 orr r12,r12,r9
1287 orr r12,r12,r10
1288 orr r12,r12,r11
1289 cmp r12,#0
1290 #ifdef __thumb2__
1291 it ne
1292 #endif
1293 movne r12,#-1
1294 stmia r3,{r4-r11}
1295 str r12,[sp,#32*18+8] @ !in2infty
1297 ldmia $a_ptr!,{r4-r11} @ copy in1_x
1298 add r3,sp,#$in1_x
1299 stmia r3!,{r4-r11}
1300 ldmia $a_ptr!,{r4-r11} @ copy in1_y
1301 stmia r3!,{r4-r11}
1302 ldmia $a_ptr,{r4-r11} @ copy in1_z
1303 orr r12,r4,r5
1304 orr r12,r12,r6
1305 orr r12,r12,r7
1306 orr r12,r12,r8
1307 orr r12,r12,r9
1308 orr r12,r12,r10
1309 orr r12,r12,r11
1310 cmp r12,#0
1311 #ifdef __thumb2__
1312 it ne
1313 #endif
1314 movne r12,#-1
1315 stmia r3,{r4-r11}
1316 str r12,[sp,#32*18+4] @ !in1infty
1318 add $a_ptr,sp,#$in2_z
1319 add $b_ptr,sp,#$in2_z
1320 add $r_ptr,sp,#$Z2sqr
1321 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z2sqr, in2_z);
1323 add $a_ptr,sp,#$in1_z
1324 add $b_ptr,sp,#$in1_z
1325 add $r_ptr,sp,#$Z1sqr
1326 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1328 add $a_ptr,sp,#$in2_z
1329 add $b_ptr,sp,#$Z2sqr
1330 add $r_ptr,sp,#$S1
1331 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, Z2sqr, in2_z);
1333 add $a_ptr,sp,#$in1_z
1334 add $b_ptr,sp,#$Z1sqr
1335 add $r_ptr,sp,#$S2
1336 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1338 add $a_ptr,sp,#$in1_y
1339 add $b_ptr,sp,#$S1
1340 add $r_ptr,sp,#$S1
1341 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S1, S1, in1_y);
1343 add $a_ptr,sp,#$in2_y
1344 add $b_ptr,sp,#$S2
1345 add $r_ptr,sp,#$S2
1346 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1348 add $b_ptr,sp,#$S1
1349 add $r_ptr,sp,#$R
1350 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, S1);
1352 orr $a0,$a0,$a1 @ see if result is zero
1353 orr $a2,$a2,$a3
1354 orr $a4,$a4,$a5
1355 orr $a0,$a0,$a2
1356 orr $a4,$a4,$a6
1357 orr $a0,$a0,$a7
1358 add $a_ptr,sp,#$in1_x
1359 orr $a0,$a0,$a4
1360 add $b_ptr,sp,#$Z2sqr
1361 str $a0,[sp,#32*18+12]
1363 add $r_ptr,sp,#$U1
1364 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U1, in1_x, Z2sqr);
1366 add $a_ptr,sp,#$in2_x
1367 add $b_ptr,sp,#$Z1sqr
1368 add $r_ptr,sp,#$U2
1369 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in2_x, Z1sqr);
1371 add $b_ptr,sp,#$U1
1372 add $r_ptr,sp,#$H
1373 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, U1);
1375 orr $a0,$a0,$a1 @ see if result is zero
1376 orr $a2,$a2,$a3
1377 orr $a4,$a4,$a5
1378 orr $a0,$a0,$a2
1379 orr $a4,$a4,$a6
1380 orr $a0,$a0,$a7
1381 orrs $a0,$a0,$a4
1383 bne .Ladd_proceed @ is_equal(U1,U2)?
1385 ldr $t0,[sp,#32*18+4]
1386 ldr $t1,[sp,#32*18+8]
1387 ldr $t2,[sp,#32*18+12]
1388 tst $t0,$t1
1389 beq .Ladd_proceed @ (in1infty || in2infty)?
1390 tst $t2,$t2
1391 beq .Ladd_double @ is_equal(S1,S2)?
1393 ldr $r_ptr,[sp,#32*18+16]
1394 eor r4,r4,r4
1395 eor r5,r5,r5
1396 eor r6,r6,r6
1397 eor r7,r7,r7
1398 eor r8,r8,r8
1399 eor r9,r9,r9
1400 eor r10,r10,r10
1401 eor r11,r11,r11
1402 stmia $r_ptr!,{r4-r11}
1403 stmia $r_ptr!,{r4-r11}
1404 stmia $r_ptr!,{r4-r11}
1405 b .Ladd_done
1407 .align 4
1408 .Ladd_double:
1409 ldr $a_ptr,[sp,#32*18+20]
1410 add sp,sp,#32*(18-5)+16 @ difference in frame sizes
1411 b .Lpoint_double_shortcut
1413 .align 4
1414 .Ladd_proceed:
1415 add $a_ptr,sp,#$R
1416 add $b_ptr,sp,#$R
1417 add $r_ptr,sp,#$Rsqr
1418 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1420 add $a_ptr,sp,#$H
1421 add $b_ptr,sp,#$in1_z
1422 add $r_ptr,sp,#$res_z
1423 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1425 add $a_ptr,sp,#$H
1426 add $b_ptr,sp,#$H
1427 add $r_ptr,sp,#$Hsqr
1428 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1430 add $a_ptr,sp,#$in2_z
1431 add $b_ptr,sp,#$res_z
1432 add $r_ptr,sp,#$res_z
1433 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, res_z, in2_z);
1435 add $a_ptr,sp,#$H
1436 add $b_ptr,sp,#$Hsqr
1437 add $r_ptr,sp,#$Hcub
1438 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1440 add $a_ptr,sp,#$Hsqr
1441 add $b_ptr,sp,#$U1
1442 add $r_ptr,sp,#$U2
1443 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, U1, Hsqr);
1445 add $r_ptr,sp,#$Hsqr
1446 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1448 add $b_ptr,sp,#$Rsqr
1449 add $r_ptr,sp,#$res_x
1450 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1452 add $b_ptr,sp,#$Hcub
1453 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1455 add $b_ptr,sp,#$U2
1456 add $r_ptr,sp,#$res_y
1457 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1459 add $a_ptr,sp,#$Hcub
1460 add $b_ptr,sp,#$S1
1461 add $r_ptr,sp,#$S2
1462 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S1, Hcub);
1464 add $a_ptr,sp,#$R
1465 add $b_ptr,sp,#$res_y
1466 add $r_ptr,sp,#$res_y
1467 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1469 add $b_ptr,sp,#$S2
1470 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1472 ldr r11,[sp,#32*18+4] @ !in1infty
1473 ldr r12,[sp,#32*18+8] @ !in2infty
1474 add r1,sp,#$res_x
1475 add r2,sp,#$in2_x
1476 and r10,r11,r12
1477 mvn r11,r11
1478 add r3,sp,#$in1_x
1479 and r11,r11,r12
1480 mvn r12,r12
1481 ldr $r_ptr,[sp,#32*18+16]
1482 ___
1483 for($i=0;$i<96;$i+=8) { # conditional moves
1484 $code.=<<___;
1485 ldmia r1!,{r4-r5} @ res_x
1486 ldmia r2!,{r6-r7} @ in2_x
1487 ldmia r3!,{r8-r9} @ in1_x
1488 and r4,r4,r10
1489 and r5,r5,r10
1490 and r6,r6,r11
1491 and r7,r7,r11
1492 and r8,r8,r12
1493 and r9,r9,r12
1494 orr r4,r4,r6
1495 orr r5,r5,r7
1496 orr r4,r4,r8
1497 orr r5,r5,r9
1498 stmia $r_ptr!,{r4-r5}
1499 ___
1500 }
1501 $code.=<<___;
1502 .Ladd_done:
1503 add sp,sp,#32*18+16+16 @ +16 means "skip even over saved r0-r3"
1504 #if __ARM_ARCH__>=5 || !defined(__thumb__)
1505 ldmia sp!,{r4-r12,pc}
1506 #else
1507 ldmia sp!,{r4-r12,lr}
1508 bx lr @ interoperable with Thumb ISA:-)
1509 #endif
1510 .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
1511 ___
1512 }
1514 ########################################################################
1515 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1516 # const P256_POINT_AFFINE *in2);
1517 {
1518 my ($res_x,$res_y,$res_z,
1519 $in1_x,$in1_y,$in1_z,
1520 $in2_x,$in2_y,
1521 $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
1522 my $Z1sqr = $S2;
1523 # above map() describes stack layout with 15 temporary
1524 # 256-bit vectors on top. Then note that we push
1525 # starting from r0, which means that we have copy of
1526 # input arguments just below these temporary vectors.
1527 # We use two of them for !in1infty, !in2infty.
1529 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
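# @ONE_mont is the value 1 in Montgomery form, i.e. 2^256 mod p, written as
# 32-bit words with -1 and -2 standing for 0xffffffff and 0xfffffffe; its
# words are what the conditional-move loop below substitutes for in2's Z
# coordinate, since the affine input point has an implicit Z of 1.
# A disabled sketch checking that claim (never run at build time, core
# Math::BigInt only):
if (0) {
	require Math::BigInt;
	my $p = Math::BigInt->from_hex(
	    "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $one = Math::BigInt->bzero();
	for my $w (reverse @ONE_mont) {
		$one = ($one << 32) + ($w & 0xffffffff);
	}
	die "ONE_mont mismatch" unless $one == Math::BigInt->bone()->blsft(256)->bmod($p);
}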
1531 $code.=<<___;
1532 .globl ecp_nistz256_point_add_affine
1533 .type ecp_nistz256_point_add_affine,%function
1534 .align 5
1535 ecp_nistz256_point_add_affine:
1536 stmdb sp!,{r0-r12,lr} @ push from r0, unusual, but intentional
1537 sub sp,sp,#32*15
1539 ldmia $a_ptr!,{r4-r11} @ copy in1_x
1540 add r3,sp,#$in1_x
1541 stmia r3!,{r4-r11}
1542 ldmia $a_ptr!,{r4-r11} @ copy in1_y
1543 stmia r3!,{r4-r11}
1544 ldmia $a_ptr,{r4-r11} @ copy in1_z
1545 orr r12,r4,r5
1546 orr r12,r12,r6
1547 orr r12,r12,r7
1548 orr r12,r12,r8
1549 orr r12,r12,r9
1550 orr r12,r12,r10
1551 orr r12,r12,r11
1552 cmp r12,#0
1553 #ifdef __thumb2__
1554 it ne
1555 #endif
1556 movne r12,#-1
1557 stmia r3,{r4-r11}
1558 str r12,[sp,#32*15+4] @ !in1infty
1560 ldmia $b_ptr!,{r4-r11} @ copy in2_x
1561 add r3,sp,#$in2_x
1562 orr r12,r4,r5
1563 orr r12,r12,r6
1564 orr r12,r12,r7
1565 orr r12,r12,r8
1566 orr r12,r12,r9
1567 orr r12,r12,r10
1568 orr r12,r12,r11
1569 stmia r3!,{r4-r11}
1570 ldmia $b_ptr!,{r4-r11} @ copy in2_y
1571 orr r12,r12,r4
1572 orr r12,r12,r5
1573 orr r12,r12,r6
1574 orr r12,r12,r7
1575 orr r12,r12,r8
1576 orr r12,r12,r9
1577 orr r12,r12,r10
1578 orr r12,r12,r11
1579 stmia r3!,{r4-r11}
1580 cmp r12,#0
1581 #ifdef __thumb2__
1582 it ne
1583 #endif
1584 movne r12,#-1
1585 str r12,[sp,#32*15+8] @ !in2infty
1587 add $a_ptr,sp,#$in1_z
1588 add $b_ptr,sp,#$in1_z
1589 add $r_ptr,sp,#$Z1sqr
1590 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Z1sqr, in1_z);
1592 add $a_ptr,sp,#$Z1sqr
1593 add $b_ptr,sp,#$in2_x
1594 add $r_ptr,sp,#$U2
1595 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, Z1sqr, in2_x);
1597 add $b_ptr,sp,#$in1_x
1598 add $r_ptr,sp,#$H
1599 bl __ecp_nistz256_sub_from @ p256_sub(H, U2, in1_x);
1601 add $a_ptr,sp,#$Z1sqr
1602 add $b_ptr,sp,#$in1_z
1603 add $r_ptr,sp,#$S2
1604 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, Z1sqr, in1_z);
1606 add $a_ptr,sp,#$H
1607 add $b_ptr,sp,#$in1_z
1608 add $r_ptr,sp,#$res_z
1609 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_z, H, in1_z);
1611 add $a_ptr,sp,#$in2_y
1612 add $b_ptr,sp,#$S2
1613 add $r_ptr,sp,#$S2
1614 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, S2, in2_y);
1616 add $b_ptr,sp,#$in1_y
1617 add $r_ptr,sp,#$R
1618 bl __ecp_nistz256_sub_from @ p256_sub(R, S2, in1_y);
1620 add $a_ptr,sp,#$H
1621 add $b_ptr,sp,#$H
1622 add $r_ptr,sp,#$Hsqr
1623 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Hsqr, H);
1625 add $a_ptr,sp,#$R
1626 add $b_ptr,sp,#$R
1627 add $r_ptr,sp,#$Rsqr
1628 bl __ecp_nistz256_mul_mont @ p256_sqr_mont(Rsqr, R);
1630 add $a_ptr,sp,#$H
1631 add $b_ptr,sp,#$Hsqr
1632 add $r_ptr,sp,#$Hcub
1633 bl __ecp_nistz256_mul_mont @ p256_mul_mont(Hcub, Hsqr, H);
1635 add $a_ptr,sp,#$Hsqr
1636 add $b_ptr,sp,#$in1_x
1637 add $r_ptr,sp,#$U2
1638 bl __ecp_nistz256_mul_mont @ p256_mul_mont(U2, in1_x, Hsqr);
1640 add $r_ptr,sp,#$Hsqr
1641 bl __ecp_nistz256_add_self @ p256_mul_by_2(Hsqr, U2);
1643 add $b_ptr,sp,#$Rsqr
1644 add $r_ptr,sp,#$res_x
1645 bl __ecp_nistz256_sub_morf @ p256_sub(res_x, Rsqr, Hsqr);
1647 add $b_ptr,sp,#$Hcub
1648 bl __ecp_nistz256_sub_from @ p256_sub(res_x, res_x, Hcub);
1650 add $b_ptr,sp,#$U2
1651 add $r_ptr,sp,#$res_y
1652 bl __ecp_nistz256_sub_morf @ p256_sub(res_y, U2, res_x);
1654 add $a_ptr,sp,#$Hcub
1655 add $b_ptr,sp,#$in1_y
1656 add $r_ptr,sp,#$S2
1657 bl __ecp_nistz256_mul_mont @ p256_mul_mont(S2, in1_y, Hcub);
1659 add $a_ptr,sp,#$R
1660 add $b_ptr,sp,#$res_y
1661 add $r_ptr,sp,#$res_y
1662 bl __ecp_nistz256_mul_mont @ p256_mul_mont(res_y, res_y, R);
1664 add $b_ptr,sp,#$S2
1665 bl __ecp_nistz256_sub_from @ p256_sub(res_y, res_y, S2);
1667 ldr r11,[sp,#32*15+4] @ !in1infty
1668 ldr r12,[sp,#32*15+8] @ !in2infty
1669 add r1,sp,#$res_x
1670 add r2,sp,#$in2_x
1671 and r10,r11,r12
1672 mvn r11,r11
1673 add r3,sp,#$in1_x
1674 and r11,r11,r12
1675 mvn r12,r12
1676 ldr $r_ptr,[sp,#32*15]
1677 ___
1678 for($i=0;$i<64;$i+=8) { # conditional moves
1679 $code.=<<___;
1680 ldmia r1!,{r4-r5} @ res_x
1681 ldmia r2!,{r6-r7} @ in2_x
1682 ldmia r3!,{r8-r9} @ in1_x
1683 and r4,r4,r10
1684 and r5,r5,r10
1685 and r6,r6,r11
1686 and r7,r7,r11
1687 and r8,r8,r12
1688 and r9,r9,r12
1689 orr r4,r4,r6
1690 orr r5,r5,r7
1691 orr r4,r4,r8
1692 orr r5,r5,r9
1693 stmia $r_ptr!,{r4-r5}
1694 ___
1695 }
1696 for(;$i<96;$i+=8) {
1697 my $j=($i-64)/4;
1698 $code.=<<___;
1699 ldmia r1!,{r4-r5} @ res_z
1700 ldmia r3!,{r8-r9} @ in1_z
1701 and r4,r4,r10
1702 and r5,r5,r10
1703 and r6,r11,#@ONE_mont[$j]
1704 and r7,r11,#@ONE_mont[$j+1]
1705 and r8,r8,r12
1706 and r9,r9,r12
1707 orr r4,r4,r6
1708 orr r5,r5,r7
1709 orr r4,r4,r8
1710 orr r5,r5,r9
1711 stmia $r_ptr!,{r4-r5}
1712 ___
1713 }
1714 $code.=<<___;
1715 add sp,sp,#32*15+16 @ +16 means "skip even over saved r0-r3"
1716 #if __ARM_ARCH__>=5 || !defined(__thumb__)
1717 ldmia sp!,{r4-r12,pc}
1718 #else
1719 ldmia sp!,{r4-r12,lr}
1720 bx lr @ interoperable with Thumb ISA:-)
1721 #endif
1722 .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1723 ___
1724 } }}}
1726 foreach (split("\n",$code)) {
1727 s/\`([^\`]*)\`/eval $1/geo;
1729 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
1731 print $_,"\n";
1732 }
1733 close STDOUT; # enforce flush