3 .extern OPENSSL_armcap_P
8 // forward "declarations" are required for Apple
//----------------------------------------------------------------------
// int poly1305_init(void *ctx /* x0 */, const u8 key[32] /* x1 */)
// Zeroes the hash accumulator (and the is_base2_26 flag stored with it),
// loads the 16-byte "r" half of the key, clamps it per Poly1305, and
// stores it in the context. Probes OPENSSL_armcap_P to choose between
// the scalar and NEON code paths (function pointers in x12/x7/x13).
// NOTE(review): sparse excerpt — lines between the numbered ones below
// (byte-order #ifdefs, the x9 re-mask, pointer-table stores, ret) are
// not visible here.
//----------------------------------------------------------------------
3 .type poly1305_init,%function
17 stp xzr,xzr,[x0] // zero hash value
18 stp xzr,xzr,[x0,#16] // [along with is_base2_26]
24 adrp x17,OPENSSL_armcap_P // w17 = CPU capability flags
25 ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
28 ldp x7,x8,[x1] // load key
29 mov x9,#0xfffffffc0fffffff // build Poly1305 clamp mask...
30 movk x9,#0x0fff,lsl#48 // ...x9 = 0x0ffffffc0fffffff
32 rev x7,x7 // flip bytes (big-endian path; presumably #ifdef-guarded — not visible here)
35 and x7,x7,x9 // &=0ffffffc0fffffff
37 and x8,x8,x9 // &=0ffffffc0ffffffc (x9 is re-masked with ~3 in a line not shown, per this comment)
39 stp x7,x8,[x0,#32] // save key value
40 str w9,[x0,#48] // impossible key power value (sentinel: marks r^n table as not yet computed)
45 adr x12,.Lpoly1305_blocks // scalar blocks entry
46 adr x7,.Lpoly1305_blocks_neon // NEON blocks entry
47 adr x13,.Lpoly1305_emit // emit entry
60 .size poly1305_init,.-poly1305_init
//----------------------------------------------------------------------
// void poly1305_blocks(void *ctx /* x0 */, const u8 *inp /* x1 */,
//                      size_t len /* x2 */, u32 padbit /* x3 */)
// Scalar block loop: h = (h + block) * r mod 2^130-5, with h kept as
// three base 2^64 limbs in x4,x5,x6 and r in x7,x8. If the NEON path
// left the hash in base 2^26 (is_base2_26 flag in x17), the five 26-bit
// limbs are first recombined into base 2^64 and csel picks the radix.
// NOTE(review): sparse excerpt — only the `mul` half of each
// mul/umulh pair and part of the carry chain is visible; the loop
// label, length test, and remaining adds/adcs are in hidden lines.
//----------------------------------------------------------------------
62 .type poly1305_blocks,%function
69 ldp x4,x5,[x0] // load hash value
70 ldp x6,x17,[x0,#16] // [along with is_base2_26]
71 ldp x7,x8,[x0,#32] // load key value
87 add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
89 adds x12,x12,x14,lsl#52
90 add x13,x13,x15,lsl#14
93 adds x13,x13,x16,lsl#40
96 cmp x17,#0 // is_base2_26?
97 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
98 csel x4,x4,x12,eq // choose between radixes
103 ldp x10,x11,[x1],#16 // load input
109 adds x4,x4,x10 // accumulate input
112 mul x12,x4,x7 // h0*r0
116 mul x10,x5,x9 // h1*5*r1
120 mul x10,x4,x8 // h0*r1
125 mul x10,x5,x7 // h1*r0
130 mul x10,x6,x9 // h2*5*r1
132 mul x11,x6,x7 // h2*r0
137 and x10,x14,#-4 // final reduction
139 add x10,x10,x14,lsr#2 // fold bits >=130 back in: *5 = *4 + *1 (mod 2^130-5)
146 stp x4,x5,[x0] // store hash value
147 stp x6,xzr,[x0,#16] // [and clear is_base2_26]
151 .size poly1305_blocks,.-poly1305_blocks
//----------------------------------------------------------------------
// void poly1305_emit(void *ctx /* x0 */, u8 mac[16] /* x1 */,
//                    const u32 nonce[4] /* x2 */)
// Finalization: fully reduce the hash mod 2^130-5 (add 5, keep the
// reduced value only if it carried past 2^130), add the 128-bit nonce,
// and store the 16-byte tag little-endian. Like poly1305_blocks, it
// first converts a base 2^26 hash back to base 2^64 if needed.
// NOTE(review): sparse excerpt — the csel pair for the modular-compare
// result and the remaining carry lines are not visible here.
//----------------------------------------------------------------------
153 .type poly1305_emit,%function
157 ldp x4,x5,[x0] // load hash base 2^64
158 ldp x6,x7,[x0,#16] // [along with is_base2_26]
159 ldp x10,x11,[x2] // load nonce
175 add x12,x12,x13,lsl#26 // base 2^26 -> base 2^64
177 adds x12,x12,x14,lsl#52
178 add x13,x13,x15,lsl#14
181 adds x13,x13,x16,lsl#40
184 cmp x7,#0 // is_base2_26?
185 csel x4,x4,x12,eq // choose between radixes
189 adds x12,x4,#5 // compare to modulus
193 tst x14,#-4 // see if it's carried/borrowed
199 ror x10,x10,#32 // flip nonce words
202 adds x4,x4,x10 // accumulate nonce
205 rev x4,x4 // flip output bytes (big-endian path; presumably #ifdef-guarded — not visible here)
208 stp x4,x5,[x1] // write result
211 .size poly1305_emit,.-poly1305_emit
//----------------------------------------------------------------------
// poly1305_mult — internal helper: h = h * r mod 2^130-5.
// Per the operand comments: h0..h2 in x4..x6 (base 2^64), r0,r1 in
// x7,x8, and x9 = s1 = 5*r1>>2 precomputed by the caller. Result goes
// back to x4..x6. Scratch appears to be x10-x16 — TODO confirm; only
// the `mul` half of each mul/umulh product pair is visible in this
// excerpt, the high halves and the adds/adcs carry chain are hidden.
//----------------------------------------------------------------------
212 .type poly1305_mult,%function
215 mul x12,x4,x7 // h0*r0
218 mul x10,x5,x9 // h1*5*r1
222 mul x10,x4,x8 // h0*r1
227 mul x10,x5,x7 // h1*r0
232 mul x10,x6,x9 // h2*5*r1
234 mul x11,x6,x7 // h2*r0
239 and x10,x14,#-4 // final reduction
241 add x10,x10,x14,lsr#2 // fold bits >=130 back in: *5 = *4 + *1 (mod 2^130-5)
247 .size poly1305_mult,.-poly1305_mult
//----------------------------------------------------------------------
// poly1305_splat — internal helper: split one base 2^64 key power
// (starting from x4; the other source limbs x13-x16 are produced in
// lines not visible here) into five 26-bit limbs r0..r4 and store
// them, together with the premultiplied s-values (r*5), into the
// table at x0. One 32-bit lane is written every 16 bytes — presumably
// so four successive powers interleave into 4-lane vectors for the
// NEON loads; confirm against the ld1 {v0.4s..} loads below.
//----------------------------------------------------------------------
249 .type poly1305_splat,%function
252 and x12,x4,#0x03ffffff // base 2^64 -> base 2^26
255 and x14,x14,#0x03ffffff
259 str w12,[x0,#16*0] // r0
260 add w12,w13,w13,lsl#2 // r1*5
261 str w13,[x0,#16*1] // r1
262 add w13,w14,w14,lsl#2 // r2*5
263 str w12,[x0,#16*2] // s1
264 str w14,[x0,#16*3] // r2
265 add w14,w15,w15,lsl#2 // r3*5
266 str w13,[x0,#16*4] // s2
267 str w15,[x0,#16*5] // r3
268 add w15,w16,w16,lsl#2 // r4*5
269 str w14,[x0,#16*6] // s3
270 str w16,[x0,#16*7] // r4
271 str w15,[x0,#16*8] // s4
274 .size poly1305_splat,.-poly1305_splat
//----------------------------------------------------------------------
// void poly1305_blocks_neon(void *ctx /* x0 */, const u8 *inp /* x1 */,
//                           size_t len /* x2 */, u32 padbit /* x3 */)
// SIMD block path: processes input two blocks per vector lane pair in
// base 2^26 (five 26-bit limbs), using a precomputed r^1..r^4 power
// table (initialized on first use via poly1305_mult/poly1305_splat;
// the w17==-1 sentinel from poly1305_init marks it uninitialized).
// Short inputs branch to the scalar .Lpoly1305_blocks at entry (b.lo).
// Return-address signing via raw .inst paciasp/autiasp (keeps older
// assemblers happy); d8/d9 saved per AAPCS64 (low 64 bits of v8-v15
// are callee-saved). On exit the hash is written in base 2^26 and
// is_base2_26 is set.
// NOTE(review): this is a sparse excerpt — loop labels, length
// management, many vector loads and the umulh-side scalar lines are in
// hidden lines between the numbered ones; instruction order shown here
// interleaves scalar base-conversion with vector multiplies on purpose
// (latency hiding), so it must not be reordered.
//----------------------------------------------------------------------
277 .globl poly1305_blocks_neon
279 .type poly1305_blocks_neon,%function
281 poly1305_blocks_neon:
282 .Lpoly1305_blocks_neon:
285 b.lo .Lpoly1305_blocks
287 .inst 0xd503233f // paciasp
288 stp x29,x30,[sp,#-80]!
291 stp d8,d9,[sp,#16] // meet ABI requirements
296 cbz x17,.Lbase2_64_neon
// x17 here is presumably is_base2_26 (loaded in a hidden line): zero
// means the hash is still base 2^64 and must be converted first.
298 ldp w10,w11,[x0] // load hash value base 2^26
305 ldp x7,x8,[x0,#32] // load key value
307 add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64
309 adds x4,x4,x12,lsl#52
313 adds x5,x5,x14,lsl#40
314 adc x14,x6,xzr // can be partially reduced...
316 ldp x12,x13,[x1],#16 // load input
318 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
324 adds x4,x4,x12 // accumulate input
330 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
333 and x12,x12,#0x03ffffff
341 ldp x7,x8,[x0,#32] // load key value
343 ldp x4,x5,[x0] // load hash value base 2^64
349 ldp x12,x13,[x1],#16 // load input
351 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
356 adds x4,x4,x12 // accumulate input
363 ldr w17,[x0,#48] // first table element
364 and x10,x4,#0x03ffffff // base 2^64 -> base 2^26
367 and x12,x12,#0x03ffffff
371 cmp w17,#-1 // is value impossible?
380 ////////////////////////////////// initialize r^n table
382 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2)
388 bl poly1305_mult // r^2
392 bl poly1305_mult // r^3
396 bl poly1305_mult // r^4
399 sub x0,x0,#48 // restore original x0
411 ldp x8,x12,[x1,#32] // inp[2:3]
// Scalar limb-splitting of two input blocks into interleaved base 2^26
// lanes; each `add ...,lsl#32` packs two 26-bit limbs into one register
// (bfi-equivalent, per the comments). x3 carries the padbit into the
// top limb via the lsr#40 adds.
426 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
427 and x5,x9,#0x03ffffff
430 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
433 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
435 and x8,x8,#0x03ffffff
436 and x9,x9,#0x03ffffff
439 add x12,x3,x12,lsr#40
440 add x13,x3,x13,lsr#40
441 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
443 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
444 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
449 ldp x8,x12,[x1],#16 // inp[0:1]
452 ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
453 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
// v0-v8 now hold the interleaved r^n/s^n table lanes (x15 presumably
// points at the table filled by poly1305_splat — set in hidden lines).
462 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
463 and x5,x9,#0x03ffffff
466 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
469 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
471 and x8,x8,#0x03ffffff
472 and x9,x9,#0x03ffffff
475 add x12,x3,x12,lsr#40
476 add x13,x3,x13,lsr#40
477 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
479 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
480 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
485 ushr v31.2d,v31.2d,#38
// v31 presumably becomes the 26-bit lane mask (all-ones >> 38 =
// 0x03ffffff per lane) — its initialization is in a hidden line.
491 ////////////////////////////////////////////////////////////////
492 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
493 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
494 // ___________________/
495 // ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
496 // ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
497 // ___________________/ ____________________/
499 // Note that we start with inp[2:3]*r^2. This is because it
500 // doesn't depend on reduction in previous iteration.
501 ////////////////////////////////////////////////////////////////
502 // d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
503 // d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
504 // d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
505 // d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
506 // d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
// Widening multiplies: v14-v18 = inp[2:3] limbs, v19-v23 accumulate
// the 64-bit lane products d0-d4; scalar limb-splitting of the next
// input pair is interleaved between vector ops for latency hiding.
509 umull v23.2d,v14.2s,v7.s[2]
511 umull v22.2d,v14.2s,v5.s[2]
512 umull v21.2d,v14.2s,v3.s[2]
513 ldp x8,x12,[x16],#16 // inp[2:3] (or zero)
514 umull v20.2d,v14.2s,v1.s[2]
516 umull v19.2d,v14.2s,v0.s[2]
524 umlal v23.2d,v15.2s,v5.s[2]
525 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
526 umlal v22.2d,v15.2s,v3.s[2]
527 and x5,x9,#0x03ffffff
528 umlal v21.2d,v15.2s,v1.s[2]
530 umlal v20.2d,v15.2s,v0.s[2]
532 umlal v19.2d,v15.2s,v8.s[2]
533 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
535 umlal v23.2d,v16.2s,v3.s[2]
537 umlal v22.2d,v16.2s,v1.s[2]
539 umlal v21.2d,v16.2s,v0.s[2]
540 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
541 umlal v20.2d,v16.2s,v8.s[2]
543 umlal v19.2d,v16.2s,v6.s[2]
544 and x8,x8,#0x03ffffff
546 umlal v23.2d,v17.2s,v1.s[2]
547 and x9,x9,#0x03ffffff
548 umlal v22.2d,v17.2s,v0.s[2]
550 umlal v21.2d,v17.2s,v8.s[2]
552 umlal v20.2d,v17.2s,v6.s[2]
553 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
554 umlal v19.2d,v17.2s,v4.s[2]
557 add v11.2s,v11.2s,v26.2s
558 add x12,x3,x12,lsr#40
559 umlal v23.2d,v18.2s,v0.s[2]
560 add x13,x3,x13,lsr#40
561 umlal v22.2d,v18.2s,v8.s[2]
562 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
563 umlal v21.2d,v18.2s,v6.s[2]
564 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
565 umlal v20.2d,v18.2s,v4.s[2]
567 umlal v19.2d,v18.2s,v2.s[2]
570 ////////////////////////////////////////////////////////////////
571 // (hash+inp[0:1])*r^4 and accumulate
573 add v9.2s,v9.2s,v24.2s
575 umlal v22.2d,v11.2s,v1.s[0]
576 ldp x8,x12,[x1],#16 // inp[0:1]
577 umlal v19.2d,v11.2s,v6.s[0]
579 umlal v23.2d,v11.2s,v3.s[0]
580 umlal v20.2d,v11.2s,v8.s[0]
581 umlal v21.2d,v11.2s,v0.s[0]
589 add v10.2s,v10.2s,v25.2s
590 umlal v22.2d,v9.2s,v5.s[0]
591 umlal v23.2d,v9.2s,v7.s[0]
592 and x4,x8,#0x03ffffff // base 2^64 -> base 2^26
593 umlal v21.2d,v9.2s,v3.s[0]
594 and x5,x9,#0x03ffffff
595 umlal v19.2d,v9.2s,v0.s[0]
597 umlal v20.2d,v9.2s,v1.s[0]
600 add v12.2s,v12.2s,v27.2s
601 add x4,x4,x5,lsl#32 // bfi x4,x5,#32,#32
602 umlal v22.2d,v10.2s,v3.s[0]
604 umlal v23.2d,v10.2s,v5.s[0]
606 umlal v19.2d,v10.2s,v8.s[0]
607 add x6,x6,x7,lsl#32 // bfi x6,x7,#32,#32
608 umlal v21.2d,v10.2s,v1.s[0]
610 umlal v20.2d,v10.2s,v0.s[0]
611 and x8,x8,#0x03ffffff
613 add v13.2s,v13.2s,v28.2s
614 and x9,x9,#0x03ffffff
615 umlal v22.2d,v12.2s,v0.s[0]
617 umlal v19.2d,v12.2s,v4.s[0]
619 umlal v23.2d,v12.2s,v1.s[0]
620 add x8,x8,x9,lsl#32 // bfi x8,x9,#32,#32
621 umlal v20.2d,v12.2s,v6.s[0]
623 umlal v21.2d,v12.2s,v8.s[0]
624 add x12,x3,x12,lsr#40
626 umlal v22.2d,v13.2s,v8.s[0]
627 add x13,x3,x13,lsr#40
628 umlal v19.2d,v13.2s,v2.s[0]
629 add x10,x10,x11,lsl#32 // bfi x10,x11,#32,#32
630 umlal v23.2d,v13.2s,v0.s[0]
631 add x12,x12,x13,lsl#32 // bfi x12,x13,#32,#32
632 umlal v20.2d,v13.2s,v4.s[0]
634 umlal v21.2d,v13.2s,v6.s[0]
638 /////////////////////////////////////////////////////////////////
639 // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
642 // [see discussion in poly1305-armv4 module]
644 ushr v29.2d,v22.2d,#26
646 ushr v30.2d,v19.2d,#26
647 and v19.16b,v19.16b,v31.16b
648 add v23.2d,v23.2d,v29.2d // h3 -> h4
649 bic v27.2s,#0xfc,lsl#24 // &=0x03ffffff
650 add v20.2d,v20.2d,v30.2d // h0 -> h1
652 ushr v29.2d,v23.2d,#26
654 ushr v30.2d,v20.2d,#26
656 bic v28.2s,#0xfc,lsl#24
657 add v21.2d,v21.2d,v30.2d // h1 -> h2
659 add v19.2d,v19.2d,v29.2d
661 shrn v30.2s,v21.2d,#26
663 add v19.2d,v19.2d,v29.2d // h4 -> h0
664 bic v25.2s,#0xfc,lsl#24
665 add v27.2s,v27.2s,v30.2s // h2 -> h3
666 bic v26.2s,#0xfc,lsl#24
668 shrn v29.2s,v19.2d,#26
670 ushr v30.2s,v27.2s,#26
671 bic v27.2s,#0xfc,lsl#24
672 bic v24.2s,#0xfc,lsl#24
673 add v25.2s,v25.2s,v29.2s // h0 -> h1
674 add v28.2s,v28.2s,v30.2s // h3 -> h4
680 add v11.2s,v11.2s,v26.2s
682 ////////////////////////////////////////////////////////////////
683 // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
689 add v14.2s,v9.2s,v24.2s
690 add v17.2s,v12.2s,v27.2s
691 add v15.2s,v10.2s,v25.2s
692 add v18.2s,v13.2s,v28.2s
// Tail: high halves (umull2/umlal2) multiply by the r^2:r^1 lanes...
696 umull2 v19.2d,v16.4s,v6.4s
697 umull2 v22.2d,v16.4s,v1.4s
698 umull2 v23.2d,v16.4s,v3.4s
699 umull2 v21.2d,v16.4s,v0.4s
700 umull2 v20.2d,v16.4s,v8.4s
703 umlal2 v19.2d,v14.4s,v0.4s
704 umlal2 v21.2d,v14.4s,v3.4s
705 umlal2 v22.2d,v14.4s,v5.4s
706 umlal2 v23.2d,v14.4s,v7.4s
707 umlal2 v20.2d,v14.4s,v1.4s
710 umlal2 v19.2d,v15.4s,v8.4s
711 umlal2 v22.2d,v15.4s,v3.4s
712 umlal2 v21.2d,v15.4s,v1.4s
713 umlal2 v23.2d,v15.4s,v5.4s
714 umlal2 v20.2d,v15.4s,v0.4s
717 umlal2 v22.2d,v17.4s,v0.4s
718 umlal2 v23.2d,v17.4s,v1.4s
719 umlal2 v19.2d,v17.4s,v4.4s
720 umlal2 v20.2d,v17.4s,v6.4s
721 umlal2 v21.2d,v17.4s,v8.4s
723 umlal2 v22.2d,v18.4s,v8.4s
724 umlal2 v19.2d,v18.4s,v2.4s
725 umlal2 v23.2d,v18.4s,v0.4s
726 umlal2 v20.2d,v18.4s,v4.4s
727 umlal2 v21.2d,v18.4s,v6.4s
731 ////////////////////////////////////////////////////////////////
732 // (hash+inp[0:1])*r^4:r^3 and accumulate
734 add v9.2s,v9.2s,v24.2s
735 umlal v22.2d,v11.2s,v1.2s
736 umlal v19.2d,v11.2s,v6.2s
737 umlal v23.2d,v11.2s,v3.2s
738 umlal v20.2d,v11.2s,v8.2s
739 umlal v21.2d,v11.2s,v0.2s
741 add v10.2s,v10.2s,v25.2s
742 umlal v22.2d,v9.2s,v5.2s
743 umlal v19.2d,v9.2s,v0.2s
744 umlal v23.2d,v9.2s,v7.2s
745 umlal v20.2d,v9.2s,v1.2s
746 umlal v21.2d,v9.2s,v3.2s
748 add v12.2s,v12.2s,v27.2s
749 umlal v22.2d,v10.2s,v3.2s
750 umlal v19.2d,v10.2s,v8.2s
751 umlal v23.2d,v10.2s,v5.2s
752 umlal v20.2d,v10.2s,v0.2s
753 umlal v21.2d,v10.2s,v1.2s
755 add v13.2s,v13.2s,v28.2s
756 umlal v22.2d,v12.2s,v0.2s
757 umlal v19.2d,v12.2s,v4.2s
758 umlal v23.2d,v12.2s,v1.2s
759 umlal v20.2d,v12.2s,v6.2s
760 umlal v21.2d,v12.2s,v8.2s
762 umlal v22.2d,v13.2s,v8.2s
763 umlal v19.2d,v13.2s,v2.2s
764 umlal v23.2d,v13.2s,v0.2s
765 umlal v20.2d,v13.2s,v4.2s
766 umlal v21.2d,v13.2s,v6.2s
769 ////////////////////////////////////////////////////////////////
// Horizontal add folds the two lanes of each accumulator together;
// ABI-saved d8/d9 are restored and the return address re-signed
// (autiasp) here, before the final reduction.
772 addp v22.2d,v22.2d,v22.2d
773 ldp d8,d9,[sp,#16] // meet ABI requirements
774 addp v19.2d,v19.2d,v19.2d
776 addp v23.2d,v23.2d,v23.2d
778 addp v20.2d,v20.2d,v20.2d
780 addp v21.2d,v21.2d,v21.2d
782 .inst 0xd50323bf // autiasp
784 ////////////////////////////////////////////////////////////////
785 // lazy reduction, but without narrowing
787 ushr v29.2d,v22.2d,#26
788 and v22.16b,v22.16b,v31.16b
789 ushr v30.2d,v19.2d,#26
790 and v19.16b,v19.16b,v31.16b
792 add v23.2d,v23.2d,v29.2d // h3 -> h4
793 add v20.2d,v20.2d,v30.2d // h0 -> h1
795 ushr v29.2d,v23.2d,#26
796 and v23.16b,v23.16b,v31.16b
797 ushr v30.2d,v20.2d,#26
798 and v20.16b,v20.16b,v31.16b
799 add v21.2d,v21.2d,v30.2d // h1 -> h2
801 add v19.2d,v19.2d,v29.2d
803 ushr v30.2d,v21.2d,#26
804 and v21.16b,v21.16b,v31.16b
805 add v19.2d,v19.2d,v29.2d // h4 -> h0
806 add v22.2d,v22.2d,v30.2d // h2 -> h3
808 ushr v29.2d,v19.2d,#26
809 and v19.16b,v19.16b,v31.16b
810 ushr v30.2d,v22.2d,#26
811 and v22.16b,v22.16b,v31.16b
812 add v20.2d,v20.2d,v29.2d // h0 -> h1
813 add v23.2d,v23.2d,v30.2d // h3 -> h4
815 ////////////////////////////////////////////////////////////////
816 // write the result, can be partially reduced
818 st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
821 str x4,[x0,#8] // set is_base2_26
825 .size poly1305_blocks_neon,.-poly1305_blocks_neon
829 .long 0,0,0,0,0,0,0,0
830 .asciz "Poly1305 for ARMv8, CRYPTOGAMS by @dot-asm"
832 #if !defined(__KERNEL__) && !defined(_WIN64)
833 .comm OPENSSL_armcap_P,4,4
834 .hidden OPENSSL_armcap_P