# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init poly1305_init_arm
# define poly1305_blocks poly1305_blocks_arm
# define poly1305_emit poly1305_emit_arm
.globl	poly1305_blocks_neon
#if defined(__thumb2__)
.globl	poly1305_blocks
.type	poly1305_init,%function
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#36]		@ clear is_base2_26
#if __ARM_MAX_ARCH__>=7
	str	r3,[r0,#28]		@ impossible key power value
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
	and	r3,r10,#-4		@ 0x0ffffffc
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# if defined(__APPLE__) || defined(_WIN32)
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	tst	r12,#ARMV7_NEON		@ check for NEON
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	orr	r11,r11,#1		@ thumb-ify addresses
	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	stmia	r2,{r11,r12}		@ fill functions table
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
	stmdb	sp!,{r3-r11,lr}
	add	r2,r2,r1		@ end pointer
	ldmia	r0,{r4-r12}		@ load context
	str	r2,[sp,#16]		@ offload stuff
	ldr	lr,[r0,#36]		@ is_base2_26
	ldmia	r0!,{r4-r8}		@ load hash value
	str	r2,[sp,#16]		@ offload stuff
	adds	r9,r4,r5,lsl#26		@ base 2^26 -> base 2^32
	adcs	r10,r10,r6,lsl#20
	adcs	r11,r11,r7,lsl#14
	adcs	r12,r12,r8,lsl#8
	str	r2,[r0,#16]		@ clear is_base2_26
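@ The shift amounts above follow from the radix change: with
@ h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104, limb k lands
@ at bit offset 26*k mod 32 of 32-bit word 26*k/32, hence the
@ shifts by 26, 20, 14 and 8.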
	movne	r4,r9			@ choose between radixes
	ldmia	r0,{r9-r12}		@ load key
	ldrb	r0,[lr],#16		@ load input
	addhi	r8,r8,#1		@ 1<<128
	adds	r4,r4,r3		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	add	r10,r10,r10,lsr#2
	ldr	r0,[lr],#16		@ load input
	addhi	r8,r8,#1		@ padbit
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	add	r10,r10,r10,lsr#2
	add	r11,r11,r11,lsr#2
	add	r12,r12,r12,lsr#2
	ldr	r10,[sp,#20]		@ reload r10
	str	r0,[sp,#0]		@ future r4
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5
	ldr	r12,[sp,#28]		@ reload r12
	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	add	r8,r8,r3		@ h4+=d3>>32
	add	r1,r1,r1,lsr#2		@ *=5
	cmp	r0,lr			@ done yet?
	stmdb	r0,{r4-r8}		@ store the result
	ldmia	sp!,{r3-r11,pc}
	ldmia	sp!,{r3-r11,lr}
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
	ldr	ip,[r0,#36]		@ is_base2_26
	adds	r8,r3,r4,lsl#26		@ base 2^26 -> base 2^32
	adcs	r10,r10,r6,lsl#14
	adcs	r11,r11,r7,lsl#8
	adds	r8,r3,#5		@ compare to modulus
	tst	r0,#4			@ did it carry/borrow?
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
.size	poly1305_emit,.-poly1305_emit
#if __ARM_MAX_ARCH__>=7
.type	poly1305_init_neon,%function
.Lpoly1305_init_neon:
	ldr	r3,[r0,#48]		@ first table element
	cmp	r3,#-1			@ is value impossible?
	ldr	r4,[r0,#20]		@ load key base 2^32
	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff
	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
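@ The 5* factors implement the reduction mod 2^130-5: a product
@ term hi*rj with i+j >= 5 carries weight 2^(26*(i+j)), and
@ since 2^130 = 5 mod (2^130-5), it folds into limb i+j-5 with
@ an extra factor of 5.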
	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]
	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]
	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]
	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]
	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@ H0>>+H1>>+H2>>+H3>>+H4
@ H3>>+H4>>*5+H0>>+H1
@ The result of multiplying an n-bit number by an m-bit number
@ is n+m bits wide. However! Even though 2^n is an (n+1)-bit
@ number, an m-bit number multiplied by 2^n is still n+m bits
@ wide.
@ The sum of two n-bit numbers is n+1 bits wide, the sum of
@ three is n+2 bits wide, and so is the sum of four. The sum of
@ 2^m (n-m)-bit numbers and one n-bit number is n+1 bits wide.
@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
@ can be 27. However! In cases when their width exceeds 26 bits
@ they are bounded by 2^26+2^6. This in turn means that the
@ *sum* of the products with these values can still be viewed
@ as a sum of 52-bit numbers as long as the number of addends
@ is not a power of 2. For example,
@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0*R4,
@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
@ 5 * (2^52 + 2^33 + 2^12), which in turn is smaller than
@ 8 * (2^52) or 2^55. However, the value is then multiplied
@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
@ which is less than 32 * (2^52) or 2^57. And when processing
@ data we are looking at triple as many addends...
@ In the key setup procedure the pre-reduced H0 is limited by
@ 5*4+1 52-bit addends, and 5*H4 by 5*5 of them, or 57 bits. But
@ when hashing input, H0 is limited by (5*4+1)*3 addends, or 58
@ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this
@ relevant? The vmlal.u32 instruction accepts 2x32-bit inputs
@ and writes a 2x64-bit result. This means that the result of
@ the reduction has to be compressed upon loop wrap-around. This
@ can be done in the process of reduction to minimize the number
@ of instructions [as well as the number of 128-bit
@ instructions, which benefits low-end processors], but one has
@ to watch for H2 (which is narrower than H0) and 5*H4 not being
@ wider than 58 bits, so that the result of a right shift by 26
@ bits fits in 32 bits. This is also useful on x86, because it
@ allows using paddd in place of paddq, which benefits Atom,
@ where paddq is ridiculously slow.
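@ Accordingly, the carry chain below runs h3->h4 and h0->h1
@ first, then h1->h2, then h4->h0 (the h4 carry is multiplied
@ by 5 before re-entering at h0) together with h2->h3, and
@ finally h0->h1 and h3->h4 once more, which keeps every limb
@ within the widths argued above.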
	vadd.i64	q9,q9,q15	@ h3 -> h4
	vbic.i32	d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64	q6,q6,q4	@ h0 -> h1
	vbic.i32	d10,#0xfc000000
	vadd.i64	q7,q7,q4	@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000
	vadd.i32	d10,d10,d30	@ h4 -> h0
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	beq	.Lsquare_break_neon
	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)
	vtrn.32	d0,d10			@ r^2:r^1
	vshl.u32	d4,d3,#2	@ *5
	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]
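@ As the 48+n*9*4 addressing suggests, the key power table is
@ laid out as four 9-word rows, row n holding r^(n+1) as the
@ limbs r0..r4 plus the precomputed multiples 5*r1..5*r4; two
@ rows are written per pass, one from each 32-bit lane, so a
@ later vld4 can pull in an adjacent pair of powers at once.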
	add	r6,r0,#(48+2*9*4)
	add	r7,r0,#(48+3*9*4)
	vmov	d0,d10			@ r^4:r^3
	vshl.u32	d2,d12,#2	@ *5
	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
.size	poly1305_init_neon,.-poly1305_init_neon
.type	poly1305_blocks_neon,%function
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	blo	.Lpoly1305_blocks
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	tst	ip,ip			@ is_base2_26?
	bl	.Lpoly1305_init_neon
	ldr	r4,[r0,#0]		@ load hash value base 2^32
	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ set is_base2_26
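@ The masking above is the inverse radix change: five 26-bit
@ limbs are carved out of the four 32-bit hash words, h0 =
@ w0 & 0x3ffffff, h1 = (w0>>26 | w1<<6) & 0x3ffffff, and so
@ on, stepping 26 bits at a time.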
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind
	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi
	vbic.i32	d26,#0xfc000000
	vbic.i32	d24,#0xfc000000
	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vbic.i32	q13,#0xfc000000
	vbic.i32	q12,#0xfc000000
	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
@   \___________________/
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
@   \___________________/ \____________________/
@
@ Note that we start with inp[2:3]*r^2. This is because it
@ doesn't depend on the reduction in the previous iteration.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
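@ Concretely, the two 32-bit lanes of each vector carry two
@ interleaved block streams, the odd stream trailing the even
@ one by one power of r; the wrap-up pass multiplies the lanes
@ by adjacent powers (r^2:r^1 or r^4:r^3, depending on how much
@ input remains), and the horizontal addition at the very end
@ folds both streams into one hash.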
	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vmull.u32	q5,d21,d0[1]
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vmull.u32	q6,d23,d0[1]
	vmull.u32	q9,d29,d0[1]
	vmlal.u32	q5,d29,d2[1]
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]
	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]
	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]
	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4 and accumulate
	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]
	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]
	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]
	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26 of
@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
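@ Interleaving the reduction with the radix conversion of the
@ next blocks is a scheduling choice: the two sequences touch
@ disjoint registers (q4-q9 and q15 versus q10-q14) and neither
@ depends on the other's results until the next multiplication
@ round, which presumably helps hide latency on in-order NEON
@ implementations.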
	vadd.i64	q9,q9,q15	@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4	@ h0 -> h1
	vbic.i32	d10,#0xfc000000
	vadd.i64	q7,q7,q4	@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000
	vbic.i32	q13,#0xfc000000
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000
	vshrn.u64	d30,q5,#26	@ re-narrow
	vbic.i32	q10,#0xfc000000
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2
	vadd.i32	d24,d24,d14	@ can be redundant
	vadd.i32	d20,d20,d10
	vadd.i32	d26,d26,d16
	vadd.i32	d22,d22,d12
	vadd.i32	d28,d28,d18
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vld1.32	d8[1],[r7,:32]
	vld1.32	d8[0],[r6,:32]
	addne	r7,r0,#(48+2*9*4)
	addne	r6,r0,#(48+3*9*4)
	vorn	q0,q0,q0		@ all-ones, can be redundant
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4:r^3 and accumulate
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vld1.32	d8[1],[r7,:32]
	vld1.32	d8[0],[r6,:32]
	vorn	q0,q0,q0		@ all-ones
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition
	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15
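@ Each 128-bit accumulator q5-q9 holds two 64-bit lane sums, one
@ per interleaved block stream; adding the high half into the
@ low one (e.g. d10 += d11 for q5) collapses the two streams
@ into a single set of five 64-bit limbs ahead of the final
@ reduction.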
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing
	vadd.i64	q9,q9,q15	@ h3 -> h4
	vadd.i64	q6,q6,q4	@ h0 -> h1
	vadd.i64	q7,q7,q4	@ h1 -> h2
	vadd.i64	q5,q5,q15	@ h4 -> h0
	vadd.i64	q8,q8,q4	@ h2 -> h3
	vadd.i64	q6,q6,q15	@ h0 -> h1
	vadd.i64	q9,q9,q4	@ h3 -> h4
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]
	vldmia	sp!,{d8-d15}		@ epilogue
.size	poly1305_blocks_neon,.-poly1305_blocks_neon
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.word	OPENSSL_armcap_P
.word	OPENSSL_armcap_P-.Lpoly1305_init
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by @dot-asm"