.type	mul_1x1_ialu,%function
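@ mul_1x1_ialu: 32x32->64-bit multiplication in GF(2)[x], i.e. carry-less:
@ partial products are combined with XOR instead of ADD. Used on cores
@ without NEON. In: r1 = a, r0 = b, r12 = 0x1c (index mask), sp -> 32 bytes
@ of scratch for tab[8]. Out: product in r5 (low word) and r4 (high word);
@ r0, r1 and r12 survive the call, which the caller below relies on.
@ As a rough C model (an illustration, not part of the original source):
@
@	#include <stdint.h>
@
@	uint64_t mul_1x1(uint32_t a, uint32_t b)
@	{
@		uint64_t r = 0;
@		int i;
@		for (i = 0; i < 32; i++)
@			if ((b >> i) & 1)
@				r ^= (uint64_t)a << i;	/* XOR: no carries */
@		return r;
@	}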
.align	5
mul_1x1_ialu:
	mov	r4,#0
	bic	r5,r1,#3<<30		@ a1=a&0x3fffffff
	str	r4,[sp,#0]		@ tab[0]=0
	add	r6,r5,r5		@ a2=a1<<1
	str	r5,[sp,#4]		@ tab[1]=a1
	eor	r7,r5,r6		@ a1^a2
	str	r6,[sp,#8]		@ tab[2]=a2
	mov	r8,r5,lsl#2		@ a4=a1<<2
	str	r7,[sp,#12]		@ tab[3]=a1^a2
	eor	r9,r5,r8		@ a1^a4
	str	r8,[sp,#16]		@ tab[4]=a4
	eor	r4,r6,r8		@ a2^a4
	str	r9,[sp,#20]		@ tab[5]=a1^a4
	eor	r7,r7,r8		@ a1^a2^a4
	str	r4,[sp,#24]		@ tab[6]=a2^a4
	and	r8,r12,r0,lsl#2
	str	r7,[sp,#28]		@ tab[7]=a1^a2^a4
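@ tab[] now holds every multiple of a1 by a 3-bit polynomial: since
@ a2 = a1*x and a4 = a1*x^2 in GF(2)[x],
@ tab[i] = (i&1 ? a1 : 0) ^ (i&2 ? a2 : 0) ^ (i&4 ? a4 : 0),
@ so b can be multiplied three bits per step by table lookup below
@ instead of one bit at a time.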
	and	r9,r12,r0,lsr#1
	ldr	r5,[sp,r8]		@ tab[b       & 0x7]
	and	r8,r12,r0,lsr#4
	ldr	r7,[sp,r9]		@ tab[b >>  3 & 0x7]
	and	r9,r12,r0,lsr#7
	ldr	r6,[sp,r8]		@ tab[b >>  6 & 0x7]
	eor	r5,r5,r7,lsl#3		@ stall
	mov	r4,r7,lsr#29
	ldr	r7,[sp,r9]		@ tab[b >>  9 & 0x7]

	and	r8,r12,r0,lsr#10
	eor	r5,r5,r6,lsl#6
	eor	r4,r4,r6,lsr#26
	ldr	r6,[sp,r8]		@ tab[b >> 12 & 0x7]

	and	r9,r12,r0,lsr#13
	eor	r5,r5,r7,lsl#9
	eor	r4,r4,r7,lsr#23
	ldr	r7,[sp,r9]		@ tab[b >> 15 & 0x7]

	and	r8,r12,r0,lsr#16
	eor	r5,r5,r6,lsl#12
	eor	r4,r4,r6,lsr#20
	ldr	r6,[sp,r8]		@ tab[b >> 18 & 0x7]

	and	r9,r12,r0,lsr#19
	eor	r5,r5,r7,lsl#15
	eor	r4,r4,r7,lsr#17
	ldr	r7,[sp,r9]		@ tab[b >> 21 & 0x7]

	and	r8,r12,r0,lsr#22
	eor	r5,r5,r6,lsl#18
	eor	r4,r4,r6,lsr#14
	ldr	r6,[sp,r8]		@ tab[b >> 24 & 0x7]

	and	r9,r12,r0,lsr#25
	eor	r5,r5,r7,lsl#21
	eor	r4,r4,r7,lsr#11
	ldr	r7,[sp,r9]		@ tab[b >> 27 & 0x7]

	tst	r1,#1<<30
	and	r8,r12,r0,lsr#28
	eor	r5,r5,r6,lsl#24
	eor	r4,r4,r6,lsr#8
	ldr	r6,[sp,r8]		@ tab[b >> 30      ]

	eorne	r5,r5,r0,lsl#30
	eorne	r4,r4,r0,lsr#2
	tst	r1,#1<<31
	eor	r5,r5,r7,lsl#27
	eor	r4,r4,r7,lsr#5
	eorne	r5,r5,r0,lsl#31
	eorne	r4,r4,r0,lsr#1
	eor	r5,r5,r6,lsl#30
	eor	r4,r4,r6,lsr#2

	mov	pc,lr
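@ The fully unrolled chain above processes b in 3-bit digits: for each
@ digit d = (b >> 3k) & 7 it xors tab[d] << 3k into r5 and
@ tab[d] >> (32-3k) into r4. The two top bits of a, masked off by
@ "bic r5,r1,#3<<30" so that a4 = a1<<2 cannot overflow 32 bits, are
@ folded back in by the conditional eors: if bit 30 (resp. 31) of a is
@ set, b<<30 / b>>2 (resp. b<<31 / b>>1) is xored into the result.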
.size	mul_1x1_ialu,.-mul_1x1_ialu

.global	bn_GF2m_mul_2x2
.type	bn_GF2m_mul_2x2,%function
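@ void bn_GF2m_mul_2x2(BN_ULONG r[4], BN_ULONG a1, BN_ULONG a0,
@                      BN_ULONG b1, BN_ULONG b0);
@ computes the 128-bit carry-less product of the 64-bit polynomials
@ a = a1*x^32 + a0 and b = b1*x^32 + b0, with r[0] the least significant
@ word. Arguments arrive in r0-r3 plus one stack slot, per the AAPCS.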
.align	5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
.Lpic:	ldr	r12,[pc,r12]
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r10,lr}
	mov	r10,r0			@ reassign 1st argument
	mov	r0,r3			@ r0=b1
	ldr	r3,[sp,#32]		@ load b0
	mov	r12,#7<<2
	sub	sp,sp,#32		@ allocate tab[8]
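@ 2x2 -> 4-word product via Karatsuba: only three 1x1 multiplications,
@ a1*b1, a0*b0 and (a1^a0)*(b1^b0), because in GF(2)[x]
@	a1*b0 ^ a0*b1 = (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0,
@ so the middle term is recovered from the three partial products with
@ xors only (addition and subtraction coincide in characteristic 2).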
	bl	mul_1x1_ialu		@ a1·b1
	str	r5,[r10,#8]
	str	r4,[r10,#12]
	eor	r0,r0,r3		@ flip b0 and b1
	eor	r1,r1,r2		@ flip a0 and a1
	eor	r3,r3,r0
	eor	r2,r2,r1
	eor	r0,r0,r3		@ 3-eor swap: now r0=b0, r3=b1
	eor	r1,r1,r2		@ now r1=a0, r2=a1
	bl	mul_1x1_ialu		@ a0·b0
	str	r5,[r10]
	str	r4,[r10,#4]

	eor	r1,r1,r2		@ a0^a1
	eor	r0,r0,r3		@ b0^b1
	bl	mul_1x1_ialu		@ (a1+a0)·(b1+b0)
	ldmia	r10,{r6-r9}		@ lo(a0b0),hi(a0b0),lo(a1b1),hi(a1b1)
	eor	r5,r5,r4
	eor	r4,r4,r7
	eor	r5,r5,r6
	eor	r4,r4,r8
	eor	r5,r5,r9
	eor	r4,r4,r9
	str	r4,[r10,#8]
	eor	r5,r5,r4
	add	sp,sp,#32		@ destroy tab[8]
	str	r5,[r10,#4]
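@ r5:r4 held (a1^a0)*(b1^b0); the eors above xor in a1*b1 and a0*b0
@ (reloaded via ldmia) to recover the middle term a1*b0 ^ a0*b1, which
@ lands, shifted 32 bits up, across r[1] and r[2]. r[0] and r[3] were
@ already final after the first two calls.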
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r10,pc}
#else
	ldmia	sp!,{r4-r10,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	.word	0xe12fff1e		@ interoperable with Thumb ISA:-)
#endif
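@ (0xe12fff1e is the encoding of "bx lr". Assembled as a raw .word it
@ keeps the file buildable with a plain ARMv4 assembler, and it is only
@ reached when bit 0 of lr is set, i.e. when the caller is Thumb code on
@ a core that does have bx.)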
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.align	5
.LNEON:
	ldr	r12, [sp]		@ 5th argument
	vmov	d26, r2, r1		@ A = a0 | a1<<32
	vmov	d27, r12, r3		@ B = b0 | b1<<32
	vmov.i64	d28, #0x0000ffffffffffff	@ k48
	vmov.i64	d29, #0x00000000ffffffff	@ k32
	vmov.i64	d30, #0x000000000000ffff	@ k16
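@ NEON path: d26 = a, d27 = b, and a full 64x64->128-bit carry-less
@ multiply is synthesized from vmull.p8, which performs eight 8x8->16
@ polynomial multiplies in parallel. Byte-rotated copies of A and B
@ (A1..A3, B1..B4 below) are multiplied so that pairs such as E = A*B1
@ and F = A1*B line up matching partial products in adjacent byte lanes;
@ xoring a pair (L, M, N, t3) and trimming it with the k48/k32/k16 masks
@ isolates the contribution that belongs 8/16/24/32 bits up, and the
@ vext rotations shift each correction into place before everything is
@ xored into the plain product D = A*B.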
	vext.8	d2, d26, d26, #1	@ A1
	vmull.p8	q1, d2, d27		@ F = A1*B
	vext.8	d0, d27, d27, #1	@ B1
	vmull.p8	q0, d26, d0		@ E = A*B1
	vext.8	d4, d26, d26, #2	@ A2
	vmull.p8	q2, d4, d27		@ H = A2*B
	vext.8	d16, d27, d27, #2	@ B2
	vmull.p8	q8, d26, d16		@ G = A*B2
	vext.8	d6, d26, d26, #3	@ A3
	veor	q1, q1, q0		@ L = E + F
	vmull.p8	q3, d6, d27		@ J = A3*B
	vext.8	d0, d27, d27, #3	@ B3
	veor	q2, q2, q8		@ M = G + H
	vmull.p8	q0, d26, d0		@ I = A*B3
	veor	d2, d2, d3		@ t0 = (L) (P0 + P1) << 8
	vand	d3, d3, d28
	vext.8	d16, d27, d27, #4	@ B4
	veor	d4, d4, d5		@ t1 = (M) (P2 + P3) << 16
	vand	d5, d5, d29
	vmull.p8	q8, d26, d16		@ K = A*B4
	veor	q3, q3, q0		@ N = I + J
	veor	d2, d2, d3
	veor	d4, d4, d5
	veor	d6, d6, d7		@ t2 = (N) (P4 + P5) << 24
	vand	d7, d7, d30
	vext.8	q1, q1, q1, #15
	veor	d16, d16, d17		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d17, #0
	vext.8	q2, q2, q2, #14
	veor	d6, d6, d7
	vmull.p8	q0, d26, d27		@ D = A*B
	vext.8	q8, q8, q8, #12
	vext.8	q3, q3, q3, #13
	veor	q1, q1, q2
	veor	q3, q3, q8
	veor	q0, q0, q1
	veor	q0, q0, q3

	vst1.32	{q0}, [r0]		@ r[0..3], least significant first
	bx	lr
#endif
.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align	5
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif