.short 0x0000,0x1C20,0x3840,0x2460
.short 0x7080,0x6CA0,0x48C0,0x54E0
.short 0xE100,0xFD20,0xD940,0xC560
.short 0x9180,0x8DA0,0xA9C0,0xB5E0
.size rem_4bit,.-rem_4bit
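@ rem_4bit above holds the 16 pre-shifted reduction constants used by the
@ 4-bit table-driven (Shoup) GHASH code below: whenever four bits are
@ shifted out of Xi, rem_4bit[rem] is XORed back in (see the
@ "^= rem_4bit[rem]" steps) to reduce modulo the GHASH polynomial.
@ rem_4bit_get, judging from the code, simply returns the address of this
@ table in r2 for the multipliers.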
.type rem_4bit_get,%function
    sub r2,r2,#32 @ &rem_4bit
.size rem_4bit_get,.-rem_4bit_get
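@ gcm_ghash_4bit folds len bytes of input into Xi using the 4-bit table
@ method. Assuming the usual OpenSSL GHASH calling convention, the C
@ prototype is roughly:
@
@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);
@
@ i.e. r0=Xi, r1=Htable, r2=inp, r3=len. The "add r3,r2,r3" below turns
@ len into an end-of-input pointer, and rem_4bit is copied to the stack so
@ the reduction constants stay a single load away inside the loop.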
.global gcm_ghash_4bit
.type gcm_ghash_4bit,%function
    add r3,r2,r3 @ r3 to point at the end
    stmdb sp!,{r3-r11,lr} @ save r3/end too
    sub r12,r12,#48 @ &rem_4bit
    ldmia r12,{r4-r11} @ copy rem_4bit ...
    stmdb sp!,{r4-r11} @ ... to stack
    ldmia r7,{r4-r7} @ load Htbl[nlo]
    ldmia r11,{r8-r11} @ load Htbl[nhi]
    ldrh r8,[sp,r14] @ rem_4bit[rem]
    ldmia r11,{r8-r11} @ load Htbl[nlo]
    ldrh r8,[sp,r12] @ rem_4bit[rem]
    eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
    ldmia r11,{r8-r11} @ load Htbl[nhi]
    eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
    ldr r3,[sp,#32] @ re-load r3/end
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
    ldmia sp!,{r4-r11,pc}
    ldmia sp!,{r4-r11,lr}
    moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
.size gcm_ghash_4bit,.-gcm_ghash_4bit
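@ gcm_gmult_4bit performs one multiplication of Xi by H in GF(2^128) with
@ the same 4-bit tables, without consuming input. Under the same assumed
@ convention the C prototype is roughly:
@
@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
@
@ with r0=Xi and r1=Htable; here rem_4bit is addressed directly through r2
@ (see the ldrh ...,[r2,...] loads) instead of being copied to the stack.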
.global gcm_gmult_4bit
.type gcm_gmult_4bit,%function
    stmdb sp!,{r4-r11,lr}
    ldmia r7,{r4-r7} @ load Htbl[nlo]
    and r14,r4,#0xf @ rem
    ldmia r11,{r8-r11} @ load Htbl[nhi]
    ldrh r8,[r2,r14] @ rem_4bit[rem]
    and r12,r4,#0xf @ rem
    ldmia r11,{r8-r11} @ load Htbl[nlo]
    ldrh r8,[r2,r12] @ rem_4bit[rem]
    and r14,r4,#0xf @ rem
    eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
    ldmia r11,{r8-r11} @ load Htbl[nhi]
    ldrh r8,[r2,r14] @ rem_4bit[rem]
    eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
    ldmia sp!,{r4-r11,pc}
    ldmia sp!,{r4-r11,lr}
    moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
.size gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
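@ The rest of the file is the NEON path, assembled only when
@ __ARM_MAX_ARCH__>=7. gcm_init_neon pre-computes the "twisted" H used by
@ the vmull.p8 multipliers; assuming the usual convention it corresponds
@ to something like:
@
@ void gcm_init_neon(u128 Htable[16], const u64 H[2]);
@
@ with, presumably, r0 = Htable (output) and r1 = H (loaded below).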
.global gcm_init_neon
.type gcm_init_neon,%function
    vld1.64 d7,[r1,:64]! @ load H
    vshr.u64 d16,#63 @ t0=0xc2....01
    vshr.s8 q9,#7 @ broadcast carry bit
    veor q3,q3,q8 @ twisted H
.size gcm_init_neon,.-gcm_init_neon
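@ gcm_gmult_neon: one GF(2^128) multiplication of Xi by the twisted H,
@ roughly void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]) under the
@ same assumed convention (r0=Xi, r1=Htable). d26-d27 receive the twisted
@ H and d28 = d26^d27 is the shared Karatsuba pre-processing term.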
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
    vld1.64 d7,[r0,:64]! @ load Xi
    vmov.i64 d29,#0x0000ffffffffffff
    vldmia r1,{d26-d27} @ load twisted H
    vmov.i64 d30,#0x00000000ffffffff
    vmov.i64 d31,#0x000000000000ffff
    veor d28,d26,d27 @ Karatsuba pre-processing
.size gcm_gmult_neon,.-gcm_gmult_neon
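@ gcm_ghash_neon: the NEON bulk hash, roughly
@ void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len)
@ under the same assumed convention (r0=Xi, r1=Htable, r2=inp, r3=len).
@ Each 16-byte block is folded into Xi and multiplied by the twisted H
@ using three 64x64 carry-less products combined Karatsuba-style.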
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
    vld1.64 d1,[r0,:64]! @ load Xi
    vmov.i64 d29,#0x0000ffffffffffff
    vldmia r1,{d26-d27} @ load twisted H
    vmov.i64 d30,#0x00000000ffffffff
    vmov.i64 d31,#0x000000000000ffff
    veor d28,d26,d27 @ Karatsuba pre-processing
    vld1.64 d7,[r2]! @ load inp
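@ Plain NEON has no 64x64 polynomial multiply (PMULL is ARMv8 only), so
@ each 64-bit carry-less product is emulated with vmull.p8: eight 8-bit
@ partial products (D and E..K below) are built from byte-rotated copies
@ of the operands, then masked, byte-shifted into place and XORed
@ together. This first group multiplies d26 (one half of the twisted H)
@ by d6 and leaves that Karatsuba partial product in q0.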
    vext.8 d16, d26, d26, #1 @ A1
    vmull.p8 q8, d16, d6 @ F = A1*B
    vext.8 d0, d6, d6, #1 @ B1
    vmull.p8 q0, d26, d0 @ E = A*B1
    vext.8 d18, d26, d26, #2 @ A2
    vmull.p8 q9, d18, d6 @ H = A2*B
    vext.8 d22, d6, d6, #2 @ B2
    vmull.p8 q11, d26, d22 @ G = A*B2
    vext.8 d20, d26, d26, #3 @ A3
    veor q8, q8, q0 @ L = E + F
    vmull.p8 q10, d20, d6 @ J = A3*B
    vext.8 d0, d6, d6, #3 @ B3
    veor q9, q9, q11 @ M = G + H
    vmull.p8 q0, d26, d0 @ I = A*B3
    veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
    vext.8 d22, d6, d6, #4 @ B4
    veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
    vmull.p8 q11, d26, d22 @ K = A*B4
    veor q10, q10, q0 @ N = I + J
    veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
    vext.8 q8, q8, q8, #15
    veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
    vext.8 q9, q9, q9, #14
    vmull.p8 q0, d26, d6 @ D = A*B
    vext.8 q11, q11, q11, #12
    vext.8 q10, q10, q10, #13
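@ Second vmull.p8 group: the same emulation applied to the Karatsuba
@ middle term. d6 is first folded to d6^d7 ("Karatsuba pre-processing")
@ and multiplied by d28 = d26^d27; this partial product lands in q1.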
    veor d6,d6,d7 @ Karatsuba pre-processing
    vext.8 d16, d28, d28, #1 @ A1
    vmull.p8 q8, d16, d6 @ F = A1*B
    vext.8 d2, d6, d6, #1 @ B1
    vmull.p8 q1, d28, d2 @ E = A*B1
    vext.8 d18, d28, d28, #2 @ A2
    vmull.p8 q9, d18, d6 @ H = A2*B
    vext.8 d22, d6, d6, #2 @ B2
    vmull.p8 q11, d28, d22 @ G = A*B2
    vext.8 d20, d28, d28, #3 @ A3
    veor q8, q8, q1 @ L = E + F
    vmull.p8 q10, d20, d6 @ J = A3*B
    vext.8 d2, d6, d6, #3 @ B3
    veor q9, q9, q11 @ M = G + H
    vmull.p8 q1, d28, d2 @ I = A*B3
    veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
    vext.8 d22, d6, d6, #4 @ B4
    veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
    vmull.p8 q11, d28, d22 @ K = A*B4
    veor q10, q10, q1 @ N = I + J
    veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
    vext.8 q8, q8, q8, #15
    veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
    vext.8 q9, q9, q9, #14
    vmull.p8 q1, d28, d6 @ D = A*B
    vext.8 q11, q11, q11, #12
    vext.8 q10, q10, q10, #13
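@ Third vmull.p8 group: d27 (the other half of the twisted H) times d7,
@ producing the remaining Karatsuba partial product in q2.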
    vext.8 d16, d27, d27, #1 @ A1
    vmull.p8 q8, d16, d7 @ F = A1*B
    vext.8 d4, d7, d7, #1 @ B1
    vmull.p8 q2, d27, d4 @ E = A*B1
    vext.8 d18, d27, d27, #2 @ A2
    vmull.p8 q9, d18, d7 @ H = A2*B
    vext.8 d22, d7, d7, #2 @ B2
    vmull.p8 q11, d27, d22 @ G = A*B2
    vext.8 d20, d27, d27, #3 @ A3
    veor q8, q8, q2 @ L = E + F
    vmull.p8 q10, d20, d7 @ J = A3*B
    vext.8 d4, d7, d7, #3 @ B3
    veor q9, q9, q11 @ M = G + H
    vmull.p8 q2, d27, d4 @ I = A*B3
    veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
    vext.8 d22, d7, d7, #4 @ B4
    veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
    vmull.p8 q11, d27, d22 @ K = A*B4
    veor q10, q10, q2 @ N = I + J
    veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
    vext.8 q8, q8, q8, #15
    veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
    vext.8 q9, q9, q9, #14
    vmull.p8 q2, d27, d7 @ D = A*B
    vext.8 q11, q11, q11, #12
    vext.8 q10, q10, q10, #13
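@ Karatsuba post-processing folds the three partial products (q0, q1, q2)
@ into the full 256-bit product Xh|Xl, which is then reduced modulo the
@ GHASH polynomial in two phases (the shift-by-57 / shift-by-1 sequence
@ noted below as the equivalent of reduction_avx from ghash-x86_64.pl).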
    veor q1,q1,q0 @ Karatsuba post-processing
    veor d4,d4,d3 @ Xh|Xl - 256-bit result
    @ equivalent of reduction_avx from ghash-x86_64.pl
    vshl.i64 q9,q0,#57 @ 1st phase
    vshr.u64 q10,q0,#1 @ 2nd phase
    vst1.64 d1,[r0,:64]! @ write out Xi
.size gcm_ghash_neon,.-gcm_ghash_neon
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"