	.short 0x0000,0x1C20,0x3840,0x2460
	.short 0x7080,0x6CA0,0x48C0,0x54E0
	.short 0xE100,0xFD20,0xD940,0xC560
	.short 0x9180,0x8DA0,0xA9C0,0xB5E0
	.size rem_4bit,.-rem_4bit
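@ rem_4bit holds the per-nibble reduction constants for the 4-bit
@ table-driven GHASH: each time Xi is shifted right by four bits, the
@ nibble shifted out selects one of these sixteen 16-bit values, which
@ is XORed into the top of the result to reduce modulo the GHASH
@ polynomial x^128+x^7+x^2+x+1.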
	.type rem_4bit_get,%function
	sub r2,r2,#32 @ &rem_4bit
	.size rem_4bit_get,.-rem_4bit_get
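@ rem_4bit_get returns the address of rem_4bit in r2; the #32
@ adjustment matches the size of the table above (sixteen .short
@ entries, i.e. 32 bytes) sitting immediately before this routine.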
	.global gcm_ghash_4bit
	.hidden gcm_ghash_4bit
	.type gcm_ghash_4bit,%function
	add r3,r2,r3 @ r3 to point at the end
	stmdb sp!,{r3-r11,lr} @ save r3/end too
	sub r12,r12,#48 @ &rem_4bit
	ldmia r12,{r4-r11} @ copy rem_4bit ...
	stmdb sp!,{r4-r11} @ ... to stack
	ldmia r7,{r4-r7} @ load Htbl[nlo]
	ldmia r11,{r8-r11} @ load Htbl[nhi]
	ldrh r8,[sp,r14] @ rem_4bit[rem]
	ldmia r11,{r8-r11} @ load Htbl[nlo]
	ldrh r8,[sp,r12] @ rem_4bit[rem]
	eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
	ldmia r11,{r8-r11} @ load Htbl[nhi]
	eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem]
	ldr r3,[sp,#32] @ re-load r3/end
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
	ldmia sp!,{r4-r11,pc}
	ldmia sp!,{r4-r11,lr}
	moveq pc,lr @ be binary compatible with V4, yet
	.word 0xe12fff1e @ interoperable with Thumb ISA:-)
	.size gcm_ghash_4bit,.-gcm_ghash_4bit
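@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len)
@ For each 16-byte block of inp, the loop above XORs the block into
@ Xi and multiplies the result by H in GF(2^128), walking Htable one
@ nibble at a time and reducing with the rem_4bit constants that were
@ copied onto the stack at entry.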
	.global gcm_gmult_4bit
	.hidden gcm_gmult_4bit
	.type gcm_gmult_4bit,%function
	stmdb sp!,{r4-r11,lr}
	ldmia r7,{r4-r7} @ load Htbl[nlo]
	and r14,r4,#0xf @ rem
	ldmia r11,{r8-r11} @ load Htbl[nhi]
	ldrh r8,[r2,r14] @ rem_4bit[rem]
	and r12,r4,#0xf @ rem
	ldmia r11,{r8-r11} @ load Htbl[nlo]
	ldrh r8,[r2,r12] @ rem_4bit[rem]
	and r14,r4,#0xf @ rem
	eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
	ldmia r11,{r8-r11} @ load Htbl[nhi]
	ldrh r8,[r2,r14] @ rem_4bit[rem]
	eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
#elif defined(__ARMEB__)
	ldmia sp!,{r4-r11,pc}
	ldmia sp!,{r4-r11,lr}
	moveq pc,lr @ be binary compatible with V4, yet
	.word 0xe12fff1e @ interoperable with Thumb ISA:-)
	.size gcm_gmult_4bit,.-gcm_gmult_4bit
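@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
@ Single-block variant of the routine above: Xi is multiplied by H in
@ place using the same nibble-by-nibble table walk, with r2 pointing
@ at rem_4bit for the reduction lookups.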
	.global gcm_init_neon
	.hidden gcm_init_neon
	.type gcm_init_neon,%function
	vld1.64 d7,[r1,:64]! @ load H
	vshr.u64 d16,#63 @ t0=0xc2....01
	vshr.s8 q9,#7 @ broadcast carry bit
	veor q3,q3,q8 @ twisted H
	.size gcm_init_neon,.-gcm_init_neon
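@ gcm_init_neon derives the "twisted" H expected by the NEON paths:
@ H is shifted left by one bit and, if the bit shifted out was set,
@ folded back with the 0xc2...01 constant; gcm_gmult_neon and
@ gcm_ghash_neon below load H in this twisted form.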
	.global gcm_gmult_neon
	.hidden gcm_gmult_neon
	.type gcm_gmult_neon,%function
	vld1.64 d7,[r0,:64]! @ load Xi
	vmov.i64 d29,#0x0000ffffffffffff
	vldmia r1,{d26-d27} @ load twisted H
	vmov.i64 d30,#0x00000000ffffffff
	vmov.i64 d31,#0x000000000000ffff
	veor d28,d26,d27 @ Karatsuba pre-processing
	.size gcm_gmult_neon,.-gcm_gmult_neon
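@ gcm_gmult_neon(Xi, Htable) multiplies a single Xi block by the
@ twisted H.  The d29/d30/d31 masks and d28 = d26^d27 (Karatsuba
@ pre-processing) feed the vmull.p8-based polynomial multiplication
@ shared with gcm_ghash_neon below.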
	.global gcm_ghash_neon
	.hidden gcm_ghash_neon
	.type gcm_ghash_neon,%function
	vld1.64 d1,[r0,:64]! @ load Xi
	vmov.i64 d29,#0x0000ffffffffffff
	vldmia r1,{d26-d27} @ load twisted H
	vmov.i64 d30,#0x00000000ffffffff
	vmov.i64 d31,#0x000000000000ffff
	veor d28,d26,d27 @ Karatsuba pre-processing
	vld1.64 d7,[r2]! @ load inp
	vext.8 d16, d26, d26, #1 @ A1
	vmull.p8 q8, d16, d6 @ F = A1*B
	vext.8 d0, d6, d6, #1 @ B1
	vmull.p8 q0, d26, d0 @ E = A*B1
	vext.8 d18, d26, d26, #2 @ A2
	vmull.p8 q9, d18, d6 @ H = A2*B
	vext.8 d22, d6, d6, #2 @ B2
	vmull.p8 q11, d26, d22 @ G = A*B2
	vext.8 d20, d26, d26, #3 @ A3
	veor q8, q8, q0 @ L = E + F
	vmull.p8 q10, d20, d6 @ J = A3*B
	vext.8 d0, d6, d6, #3 @ B3
	veor q9, q9, q11 @ M = G + H
	vmull.p8 q0, d26, d0 @ I = A*B3
	veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
	vext.8 d22, d6, d6, #4 @ B4
	veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
	vmull.p8 q11, d26, d22 @ K = A*B4
	veor q10, q10, q0 @ N = I + J
	veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
	vext.8 q8, q8, q8, #15
	veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
	vext.8 q9, q9, q9, #14
	vmull.p8 q0, d26, d6 @ D = A*B
	vext.8 q11, q11, q11, #12
	vext.8 q10, q10, q10, #13
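@ The block above is one 64x64->128-bit carry-less multiplication
@ built from vmull.p8 (8x8-bit polynomial multiplies), since ARMv7
@ NEON has no vmull.p64: byte-rotated copies of the operands (A1-A3,
@ B1-B4) generate the partial products, which are masked, aligned
@ with vext and folded into D = A*B, in the style of the NEON
@ polynomial multiplication of Câmara, Gouvêa, López and Dahab.
@ The same pattern is repeated twice below: once for the Karatsuba
@ middle term (d28 = Hlo^Hhi against d6^d7) and once for the high
@ halves (d27 against d7).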
	veor d6,d6,d7 @ Karatsuba pre-processing
	vext.8 d16, d28, d28, #1 @ A1
	vmull.p8 q8, d16, d6 @ F = A1*B
	vext.8 d2, d6, d6, #1 @ B1
	vmull.p8 q1, d28, d2 @ E = A*B1
	vext.8 d18, d28, d28, #2 @ A2
	vmull.p8 q9, d18, d6 @ H = A2*B
	vext.8 d22, d6, d6, #2 @ B2
	vmull.p8 q11, d28, d22 @ G = A*B2
	vext.8 d20, d28, d28, #3 @ A3
	veor q8, q8, q1 @ L = E + F
	vmull.p8 q10, d20, d6 @ J = A3*B
	vext.8 d2, d6, d6, #3 @ B3
	veor q9, q9, q11 @ M = G + H
	vmull.p8 q1, d28, d2 @ I = A*B3
	veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
	vext.8 d22, d6, d6, #4 @ B4
	veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
	vmull.p8 q11, d28, d22 @ K = A*B4
	veor q10, q10, q1 @ N = I + J
	veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
	vext.8 q8, q8, q8, #15
	veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
	vext.8 q9, q9, q9, #14
	vmull.p8 q1, d28, d6 @ D = A*B
	vext.8 q11, q11, q11, #12
	vext.8 q10, q10, q10, #13
	vext.8 d16, d27, d27, #1 @ A1
	vmull.p8 q8, d16, d7 @ F = A1*B
	vext.8 d4, d7, d7, #1 @ B1
	vmull.p8 q2, d27, d4 @ E = A*B1
	vext.8 d18, d27, d27, #2 @ A2
	vmull.p8 q9, d18, d7 @ H = A2*B
	vext.8 d22, d7, d7, #2 @ B2
	vmull.p8 q11, d27, d22 @ G = A*B2
	vext.8 d20, d27, d27, #3 @ A3
	veor q8, q8, q2 @ L = E + F
	vmull.p8 q10, d20, d7 @ J = A3*B
	vext.8 d4, d7, d7, #3 @ B3
	veor q9, q9, q11 @ M = G + H
	vmull.p8 q2, d27, d4 @ I = A*B3
	veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8
	vext.8 d22, d7, d7, #4 @ B4
	veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16
	vmull.p8 q11, d27, d22 @ K = A*B4
	veor q10, q10, q2 @ N = I + J
	veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24
	vext.8 q8, q8, q8, #15
	veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32
	vext.8 q9, q9, q9, #14
	vmull.p8 q2, d27, d7 @ D = A*B
	vext.8 q11, q11, q11, #12
	vext.8 q10, q10, q10, #13
	veor q1,q1,q0 @ Karatsuba post-processing
	veor d4,d4,d3 @ Xh|Xl - 256-bit result
	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64 q9,q0,#57 @ 1st phase
	vshr.u64 q10,q0,#1 @ 2nd phase
	vst1.64 d1,[r0,:64]! @ write out Xi
	.size gcm_ghash_neon,.-gcm_ghash_neon
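@ gcm_ghash_neon(Xi, Htable, inp, len) hashes len bytes: each 16-byte
@ block is XORed into Xi, Xi*H is computed via the three vmull.p8
@ Karatsuba multiplications above, and the 256-bit product is reduced
@ modulo x^128+x^7+x^2+x+1 in the two phases marked "1st phase" and
@ "2nd phase" (the reduction_avx equivalent) before Xi is stored back.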
	.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro@openssl.org>"