2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
34 #include <asm/frame.h>
37 * The following macros are used to move an (un)aligned 16 byte value to/from
38 * an XMM register. This can be done for either FP or integer values; for FP use
39 * movaps (move aligned packed single) and for integer use movdqa (move double quad
40 * aligned). It doesn't make a performance difference which instruction is used
41 * since Nehalem (original Core i7) was released. However, movaps is a byte
42 * shorter, so that is the one we'll use for now (same for the unaligned variants).
49 # constants in mergeable sections, linker can reorder and merge
50 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
52 .Lgf128mul_x_ble_mask:
53 .octa 0x00000000000000010000000000000087
54 .section .rodata.cst16.POLY, "aM", @progbits, 16
56 POLY: .octa 0xC2000000000000000000000000000001
57 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
59 TWOONE: .octa 0x00000001000000000000000000000001
61 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
63 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
64 .section .rodata.cst16.MASK1, "aM", @progbits, 16
66 MASK1: .octa 0x0000000000000000ffffffffffffffff
67 .section .rodata.cst16.MASK2, "aM", @progbits, 16
69 MASK2: .octa 0xffffffffffffffff0000000000000000
70 .section .rodata.cst16.ONE, "aM", @progbits, 16
72 ONE: .octa 0x00000000000000000000000000000001
73 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
75 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
76 .section .rodata.cst16.dec, "aM", @progbits, 16
79 .section .rodata.cst16.enc, "aM", @progbits, 16
83 # order of these constants should not change.
84 # more specifically, ALL_F should follow SHIFT_MASK,
85 # and zero should follow ALL_F
86 .section .rodata, "a", @progbits
88 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
89 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
90 .octa 0x00000000000000000000000000000000
94 .type aad_shift_arr, @object
95 .size aad_shift_arr, 272
97 .octa 0xffffffffffffffffffffffffffffffff
98 .octa 0xffffffffffffffffffffffffffffff0C
99 .octa 0xffffffffffffffffffffffffffff0D0C
100 .octa 0xffffffffffffffffffffffffff0E0D0C
101 .octa 0xffffffffffffffffffffffff0F0E0D0C
102 .octa 0xffffffffffffffffffffff0C0B0A0908
103 .octa 0xffffffffffffffffffff0D0C0B0A0908
104 .octa 0xffffffffffffffffff0E0D0C0B0A0908
105 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
106 .octa 0xffffffffffffff0C0B0A090807060504
107 .octa 0xffffffffffff0D0C0B0A090807060504
108 .octa 0xffffffffff0E0D0C0B0A090807060504
109 .octa 0xffffffff0F0E0D0C0B0A090807060504
110 .octa 0xffffff0C0B0A09080706050403020100
111 .octa 0xffff0D0C0B0A09080706050403020100
112 .octa 0xff0E0D0C0B0A09080706050403020100
113 .octa 0x0F0E0D0C0B0A09080706050403020100
119 #define STACK_OFFSET 8*3
120 #define HashKey 16*0 // store HashKey <<1 mod poly here
121 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
122 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
123 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
124 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
125 // bits of HashKey <<1 mod poly here
126 //(for Karatsuba purposes)
127 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
128 // bits of HashKey^2 <<1 mod poly here
129 // (for Karatsuba purposes)
130 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
131 // bits of HashKey^3 <<1 mod poly here
132 // (for Karatsuba purposes)
133 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
134 // bits of HashKey^4 <<1 mod poly here
135 // (for Karatsuba purposes)
136 #define VARIABLE_OFFSET 16*8
144 #define arg7 STACK_OFFSET+8(%r14)
145 #define arg8 STACK_OFFSET+16(%r14)
146 #define arg9 STACK_OFFSET+24(%r14)
147 #define arg10 STACK_OFFSET+32(%r14)
148 #define keysize 2*15*16(%arg1)
165 #define BSWAP_MASK %xmm10
169 #define GF128MUL_MASK %xmm10
199 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
202 * Input: A and B (128-bits each, bit-reflected)
203 * Output: C = A*B*x mod poly (i.e. >>1)
204 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
205 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
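/*
 * For reference, a plain-C model of the same multiplication (this is the
 * right-shift algorithm from the GCM spec, working on big-endian byte
 * order rather than the bit-reflected form used below; the helper name
 * and the use of u8/memcpy are illustrative only):
 *
 *	static void gf128_mul(u8 z[16], const u8 x[16], const u8 y[16])
 *	{
 *		u8 v[16], acc[16] = { 0 };
 *		int i, j, lsb;
 *
 *		memcpy(v, x, 16);
 *		for (i = 0; i < 128; i++) {
 *			if (y[i / 8] & (0x80 >> (i % 8)))	// bit i of y, MSB first
 *				for (j = 0; j < 16; j++)
 *					acc[j] ^= v[j];
 *			lsb = v[15] & 1;
 *			for (j = 15; j > 0; j--)		// v = v * x
 *				v[j] = (v[j] >> 1) | (v[j - 1] << 7);
 *			v[0] >>= 1;
 *			if (lsb)
 *				v[0] ^= 0xe1;			// reduce by the GCM poly
 *		}
 *		memcpy(z, acc, 16);
 *	}
 */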
208 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
210 pshufd $78, \GH, \TMP2
211 pshufd $78, \HK, \TMP3
212 pxor \GH, \TMP2 # TMP2 = a1+a0
213 pxor \HK, \TMP3 # TMP3 = b1+b0
214 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
215 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
216 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
218 pxor \TMP1, \TMP2 # TMP2 = (a1*b0)+(a0*b1), the middle Karatsuba term
220 pslldq $8, \TMP3 # left shift TMP3 2 DWs
221 psrldq $8, \TMP2 # right shift TMP2 2 DWs
223 pxor \TMP2, \TMP1 # TMP1:GH holds the carry-less product of GH and HK
225 # first phase of the reduction
229 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
230 # in order to perform the three shifts independently
232 pslld $31, \TMP2 # packed left shift <<31
233 pslld $30, \TMP3 # packed left shift <<30
234 pslld $25, \TMP4 # packed left shift <<25
235 pxor \TMP3, \TMP2 # xor the shifted versions
238 psrldq $4, \TMP5 # right shift TMP5 1 DW
239 pslldq $12, \TMP2 # left shift TMP2 3 DWs
242 # second phase of the reduction
244 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
245 # in order to perform the three shifts independently
249 psrld $1,\TMP2 # packed right shift >>1
250 psrld $2,\TMP3 # packed right shift >>2
251 psrld $7,\TMP4 # packed right shift >>7
252 pxor \TMP3,\TMP2 # xor the shifted versions
256 pxor \TMP1, \GH # result is in GH
260 * if a = number of total plaintext bytes and b = floor(a/16), then
262 * num_initial_blocks = b mod 4
263 * encrypt the initial num_initial_blocks blocks and apply ghash on
265 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
267 * %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
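/*
 * In C terms, the block bookkeeping described above is simply (helper name
 * is illustrative):
 *
 *	static unsigned int num_initial_blocks(unsigned long plaintext_len)
 *	{
 *		unsigned long b = plaintext_len / 16;	// b = floor(a/16)
 *
 *		return b % 4;	// 0..3 blocks handled here, the rest 4 at a time
 *	}
 */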
271 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
272 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
273 MOVADQ SHUF_MASK(%rip), %xmm14
274 mov arg7, %r10 # %r10 = AAD
275 mov arg8, %r12 # %r12 = aadLen
281 jl _get_AAD_rest8\num_initial_blocks\operation
282 _get_AAD_blocks\num_initial_blocks\operation:
283 movdqu (%r10), %xmm\i
284 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
286 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
291 jge _get_AAD_blocks\num_initial_blocks\operation
295 je _get_AAD_done\num_initial_blocks\operation
299 /* read the last <16B of AAD. since we have at least 4B of
300 data right after the AAD (the ICV, and maybe some CT), we can
301 read 4B/8B blocks safely, and then get rid of the extra stuff */
302 _get_AAD_rest8\num_initial_blocks\operation:
304 jle _get_AAD_rest4\num_initial_blocks\operation
311 jmp _get_AAD_rest8\num_initial_blocks\operation
312 _get_AAD_rest4\num_initial_blocks\operation:
314 jle _get_AAD_rest0\num_initial_blocks\operation
322 _get_AAD_rest0\num_initial_blocks\operation:
323 /* finalize: shift out the extra bytes we read, and align
324 left. Since pslldq can only shift by an immediate, we use
325 pshufb and an array of shuffle masks */
328 movdqu aad_shift_arr(%r11), \TMP1
329 PSHUFB_XMM \TMP1, %xmm\i
330 _get_AAD_rest_final\num_initial_blocks\operation:
331 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
333 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
335 _get_AAD_done\num_initial_blocks\operation:
336 xor %r11, %r11 # initialise the data pointer offset as zero
337 # start AES for num_initial_blocks blocks
339 mov %arg5, %rax # %rax = *Y0
340 movdqu (%rax), \XMM0 # XMM0 = Y0
341 PSHUFB_XMM %xmm14, \XMM0
343 .if (\i == 5) || (\i == 6) || (\i == 7)
344 MOVADQ ONE(%RIP),\TMP1
347 paddd \TMP1, \XMM0 # INCR Y0
348 movdqa \XMM0, %xmm\index
349 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
350 pxor \TMP2, %xmm\index
354 shr $2,%eax # 128->4, 192->6, 256->8
355 add $5,%eax # 128->9, 192->11, 256->13
357 aes_loop_initial_dec\num_initial_blocks:
360 AESENC \TMP1, %xmm\index
364 jnz aes_loop_initial_dec\num_initial_blocks
368 AESENCLAST \TMP1, %xmm\index # Last Round
371 movdqu (%arg3 , %r11, 1), \TMP1
372 pxor \TMP1, %xmm\index
373 movdqu %xmm\index, (%arg2 , %r11, 1)
374 # write back plaintext/ciphertext for num_initial_blocks
377 movdqa \TMP1, %xmm\index
378 PSHUFB_XMM %xmm14, %xmm\index
379 # prepare plaintext/ciphertext for GHASH computation
383 # apply GHASH on num_initial_blocks blocks
387 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
389 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
391 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
394 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
396 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
399 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
402 jl _initial_blocks_done\num_initial_blocks\operation
403 # no need for precomputed values
406 * Precomputations for HashKey parallel with encryption of first 4 blocks.
407 * HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
409 MOVADQ ONE(%rip), \TMP1
410 paddd \TMP1, \XMM0 # INCR Y0
412 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
414 paddd \TMP1, \XMM0 # INCR Y0
416 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
418 paddd \TMP1, \XMM0 # INCR Y0
420 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
422 paddd \TMP1, \XMM0 # INCR Y0
424 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
426 MOVADQ 0(%arg1),\TMP1
432 pshufd $78, \TMP3, \TMP1
434 movdqa \TMP1, HashKey_k(%rsp)
435 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
436 # TMP5 = HashKey^2<<1 (mod poly)
437 movdqa \TMP5, HashKey_2(%rsp)
438 # HashKey_2 = HashKey^2<<1 (mod poly)
439 pshufd $78, \TMP5, \TMP1
441 movdqa \TMP1, HashKey_2_k(%rsp)
442 .irpc index, 1234 # do 4 rounds
443 movaps 0x10*\index(%arg1), \TMP1
449 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
450 # TMP5 = HashKey^3<<1 (mod poly)
451 movdqa \TMP5, HashKey_3(%rsp)
452 pshufd $78, \TMP5, \TMP1
454 movdqa \TMP1, HashKey_3_k(%rsp)
455 .irpc index, 56789 # do next 5 rounds
456 movaps 0x10*\index(%arg1), \TMP1
462 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
463 # TMP5 = HashKey^4<<1 (mod poly)
464 movdqa \TMP5, HashKey_4(%rsp)
465 pshufd $78, \TMP5, \TMP1
467 movdqa \TMP1, HashKey_4_k(%rsp)
470 shr $2,%eax # 128->4, 192->6, 256->8
471 sub $4,%eax # 128->0, 192->2, 256->4
472 jz aes_loop_pre_dec_done\num_initial_blocks
474 aes_loop_pre_dec\num_initial_blocks:
477 AESENC \TMP2, %xmm\index
481 jnz aes_loop_pre_dec\num_initial_blocks
483 aes_loop_pre_dec_done\num_initial_blocks:
485 AESENCLAST \TMP2, \XMM1
486 AESENCLAST \TMP2, \XMM2
487 AESENCLAST \TMP2, \XMM3
488 AESENCLAST \TMP2, \XMM4
489 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
491 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
493 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
495 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
497 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
499 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
501 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
503 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
506 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
508 # combine GHASHed value with the corresponding ciphertext
509 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
510 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
511 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
513 _initial_blocks_done\num_initial_blocks\operation:
519 * if a = number of total plaintext bytes and b = floor(a/16), then
521 * num_initial_blocks = b mod 4
522 * encrypt the initial num_initial_blocks blocks and apply ghash on
524 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
526 * %arg1, %arg2, %arg3 and %r14 are used as pointers only, not modified
530 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
531 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
532 MOVADQ SHUF_MASK(%rip), %xmm14
533 mov arg7, %r10 # %r10 = AAD
534 mov arg8, %r12 # %r12 = aadLen
540 jl _get_AAD_rest8\num_initial_blocks\operation
541 _get_AAD_blocks\num_initial_blocks\operation:
542 movdqu (%r10), %xmm\i
543 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
545 GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
550 jge _get_AAD_blocks\num_initial_blocks\operation
554 je _get_AAD_done\num_initial_blocks\operation
558 /* read the last <16B of AAD. since we have at least 4B of
559 data right after the AAD (the ICV, and maybe some PT), we can
560 read 4B/8B blocks safely, and then get rid of the extra stuff */
561 _get_AAD_rest8\num_initial_blocks\operation:
563 jle _get_AAD_rest4\num_initial_blocks\operation
570 jmp _get_AAD_rest8\num_initial_blocks\operation
571 _get_AAD_rest4\num_initial_blocks\operation:
573 jle _get_AAD_rest0\num_initial_blocks\operation
581 _get_AAD_rest0\num_initial_blocks\operation:
582 /* finalize: shift out the extra bytes we read, and align
583 left. Since pslldq can only shift by an immediate, we use
584 pshufb and an array of shuffle masks */
587 movdqu aad_shift_arr(%r11), \TMP1
588 PSHUFB_XMM \TMP1, %xmm\i
589 _get_AAD_rest_final\num_initial_blocks\operation:
590 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
592 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
594 _get_AAD_done\num_initial_blocks\operation:
595 xor %r11, %r11 # initialise the data pointer offset as zero
596 # start AES for num_initial_blocks blocks
598 mov %arg5, %rax # %rax = *Y0
599 movdqu (%rax), \XMM0 # XMM0 = Y0
600 PSHUFB_XMM %xmm14, \XMM0
602 .if (\i == 5) || (\i == 6) || (\i == 7)
604 MOVADQ ONE(%RIP),\TMP1
605 MOVADQ 0(%arg1),\TMP2
607 paddd \TMP1, \XMM0 # INCR Y0
608 MOVADQ \XMM0, %xmm\index
609 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
610 pxor \TMP2, %xmm\index
614 shr $2,%eax # 128->4, 192->6, 256->8
615 add $5,%eax # 128->9, 192->11, 256->13
617 aes_loop_initial_enc\num_initial_blocks:
620 AESENC \TMP1, %xmm\index
624 jnz aes_loop_initial_enc\num_initial_blocks
628 AESENCLAST \TMP1, %xmm\index # Last Round
631 movdqu (%arg3 , %r11, 1), \TMP1
632 pxor \TMP1, %xmm\index
633 movdqu %xmm\index, (%arg2 , %r11, 1)
634 # write back plaintext/ciphertext for num_initial_blocks
636 PSHUFB_XMM %xmm14, %xmm\index
638 # prepare plaintext/ciphertext for GHASH computation
642 # apply GHASH on num_initial_blocks blocks
646 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
648 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
650 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
653 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
655 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
658 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
661 jl _initial_blocks_done\num_initial_blocks\operation
662 # no need for precomputed values
665 * Precomputations for HashKey parallel with encryption of first 4 blocks.
666 * HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
668 MOVADQ ONE(%RIP),\TMP1
669 paddd \TMP1, \XMM0 # INCR Y0
671 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
673 paddd \TMP1, \XMM0 # INCR Y0
675 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
677 paddd \TMP1, \XMM0 # INCR Y0
679 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
681 paddd \TMP1, \XMM0 # INCR Y0
683 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
685 MOVADQ 0(%arg1),\TMP1
691 pshufd $78, \TMP3, \TMP1
693 movdqa \TMP1, HashKey_k(%rsp)
694 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
695 # TMP5 = HashKey^2<<1 (mod poly)
696 movdqa \TMP5, HashKey_2(%rsp)
697 # HashKey_2 = HashKey^2<<1 (mod poly)
698 pshufd $78, \TMP5, \TMP1
700 movdqa \TMP1, HashKey_2_k(%rsp)
701 .irpc index, 1234 # do 4 rounds
702 movaps 0x10*\index(%arg1), \TMP1
708 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
709 # TMP5 = HashKey^3<<1 (mod poly)
710 movdqa \TMP5, HashKey_3(%rsp)
711 pshufd $78, \TMP5, \TMP1
713 movdqa \TMP1, HashKey_3_k(%rsp)
714 .irpc index, 56789 # do next 5 rounds
715 movaps 0x10*\index(%arg1), \TMP1
721 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
722 # TMP5 = HashKey^4<<1 (mod poly)
723 movdqa \TMP5, HashKey_4(%rsp)
724 pshufd $78, \TMP5, \TMP1
726 movdqa \TMP1, HashKey_4_k(%rsp)
729 shr $2,%eax # 128->4, 192->6, 256->8
730 sub $4,%eax # 128->0, 192->2, 256->4
731 jz aes_loop_pre_enc_done\num_initial_blocks
733 aes_loop_pre_enc\num_initial_blocks:
736 AESENC \TMP2, %xmm\index
740 jnz aes_loop_pre_enc\num_initial_blocks
742 aes_loop_pre_enc_done\num_initial_blocks:
744 AESENCLAST \TMP2, \XMM1
745 AESENCLAST \TMP2, \XMM2
746 AESENCLAST \TMP2, \XMM3
747 AESENCLAST \TMP2, \XMM4
748 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
750 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
752 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
754 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
756 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
757 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
758 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
759 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
762 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
764 # combine GHASHed value with the corresponding ciphertext
765 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
766 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
767 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
769 _initial_blocks_done\num_initial_blocks\operation:
774 * encrypt 4 blocks at a time
775 * ghash the 4 previously encrypted ciphertext blocks
776 * %arg1, %arg2, %arg3 are used as pointers only, not modified
777 * %r11 is the data offset value
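/*
 * The "multiply ... using karatsuba" steps in this macro build each
 * 128x128-bit carry-less product from three 64x64-bit products.  A portable
 * C sketch of that scheme, with clmul64() standing in for PCLMULQDQ and the
 * kernel's u64 type (names are illustrative):
 *
 *	struct u256 { u64 w[4]; };		// w[0] = least significant
 *
 *	static void clmul64(u64 a, u64 b, u64 *lo, u64 *hi)
 *	{
 *		u64 l = 0, h = 0;
 *		int i;
 *
 *		for (i = 0; i < 64; i++)
 *			if ((b >> i) & 1) {
 *				l ^= a << i;
 *				h ^= i ? a >> (64 - i) : 0;
 *			}
 *		*lo = l;
 *		*hi = h;
 *	}
 *
 *	static struct u256 clmul128(u64 a1, u64 a0, u64 b1, u64 b0)
 *	{
 *		u64 ll, lh, hl, hh, ml, mh;
 *		struct u256 r;
 *
 *		clmul64(a0, b0, &ll, &lh);		// a0*b0
 *		clmul64(a1, b1, &hl, &hh);		// a1*b1
 *		clmul64(a0 ^ a1, b0 ^ b1, &ml, &mh);	// (a0+a1)*(b0+b1)
 *		ml ^= ll ^ hl;				// middle term a1*b0+a0*b1
 *		mh ^= lh ^ hh;
 *		r.w[0] = ll;
 *		r.w[1] = lh ^ ml;
 *		r.w[2] = hl ^ mh;
 *		r.w[3] = hh;
 *		return r;
 *	}
 */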
779 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
780 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
787 movdqa SHUF_MASK(%rip), %xmm15
788 # multiply TMP5 * HashKey using karatsuba
791 pshufd $78, \XMM5, \TMP6
793 paddd ONE(%rip), \XMM0 # INCR CNT
794 movdqa HashKey_4(%rsp), \TMP5
795 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
797 paddd ONE(%rip), \XMM0 # INCR CNT
799 paddd ONE(%rip), \XMM0 # INCR CNT
801 paddd ONE(%rip), \XMM0 # INCR CNT
803 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
804 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
805 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
806 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
807 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
813 movdqa HashKey_4_k(%rsp), \TMP5
814 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
815 movaps 0x10(%arg1), \TMP1
816 AESENC \TMP1, \XMM1 # Round 1
820 movaps 0x20(%arg1), \TMP1
821 AESENC \TMP1, \XMM1 # Round 2
826 pshufd $78, \XMM6, \TMP2
828 movdqa HashKey_3(%rsp), \TMP5
829 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
830 movaps 0x30(%arg1), \TMP3
831 AESENC \TMP3, \XMM1 # Round 3
835 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
836 movaps 0x40(%arg1), \TMP3
837 AESENC \TMP3, \XMM1 # Round 4
841 movdqa HashKey_3_k(%rsp), \TMP5
842 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
843 movaps 0x50(%arg1), \TMP3
844 AESENC \TMP3, \XMM1 # Round 5
849 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
853 pshufd $78, \XMM7, \TMP2
855 movdqa HashKey_2(%rsp ), \TMP5
857 # Multiply TMP5 * HashKey using karatsuba
859 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
860 movaps 0x60(%arg1), \TMP3
861 AESENC \TMP3, \XMM1 # Round 6
865 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
866 movaps 0x70(%arg1), \TMP3
867 AESENC \TMP3, \XMM1 # Round 7
871 movdqa HashKey_2_k(%rsp), \TMP5
872 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
873 movaps 0x80(%arg1), \TMP3
874 AESENC \TMP3, \XMM1 # Round 8
879 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
883 # Multiply XMM8 * HashKey
884 # XMM8 and TMP5 hold the values for the two operands
887 pshufd $78, \XMM8, \TMP2
889 movdqa HashKey(%rsp), \TMP5
890 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
891 movaps 0x90(%arg1), \TMP3
892 AESENC \TMP3, \XMM1 # Round 9
896 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
899 shr $2,%eax # 128->4, 192->6, 256->8
900 sub $4,%eax # 128->0, 192->2, 256->4
901 jz aes_loop_par_enc_done
906 AESENC \TMP3, %xmm\index
912 aes_loop_par_enc_done:
914 AESENCLAST \TMP3, \XMM1 # Round 10
915 AESENCLAST \TMP3, \XMM2
916 AESENCLAST \TMP3, \XMM3
917 AESENCLAST \TMP3, \XMM4
918 movdqa HashKey_k(%rsp), \TMP5
919 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
920 movdqu (%arg3,%r11,1), \TMP3
921 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
922 movdqu 16(%arg3,%r11,1), \TMP3
923 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
924 movdqu 32(%arg3,%r11,1), \TMP3
925 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
926 movdqu 48(%arg3,%r11,1), \TMP3
927 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
928 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
929 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
930 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
931 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
932 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
933 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
934 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
935 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
943 pslldq $8, \TMP3 # left shift TMP3 2 DWs
944 psrldq $8, \TMP2 # right shift TMP2 2 DWs
946 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
948 # first phase of reduction
953 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
954 pslld $31, \TMP2 # packed left shift << 31
955 pslld $30, \TMP3 # packed left shift << 30
956 pslld $25, \TMP4 # packed left shift << 25
957 pxor \TMP3, \TMP2 # xor the shifted versions
960 psrldq $4, \TMP5 # right shift T5 1 DW
961 pslldq $12, \TMP2 # left shift T2 3 DWs
964 # second phase of reduction
966 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
969 psrld $1, \TMP2 # packed right shift >>1
970 psrld $2, \TMP3 # packed right shift >>2
971 psrld $7, \TMP4 # packed right shift >>7
972 pxor \TMP3,\TMP2 # xor the shifted versions
976 pxor \TMP1, \XMM5 # result is in XMM5
982 * decrypt 4 blocks at a time
983 * ghash the 4 previously decrypted ciphertext blocks
984 * %arg1, %arg2, %arg3 are used as pointers only, not modified
985 * %r11 is the data offset value
987 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
988 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
995 movdqa SHUF_MASK(%rip), %xmm15
996 # multiply TMP5 * HashKey using karatsuba
999 pshufd $78, \XMM5, \TMP6
1001 paddd ONE(%rip), \XMM0 # INCR CNT
1002 movdqa HashKey_4(%rsp), \TMP5
1003 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1005 paddd ONE(%rip), \XMM0 # INCR CNT
1007 paddd ONE(%rip), \XMM0 # INCR CNT
1009 paddd ONE(%rip), \XMM0 # INCR CNT
1011 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1012 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1013 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1014 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1015 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1021 movdqa HashKey_4_k(%rsp), \TMP5
1022 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1023 movaps 0x10(%arg1), \TMP1
1024 AESENC \TMP1, \XMM1 # Round 1
1028 movaps 0x20(%arg1), \TMP1
1029 AESENC \TMP1, \XMM1 # Round 2
1034 pshufd $78, \XMM6, \TMP2
1036 movdqa HashKey_3(%rsp), \TMP5
1037 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1038 movaps 0x30(%arg1), \TMP3
1039 AESENC \TMP3, \XMM1 # Round 3
1043 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1044 movaps 0x40(%arg1), \TMP3
1045 AESENC \TMP3, \XMM1 # Round 4
1049 movdqa HashKey_3_k(%rsp), \TMP5
1050 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1051 movaps 0x50(%arg1), \TMP3
1052 AESENC \TMP3, \XMM1 # Round 5
1057 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1061 pshufd $78, \XMM7, \TMP2
1063 movdqa HashKey_2(%rsp ), \TMP5
1065 # Multiply TMP5 * HashKey using karatsuba
1067 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1068 movaps 0x60(%arg1), \TMP3
1069 AESENC \TMP3, \XMM1 # Round 6
1073 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1074 movaps 0x70(%arg1), \TMP3
1075 AESENC \TMP3, \XMM1 # Round 7
1079 movdqa HashKey_2_k(%rsp), \TMP5
1080 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1081 movaps 0x80(%arg1), \TMP3
1082 AESENC \TMP3, \XMM1 # Round 8
1087 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1091 # Multiply XMM8 * HashKey
1092 # XMM8 and TMP5 hold the values for the two operands
1095 pshufd $78, \XMM8, \TMP2
1097 movdqa HashKey(%rsp), \TMP5
1098 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1099 movaps 0x90(%arg1), \TMP3
1100 AESENC \TMP3, \XMM1 # Round 9
1104 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1105 lea 0xa0(%arg1),%r10
1107 shr $2,%eax # 128->4, 192->6, 256->8
1108 sub $4,%eax # 128->0, 192->2, 256->4
1109 jz aes_loop_par_dec_done
1114 AESENC \TMP3, %xmm\index
1118 jnz aes_loop_par_dec
1120 aes_loop_par_dec_done:
1121 MOVADQ (%r10), \TMP3
1122 AESENCLAST \TMP3, \XMM1 # last round
1123 AESENCLAST \TMP3, \XMM2
1124 AESENCLAST \TMP3, \XMM3
1125 AESENCLAST \TMP3, \XMM4
1126 movdqa HashKey_k(%rsp), \TMP5
1127 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1128 movdqu (%arg3,%r11,1), \TMP3
1129 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1130 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1132 movdqu 16(%arg3,%r11,1), \TMP3
1133 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1134 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1136 movdqu 32(%arg3,%r11,1), \TMP3
1137 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1138 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1140 movdqu 48(%arg3,%r11,1), \TMP3
1141 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1142 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1144 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1145 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1146 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1147 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1155 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1156 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1158 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1160 # first phase of reduction
1165 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1166 pslld $31, \TMP2 # packed left shift << 31
1167 pslld $30, \TMP3 # packed left shift << 30
1168 pslld $25, \TMP4 # packed left shift << 25
1169 pxor \TMP3, \TMP2 # xor the shifted versions
1172 psrldq $4, \TMP5 # right shift T5 1 DW
1173 pslldq $12, \TMP2 # left shift T2 3 DWs
1176 # second phase of reduction
1178 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1181 psrld $1, \TMP2 # packed right shift >>1
1182 psrld $2, \TMP3 # packed right shift >>2
1183 psrld $7, \TMP4 # packed right shift >>7
1184 pxor \TMP3,\TMP2 # xor the shifted versions
1188 pxor \TMP1, \XMM5 # result is in XMM5
1193 /* GHASH the last 4 ciphertext blocks. */
1194 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1195 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1197 # Multiply TMP6 * HashKey (using Karatsuba)
1200 pshufd $78, \XMM1, \TMP2
1202 movdqa HashKey_4(%rsp), \TMP5
1203 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1204 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1205 movdqa HashKey_4_k(%rsp), \TMP4
1206 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1207 movdqa \XMM1, \XMMDst
1208 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1210 # Multiply TMP1 * HashKey (using Karatsuba)
1213 pshufd $78, \XMM2, \TMP2
1215 movdqa HashKey_3(%rsp), \TMP5
1216 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1217 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1218 movdqa HashKey_3_k(%rsp), \TMP4
1219 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1223 # results accumulated in TMP6, XMMDst, XMM1
1225 # Multiply TMP1 * HashKey (using Karatsuba)
1228 pshufd $78, \XMM3, \TMP2
1230 movdqa HashKey_2(%rsp), \TMP5
1231 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1232 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1233 movdqa HashKey_2_k(%rsp), \TMP4
1234 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1237 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1239 # Multiply TMP1 * HashKey (using Karatsuba)
1241 pshufd $78, \XMM4, \TMP2
1243 movdqa HashKey(%rsp), \TMP5
1244 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1245 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1246 movdqa HashKey_k(%rsp), \TMP4
1247 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1253 # middle section of the temp results combined as in the Karatsuba algorithm
1255 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1256 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1259 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1260 # first phase of the reduction
1261 movdqa \XMMDst, \TMP2
1262 movdqa \XMMDst, \TMP3
1263 movdqa \XMMDst, \TMP4
1264 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1265 pslld $31, \TMP2 # packed left shifting << 31
1266 pslld $30, \TMP3 # packed left shifting << 30
1267 pslld $25, \TMP4 # packed left shifting << 25
1268 pxor \TMP3, \TMP2 # xor the shifted versions
1271 psrldq $4, \TMP7 # right shift TMP7 1 DW
1272 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1275 # second phase of the reduction
1276 movdqa \XMMDst, \TMP2
1277 # make 3 copies of XMMDst for doing 3 shift operations
1278 movdqa \XMMDst, \TMP3
1279 movdqa \XMMDst, \TMP4
1280 psrld $1, \TMP2 # packed right shift >> 1
1281 psrld $2, \TMP3 # packed right shift >> 2
1282 psrld $7, \TMP4 # packed right shift >> 7
1283 pxor \TMP3, \TMP2 # xor the shifted versions
1287 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1291 /* Encryption of a single block
1295 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1299 shr $2,%eax # 128->4, 192->6, 256->8
1300 add $5,%eax # 128->9, 192->11, 256->13
1301 lea 16(%arg1), %r10 # get first expanded key address
1311 AESENCLAST \TMP1,\XMM0
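/*
 * The shr/add pair at the top of this macro (and of the other AES round
 * loops in this file) turns the key length stored at the keysize offset
 * into the number of full AESENC rounds.  A C sketch, assuming the stored
 * value is the key length in bytes (16, 24 or 32):
 *
 *	static int aesenc_rounds(unsigned int key_len_bytes)
 *	{
 *		// 16 -> 9, 24 -> 11, 32 -> 13; AESENCLAST follows separately
 *		return (key_len_bytes >> 2) + 5;
 *	}
 */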
1313 /*****************************************************************************
1314 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1315 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1316 * const u8 *in, // Ciphertext input
1317 * u64 plaintext_len, // Length of data in bytes for decryption.
1318 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1319 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1320 * // concatenated with 0x00000001. 16-byte aligned pointer.
1321 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1322 * const u8 *aad, // Additional Authentication Data (AAD)
1323 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1324 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1325 * // given authentication tag and only return the plaintext if they match.
1326 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1327 * // (most likely), 12 or 8.
1332 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1333 * set of 11 keys in the data structure void *aes_ctx
1337 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1338 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1339 * | Salt (From the SA) |
1340 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1341 * | Initialization Vector |
1342 * | (This is the sequence number from IPSec header) |
1343 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1345 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1350 * AAD padded to 128 bits with 0
1351 * for example, assume AAD is a u32 vector
1353 * if AAD is 8 bytes:
1354 * AAD[2] = {A0, A1};
1355 * padded AAD in xmm register = {A1 A0 0 0}
1358 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1359 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1361 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1362 * | 32-bit Sequence Number (A0) |
1363 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1365 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1367 * AAD Format with 32-bit Sequence Number
1369 * if AAD is 12 bytes:
1370 * AAD[3] = {A0, A1, A2};
1371 * padded AAD in xmm register = {A2 A1 A0 0}
1374 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1375 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1377 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1379 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1380 * | 64-bit Extended Sequence Number {A1,A0} |
1382 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1384 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1386 * AAD Format with 64-bit Extended Sequence Number
1389 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1390 * The code supports 16 too but for other sizes, the code will fail.
1393 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1394 * For other sizes, the code will fail.
1396 * poly = x^128 + x^127 + x^126 + x^121 + 1
1398 *****************************************************************************/
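/*
 * A hypothetical caller sketch based on the prototype documented above,
 * using the u8/u64 shorthand from that comment.  build_j0() and
 * gcm_decrypt() are illustrative names only; the real glue code also does
 * the tag comparison with a constant-time helper:
 *
 *	// pre-counter block j0 = salt(4) || IV(8) || 0x00000001, 16-byte aligned
 *	static void build_j0(u8 j0[16], const u8 salt[4], const u8 iv[8])
 *	{
 *		memcpy(j0, salt, 4);
 *		memcpy(j0 + 4, iv, 8);
 *		j0[12] = 0;
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;
 *	}
 *
 *	static int gcm_decrypt(void *aes_ctx, u8 *out, const u8 *in, u64 len,
 *			       u8 *j0, u8 *hash_subkey,
 *			       const u8 *aad, u64 aad_len,
 *			       const u8 *expected_tag, u64 tag_len)
 *	{
 *		u8 tag[16];
 *
 *		aesni_gcm_dec(aes_ctx, out, in, len, j0, hash_subkey,
 *			      aad, aad_len, tag, tag_len);
 *		return memcmp(tag, expected_tag, tag_len) ? -1 : 0;
 *	}
 */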
1399 ENTRY(aesni_gcm_dec)
1405 * states of %xmm registers %xmm6:%xmm15 not saved
1406 * all %xmm registers are clobbered
1408 sub $VARIABLE_OFFSET, %rsp
1409 and $~63, %rsp # align rsp to 64 bytes
1411 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1412 movdqa SHUF_MASK(%rip), %xmm2
1413 PSHUFB_XMM %xmm2, %xmm13
1416 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1418 movdqa %xmm13, %xmm2
1428 pshufd $0x24, %xmm1, %xmm2
1429 pcmpeqd TWOONE(%rip), %xmm2
1430 pand POLY(%rip), %xmm2
1431 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1434 # Decrypt first few blocks
1436 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1437 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1438 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1441 jz _initial_num_blocks_is_0_decrypt
1443 jb _initial_num_blocks_is_1_decrypt
1444 je _initial_num_blocks_is_2_decrypt
1445 _initial_num_blocks_is_3_decrypt:
1446 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1447 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1449 jmp _initial_blocks_decrypted
1450 _initial_num_blocks_is_2_decrypt:
1451 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1452 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1454 jmp _initial_blocks_decrypted
1455 _initial_num_blocks_is_1_decrypt:
1456 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1457 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1459 jmp _initial_blocks_decrypted
1460 _initial_num_blocks_is_0_decrypt:
1461 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1462 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1463 _initial_blocks_decrypted:
1465 je _zero_cipher_left_decrypt
1467 je _four_cipher_left_decrypt
1469 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1470 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1474 _four_cipher_left_decrypt:
1475 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1476 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1477 _zero_cipher_left_decrypt:
1479 and $15, %r13 # %r13 = arg4 (mod 16)
1480 je _multiple_of_16_bytes_decrypt
1482 # Handle the last <16 byte block separately
1484 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1485 movdqa SHUF_MASK(%rip), %xmm10
1486 PSHUFB_XMM %xmm10, %xmm0
1488 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1491 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1492 lea SHIFT_MASK+16(%rip), %r12
1494 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1495 # (%r13 is the number of bytes in plaintext mod 16)
1496 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1497 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1500 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1501 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1502 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1503 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1505 movdqa SHUF_MASK(%rip), %xmm10
1506 PSHUFB_XMM %xmm10, %xmm2
1509 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1510 # GHASH computation for the last <16 byte block
1515 MOVQ_R64_XMM %xmm0, %rax
1517 jle _less_than_8_bytes_left_decrypt
1518 mov %rax, (%arg2 , %r11, 1)
1521 MOVQ_R64_XMM %xmm0, %rax
1523 _less_than_8_bytes_left_decrypt:
1524 mov %al, (%arg2, %r11, 1)
1528 jne _less_than_8_bytes_left_decrypt
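/*
 * In plain C, the tail handling above amounts to encrypting one more
 * counter block and XORing only the remaining r = len mod 16 bytes with it
 * (sketch, helper name illustrative):
 *
 *	static void ctr_partial_block(u8 *dst, const u8 *src,
 *				      const u8 keystream[16], unsigned int r)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < r; i++)
 *			dst[i] = src[i] ^ keystream[i];
 *	}
 */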
1529 _multiple_of_16_bytes_decrypt:
1530 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1531 shl $3, %r12 # convert into number of bits
1532 movd %r12d, %xmm15 # len(A) in %xmm15
1533 shl $3, %arg4 # len(C) in bits (*8)
1534 MOVQ_R64_XMM %arg4, %xmm1
1535 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1536 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1538 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1539 # final GHASH computation
1540 movdqa SHUF_MASK(%rip), %xmm10
1541 PSHUFB_XMM %xmm10, %xmm8
1543 mov %arg5, %rax # %rax = *Y0
1544 movdqu (%rax), %xmm0 # %xmm0 = Y0
1545 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1548 mov arg9, %r10 # %r10 = authTag
1549 mov arg10, %r11 # %r11 = auth_tag_len
1555 MOVQ_R64_XMM %xmm0, %rax
1561 je _return_T_done_decrypt
1569 je _return_T_done_decrypt
1576 je _return_T_done_decrypt
1581 jmp _return_T_done_decrypt
1583 movdqu %xmm0, (%r10)
1584 _return_T_done_decrypt:
1590 ENDPROC(aesni_gcm_dec)
1593 /*****************************************************************************
1594 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1595 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1596 * const u8 *in, // Plaintext input
1597 * u64 plaintext_len, // Length of data in bytes for encryption.
1598 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1599 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1600 * // concatenated with 0x00000001. 16-byte aligned pointer.
1601 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1602 * const u8 *aad, // Additional Authentication Data (AAD)
1603 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1604 * u8 *auth_tag, // Authenticated Tag output.
1605 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
1611 * keys are pre-expanded and aligned to 16 bytes. we are using the
1612 * first set of 11 keys in the data structure void *aes_ctx
1617 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1618 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1619 * | Salt (From the SA) |
1620 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1621 * | Initialization Vector |
1622 * | (This is the sequence number from IPSec header) |
1623 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1625 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1630 * AAD padded to 128 bits with 0
1631 * for example, assume AAD is a u32 vector
1633 * if AAD is 8 bytes:
1634 * AAD[2] = {A0, A1};
1635 * padded AAD in xmm register = {A1 A0 0 0}
1638 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1639 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1641 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1642 * | 32-bit Sequence Number (A0) |
1643 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1647 * AAD Format with 32-bit Sequence Number
1649 * if AAD is 12 bytes:
1650 * AAD[3] = {A0, A1, A2};
1651 * padded AAD in xmm register = {A2 A1 A0 0}
1654 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1655 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658 * | 64-bit Extended Sequence Number {A1,A0} |
1660 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1662 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1664 * AAD Format with 64-bit Extended Sequence Number
1667 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1668 * The code supports 16 too but for other sizes, the code will fail.
1671 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1672 * For other sizes, the code will fail.
1674 * poly = x^128 + x^127 + x^126 + x^121 + 1
1675 ***************************************************************************/
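/*
 * The AAD handling described above boils down to zero-padding the 8 or 12
 * byte AAD to a full 16-byte block before it is byte-reflected for GHASH.
 * As a C sketch (helper name illustrative):
 *
 *	static void pad_aad_block(u8 block[16], const u8 *aad, u64 aad_len)
 *	{
 *		memset(block, 0, 16);
 *		memcpy(block, aad, aad_len < 16 ? aad_len : 16);
 *	}
 */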
1676 ENTRY(aesni_gcm_enc)
1682 # states of %xmm registers %xmm6:%xmm15 not saved
1683 # all %xmm registers are clobbered
1685 sub $VARIABLE_OFFSET, %rsp
1688 movdqu (%r12), %xmm13
1689 movdqa SHUF_MASK(%rip), %xmm2
1690 PSHUFB_XMM %xmm2, %xmm13
1693 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1695 movdqa %xmm13, %xmm2
1705 pshufd $0x24, %xmm1, %xmm2
1706 pcmpeqd TWOONE(%rip), %xmm2
1707 pand POLY(%rip), %xmm2
1709 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1710 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1714 # Encrypt first few blocks
1717 jz _initial_num_blocks_is_0_encrypt
1719 jb _initial_num_blocks_is_1_encrypt
1720 je _initial_num_blocks_is_2_encrypt
1721 _initial_num_blocks_is_3_encrypt:
1722 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1723 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1725 jmp _initial_blocks_encrypted
1726 _initial_num_blocks_is_2_encrypt:
1727 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1728 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1730 jmp _initial_blocks_encrypted
1731 _initial_num_blocks_is_1_encrypt:
1732 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1733 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1735 jmp _initial_blocks_encrypted
1736 _initial_num_blocks_is_0_encrypt:
1737 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1738 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1739 _initial_blocks_encrypted:
1741 # Main loop - Encrypt remaining blocks
1744 je _zero_cipher_left_encrypt
1746 je _four_cipher_left_encrypt
1747 _encrypt_by_4_encrypt:
1748 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1749 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1752 jne _encrypt_by_4_encrypt
1753 _four_cipher_left_encrypt:
1754 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1755 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1756 _zero_cipher_left_encrypt:
1758 and $15, %r13 # %r13 = arg4 (mod 16)
1759 je _multiple_of_16_bytes_encrypt
1761 # Handle the last <16 Byte block separately
1762 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1763 movdqa SHUF_MASK(%rip), %xmm10
1764 PSHUFB_XMM %xmm10, %xmm0
1767 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1770 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1771 lea SHIFT_MASK+16(%rip), %r12
1773 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1774 # (%r13 is the number of bytes in plaintext mod 16)
1775 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1776 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
1777 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1778 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1779 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1780 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1781 movdqa SHUF_MASK(%rip), %xmm10
1782 PSHUFB_XMM %xmm10,%xmm0
1785 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1786 # GHASH computation for the last <16 byte block
1790 movdqa SHUF_MASK(%rip), %xmm10
1791 PSHUFB_XMM %xmm10, %xmm0
1793 # shuffle xmm0 back to output as ciphertext
1796 MOVQ_R64_XMM %xmm0, %rax
1798 jle _less_than_8_bytes_left_encrypt
1799 mov %rax, (%arg2 , %r11, 1)
1802 MOVQ_R64_XMM %xmm0, %rax
1804 _less_than_8_bytes_left_encrypt:
1805 mov %al, (%arg2, %r11, 1)
1809 jne _less_than_8_bytes_left_encrypt
1810 _multiple_of_16_bytes_encrypt:
1811 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1813 movd %r12d, %xmm15 # len(A) in %xmm15
1814 shl $3, %arg4 # len(C) in bits (*8)
1815 MOVQ_R64_XMM %arg4, %xmm1
1816 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1817 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1819 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1820 # final GHASH computation
1821 movdqa SHUF_MASK(%rip), %xmm10
1822 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1824 mov %arg5, %rax # %rax = *Y0
1825 movdqu (%rax), %xmm0 # %xmm0 = Y0
1826 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1829 mov arg9, %r10 # %r10 = authTag
1830 mov arg10, %r11 # %r11 = auth_tag_len
1836 MOVQ_R64_XMM %xmm0, %rax
1842 je _return_T_done_encrypt
1850 je _return_T_done_encrypt
1857 je _return_T_done_encrypt
1862 jmp _return_T_done_encrypt
1864 movdqu %xmm0, (%r10)
1865 _return_T_done_encrypt:
1871 ENDPROC(aesni_gcm_enc)
1878 _key_expansion_256a:
1879 pshufd $0b11111111, %xmm1, %xmm1
1880 shufps $0b00010000, %xmm0, %xmm4
1882 shufps $0b10001100, %xmm0, %xmm4
1885 movaps %xmm0, (TKEYP)
1888 ENDPROC(_key_expansion_128)
1889 ENDPROC(_key_expansion_256a)
1892 _key_expansion_192a:
1893 pshufd $0b01010101, %xmm1, %xmm1
1894 shufps $0b00010000, %xmm0, %xmm4
1896 shufps $0b10001100, %xmm0, %xmm4
1903 pshufd $0b11111111, %xmm0, %xmm3
1908 shufps $0b01000100, %xmm0, %xmm6
1909 movaps %xmm6, (TKEYP)
1910 shufps $0b01001110, %xmm2, %xmm1
1911 movaps %xmm1, 0x10(TKEYP)
1914 ENDPROC(_key_expansion_192a)
1917 _key_expansion_192b:
1918 pshufd $0b01010101, %xmm1, %xmm1
1919 shufps $0b00010000, %xmm0, %xmm4
1921 shufps $0b10001100, %xmm0, %xmm4
1927 pshufd $0b11111111, %xmm0, %xmm3
1931 movaps %xmm0, (TKEYP)
1934 ENDPROC(_key_expansion_192b)
1937 _key_expansion_256b:
1938 pshufd $0b10101010, %xmm1, %xmm1
1939 shufps $0b00010000, %xmm2, %xmm4
1941 shufps $0b10001100, %xmm2, %xmm4
1944 movaps %xmm2, (TKEYP)
1947 ENDPROC(_key_expansion_256b)
1950 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1951 * unsigned int key_len)
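/*
 * A minimal usage sketch, assuming the kernel's struct crypto_aes_ctx from
 * <crypto/aes.h> and a 16-byte aligned context as the GCM code above
 * requires; expand_key() is an illustrative name, and the FPU must already
 * be usable (the glue code wraps the call in kernel_fpu_begin()/end()):
 *
 *	int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *			  unsigned int key_len);
 *
 *	static int expand_key(struct crypto_aes_ctx *ctx,
 *			      const u8 *key, unsigned int key_len)
 *	{
 *		// key_len must be 16, 24 or 32 bytes
 *		return aesni_set_key(ctx, key, key_len);
 *	}
 */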
1953 ENTRY(aesni_set_key)
1957 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1958 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1959 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1961 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1962 movaps %xmm0, (KEYP)
1963 lea 0x10(KEYP), TKEYP # key addr
1964 movl %edx, 480(KEYP)
1965 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1969 movups 0x10(UKEYP), %xmm2 # other user key
1970 movaps %xmm2, (TKEYP)
1972 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1973 call _key_expansion_256a
1974 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1975 call _key_expansion_256b
1976 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1977 call _key_expansion_256a
1978 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1979 call _key_expansion_256b
1980 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1981 call _key_expansion_256a
1982 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1983 call _key_expansion_256b
1984 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1985 call _key_expansion_256a
1986 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1987 call _key_expansion_256b
1988 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1989 call _key_expansion_256a
1990 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1991 call _key_expansion_256b
1992 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1993 call _key_expansion_256a
1994 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1995 call _key_expansion_256b
1996 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1997 call _key_expansion_256a
2000 movq 0x10(UKEYP), %xmm2 # other user key
2001 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
2002 call _key_expansion_192a
2003 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
2004 call _key_expansion_192b
2005 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
2006 call _key_expansion_192a
2007 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
2008 call _key_expansion_192b
2009 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
2010 call _key_expansion_192a
2011 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
2012 call _key_expansion_192b
2013 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
2014 call _key_expansion_192a
2015 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
2016 call _key_expansion_192b
2019 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
2020 call _key_expansion_128
2021 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
2022 call _key_expansion_128
2023 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
2024 call _key_expansion_128
2025 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
2026 call _key_expansion_128
2027 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
2028 call _key_expansion_128
2029 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
2030 call _key_expansion_128
2031 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
2032 call _key_expansion_128
2033 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
2034 call _key_expansion_128
2035 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
2036 call _key_expansion_128
2037 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
2038 call _key_expansion_128
2041 movaps (KEYP), %xmm0
2042 movaps (TKEYP), %xmm1
2043 movaps %xmm0, 240(TKEYP)
2044 movaps %xmm1, 240(KEYP)
2046 lea 240-16(TKEYP), UKEYP
2049 movaps (KEYP), %xmm0
2051 movaps %xmm1, (UKEYP)
2062 ENDPROC(aesni_set_key)
2065 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2072 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2073 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2074 movl (FRAME_OFFSET+20)(%esp), INP # src
2076 movl 480(KEYP), KLEN # key length
2077 movups (INP), STATE # input
2079 movups STATE, (OUTP) # output
2089 * _aesni_enc1: internal ABI
2091 * KEYP: key struct pointer
2093 * STATE: initial state (input)
2095 * STATE: final state (output)
2102 movaps (KEYP), KEY # key
2104 pxor KEY, STATE # round 0
2108 lea 0x20(TKEYP), TKEYP
2111 movaps -0x60(TKEYP), KEY
2113 movaps -0x50(TKEYP), KEY
2117 movaps -0x40(TKEYP), KEY
2119 movaps -0x30(TKEYP), KEY
2123 movaps -0x20(TKEYP), KEY
2125 movaps -0x10(TKEYP), KEY
2129 movaps 0x10(TKEYP), KEY
2131 movaps 0x20(TKEYP), KEY
2133 movaps 0x30(TKEYP), KEY
2135 movaps 0x40(TKEYP), KEY
2137 movaps 0x50(TKEYP), KEY
2139 movaps 0x60(TKEYP), KEY
2141 movaps 0x70(TKEYP), KEY
2142 AESENCLAST KEY STATE
2144 ENDPROC(_aesni_enc1)
2147 * _aesni_enc4: internal ABI
2149 * KEYP: key struct pointer
2151 * STATE1: initial state (input)
2156 * STATE1: final state (output)
2166 movaps (KEYP), KEY # key
2168 pxor KEY, STATE1 # round 0
2175 lea 0x20(TKEYP), TKEYP
2178 movaps -0x60(TKEYP), KEY
2183 movaps -0x50(TKEYP), KEY
2190 movaps -0x40(TKEYP), KEY
2195 movaps -0x30(TKEYP), KEY
2202 movaps -0x20(TKEYP), KEY
2207 movaps -0x10(TKEYP), KEY
2217 movaps 0x10(TKEYP), KEY
2222 movaps 0x20(TKEYP), KEY
2227 movaps 0x30(TKEYP), KEY
2232 movaps 0x40(TKEYP), KEY
2237 movaps 0x50(TKEYP), KEY
2242 movaps 0x60(TKEYP), KEY
2247 movaps 0x70(TKEYP), KEY
2248 AESENCLAST KEY STATE1 # last round
2249 AESENCLAST KEY STATE2
2250 AESENCLAST KEY STATE3
2251 AESENCLAST KEY STATE4
2253 ENDPROC(_aesni_enc4)
2256 * void aesni_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2263 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2264 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2265 movl (FRAME_OFFSET+20)(%esp), INP # src
2267 mov 480(KEYP), KLEN # key length
2269 movups (INP), STATE # input
2271 movups STATE, (OUTP) # output
2281 * _aesni_dec1: internal ABI
2283 * KEYP: key struct pointer
2285 * STATE: initial state (input)
2287 * STATE: final state (output)
2294 movaps (KEYP), KEY # key
2296 pxor KEY, STATE # round 0
2300 lea 0x20(TKEYP), TKEYP
2303 movaps -0x60(TKEYP), KEY
2305 movaps -0x50(TKEYP), KEY
2309 movaps -0x40(TKEYP), KEY
2311 movaps -0x30(TKEYP), KEY
2315 movaps -0x20(TKEYP), KEY
2317 movaps -0x10(TKEYP), KEY
2321 movaps 0x10(TKEYP), KEY
2323 movaps 0x20(TKEYP), KEY
2325 movaps 0x30(TKEYP), KEY
2327 movaps 0x40(TKEYP), KEY
2329 movaps 0x50(TKEYP), KEY
2331 movaps 0x60(TKEYP), KEY
2333 movaps 0x70(TKEYP), KEY
2334 AESDECLAST KEY STATE
2336 ENDPROC(_aesni_dec1)
2339 * _aesni_dec4: internal ABI
2341 * KEYP: key struct pointer
2343 * STATE1: initial state (input)
2348 * STATE1: final state (output)
2358 movaps (KEYP), KEY # key
2360 pxor KEY, STATE1 # round 0
2367 lea 0x20(TKEYP), TKEYP
2370 movaps -0x60(TKEYP), KEY
2375 movaps -0x50(TKEYP), KEY
2382 movaps -0x40(TKEYP), KEY
2387 movaps -0x30(TKEYP), KEY
2394 movaps -0x20(TKEYP), KEY
2399 movaps -0x10(TKEYP), KEY
2409 movaps 0x10(TKEYP), KEY
2414 movaps 0x20(TKEYP), KEY
2419 movaps 0x30(TKEYP), KEY
2424 movaps 0x40(TKEYP), KEY
2429 movaps 0x50(TKEYP), KEY
2434 movaps 0x60(TKEYP), KEY
2439 movaps 0x70(TKEYP), KEY
2440 AESDECLAST KEY STATE1 # last round
2441 AESDECLAST KEY STATE2
2442 AESDECLAST KEY STATE3
2443 AESDECLAST KEY STATE4
2445 ENDPROC(_aesni_dec4)
2448 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2451 ENTRY(aesni_ecb_enc)
2457 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2458 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2459 movl (FRAME_OFFSET+24)(%esp), INP # src
2460 movl (FRAME_OFFSET+28)(%esp), LEN # len
2462 test LEN, LEN # check length
2471 movups (INP), STATE1
2472 movups 0x10(INP), STATE2
2473 movups 0x20(INP), STATE3
2474 movups 0x30(INP), STATE4
2476 movups STATE1, (OUTP)
2477 movups STATE2, 0x10(OUTP)
2478 movups STATE3, 0x20(OUTP)
2479 movups STATE4, 0x30(OUTP)
2489 movups (INP), STATE1
2491 movups STATE1, (OUTP)
2505 ENDPROC(aesni_ecb_enc)
2508 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2511 ENTRY(aesni_ecb_dec)
2517 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2518 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2519 movl (FRAME_OFFSET+24)(%esp), INP # src
2520 movl (FRAME_OFFSET+28)(%esp), LEN # len
2532 movups (INP), STATE1
2533 movups 0x10(INP), STATE2
2534 movups 0x20(INP), STATE3
2535 movups 0x30(INP), STATE4
2537 movups STATE1, (OUTP)
2538 movups STATE2, 0x10(OUTP)
2539 movups STATE3, 0x20(OUTP)
2540 movups STATE4, 0x30(OUTP)
2550 movups (INP), STATE1
2552 movups STATE1, (OUTP)
2566 ENDPROC(aesni_ecb_dec)
2569 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2570 * size_t len, u8 *iv)
2572 ENTRY(aesni_cbc_enc)
2579 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2580 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2581 movl (FRAME_OFFSET+28)(%esp), INP # src
2582 movl (FRAME_OFFSET+32)(%esp), LEN # len
2583 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2588 movups (IVP), STATE # load iv as initial state
2591 movups (INP), IN # load input
2594 movups STATE, (OUTP) # store output
2610 ENDPROC(aesni_cbc_enc)
2613 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2614 * size_t len, u8 *iv)
2616 ENTRY(aesni_cbc_dec)
2623 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2624 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2625 movl (FRAME_OFFSET+28)(%esp), INP # src
2626 movl (FRAME_OFFSET+32)(%esp), LEN # len
2627 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2630 jb .Lcbc_dec_just_ret
2640 movups 0x10(INP), IN2
2643 movups 0x20(INP), IN3
2645 movups 0x30(INP), IN4
2648 movups 0x20(INP), IN1
2650 movups 0x30(INP), IN2
2665 movups 0x10(INP), IN2
2668 movups STATE1, (OUTP)
2669 movups STATE2, 0x10(OUTP)
2670 movups STATE3, 0x20(OUTP)
2671 movups STATE4, 0x30(OUTP)
2685 movups STATE, (OUTP)
2703 ENDPROC(aesni_cbc_dec)
2706 .pushsection .rodata
2709 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2713 * _aesni_inc_init: internal ABI
2714 * setup registers used by _aesni_inc
2718 * CTR: == IV, in little endian
2719 * TCTR_LOW: == lower qword of CTR
2720 * INC: == 1, in little endian
2721 * BSWAP_MASK == endian swapping mask
2725 movaps .Lbswap_mask, BSWAP_MASK
2727 PSHUFB_XMM BSWAP_MASK CTR
2729 MOVQ_R64_XMM TCTR_LOW INC
2730 MOVQ_R64_XMM CTR TCTR_LOW
2732 ENDPROC(_aesni_inc_init)
2735 * _aesni_inc: internal ABI
2736 * Increase IV by 1, IV is in big endian
2739 * CTR: == IV, in little endian
2740 * TCTR_LOW: == lower qword of CTR
2741 * INC: == 1, in little endian
2742 * BSWAP_MASK == endian swapping mask
2746 * CTR: == output IV, in little endian
2747 * TCTR_LOW: == lower qword of CTR
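/*
 * Functionally, _aesni_inc bumps a 128-bit big-endian counter by one; the
 * value is kept byte-swapped in CTR so the addition can be done on the low
 * qword.  A byte-wise C equivalent (sketch, helper name illustrative):
 *
 *	static void ctr_inc_be128(u8 ctr[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)
 *			if (++ctr[i])
 *				break;
 *	}
 */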
2759 PSHUFB_XMM BSWAP_MASK IV
2764 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2765 * size_t len, u8 *iv)
2767 ENTRY(aesni_ctr_enc)
2770 jb .Lctr_enc_just_ret
2773 call _aesni_inc_init
2783 movups 0x10(INP), IN2
2786 movups 0x20(INP), IN3
2789 movups 0x30(INP), IN4
2792 movups STATE1, (OUTP)
2794 movups STATE2, 0x10(OUTP)
2796 movups STATE3, 0x20(OUTP)
2798 movups STATE4, 0x30(OUTP)
2813 movups STATE, (OUTP)
2824 ENDPROC(aesni_ctr_enc)
2827 * _aesni_gf128mul_x_ble: internal ABI
2828 * Multiply in GF(2^128) for XTS IVs
2831 * GF128MUL_MASK == mask with 0x87 and 0x01
2835 * CTR: == temporary value
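/*
 * A plain-C model of the tweak update implemented by this macro: the
 * 16-byte tweak is treated as a 128-bit little-endian value, doubled in
 * GF(2^128) and reduced with 0x87 (sketch, helper name illustrative):
 *
 *	static void gf128mul_x_ble_ref(u8 t[16])
 *	{
 *		int carry = t[15] >> 7;		// bit 127
 *		int i;
 *
 *		for (i = 15; i > 0; i--)
 *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
 *		t[0] <<= 1;
 *		if (carry)
 *			t[0] ^= 0x87;
 *	}
 */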
2837 #define _aesni_gf128mul_x_ble() \
2838 pshufd $0x13, IV, CTR; \
2841 pand GF128MUL_MASK, CTR; \
2845 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2848 ENTRY(aesni_xts_crypt8)
2853 leaq _aesni_enc4, %r11
2854 leaq _aesni_dec4, %rax
2858 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2865 movdqu 0x00(INP), INC
2867 movdqu IV, 0x00(OUTP)
2869 _aesni_gf128mul_x_ble()
2871 movdqu 0x10(INP), INC
2873 movdqu IV, 0x10(OUTP)
2875 _aesni_gf128mul_x_ble()
2877 movdqu 0x20(INP), INC
2879 movdqu IV, 0x20(OUTP)
2881 _aesni_gf128mul_x_ble()
2883 movdqu 0x30(INP), INC
2885 movdqu IV, 0x30(OUTP)
2889 movdqu 0x00(OUTP), INC
2891 movdqu STATE1, 0x00(OUTP)
2893 _aesni_gf128mul_x_ble()
2895 movdqu 0x40(INP), INC
2897 movdqu IV, 0x40(OUTP)
2899 movdqu 0x10(OUTP), INC
2901 movdqu STATE2, 0x10(OUTP)
2903 _aesni_gf128mul_x_ble()
2905 movdqu 0x50(INP), INC
2907 movdqu IV, 0x50(OUTP)
2909 movdqu 0x20(OUTP), INC
2911 movdqu STATE3, 0x20(OUTP)
2913 _aesni_gf128mul_x_ble()
2915 movdqu 0x60(INP), INC
2917 movdqu IV, 0x60(OUTP)
2919 movdqu 0x30(OUTP), INC
2921 movdqu STATE4, 0x30(OUTP)
2923 _aesni_gf128mul_x_ble()
2925 movdqu 0x70(INP), INC
2927 movdqu IV, 0x70(OUTP)
2929 _aesni_gf128mul_x_ble()
2934 movdqu 0x40(OUTP), INC
2936 movdqu STATE1, 0x40(OUTP)
2938 movdqu 0x50(OUTP), INC
2940 movdqu STATE2, 0x50(OUTP)
2942 movdqu 0x60(OUTP), INC
2944 movdqu STATE3, 0x60(OUTP)
2946 movdqu 0x70(OUTP), INC
2948 movdqu STATE4, 0x70(OUTP)
2952 ENDPROC(aesni_xts_crypt8)