2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
34 #include <asm/frame.h>
37 * The following macros are used to move an (un)aligned 16 byte value to/from
38 * an XMM register. This can be done for either FP or integer values: for FP,
39 * use movaps (move aligned packed single); for integer, use movdqa (move
40 * double quad aligned). Since Nehalem (the original Core i7) it makes no
41 * performance difference which instruction is used. However, movaps is a byte
42 * shorter, so that is the one we'll use for now (the same holds for unaligned).
51 .Lgf128mul_x_ble_mask:
52 .octa 0x00000000000000010000000000000087
53 POLY: .octa 0xC2000000000000000000000000000001
54 TWOONE: .octa 0x00000001000000000000000000000001
56 # order of these constants should not change.
57 # more specifically, ALL_F should follow SHIFT_MASK,
58 # and ZERO should follow ALL_F
60 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
61 MASK1: .octa 0x0000000000000000ffffffffffffffff
62 MASK2: .octa 0xffffffffffffffff0000000000000000
63 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
64 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
65 ZERO: .octa 0x00000000000000000000000000000000
66 ONE: .octa 0x00000000000000000000000000000001
67 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
75 #define STACK_OFFSET 8*3
76 #define HashKey 16*0 // store HashKey <<1 mod poly here
77 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
78 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
79 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
80 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
81 // bits of HashKey <<1 mod poly here
82 //(for Karatsuba purposes)
83 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
84 // bits of HashKey^2 <<1 mod poly here
85 // (for Karatsuba purposes)
86 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
87 // bits of HashKey^3 <<1 mod poly here
88 // (for Karatsuba purposes)
89 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
90 // bits of HashKey^4 <<1 mod poly here
91 // (for Karatsuba purposes)
92 #define VARIABLE_OFFSET 16*8
100 #define arg7 STACK_OFFSET+8(%r14)
101 #define arg8 STACK_OFFSET+16(%r14)
102 #define arg9 STACK_OFFSET+24(%r14)
103 #define arg10 STACK_OFFSET+32(%r14)
104 #define keysize 2*15*16(%arg1)
121 #define BSWAP_MASK %xmm10
125 #define GF128MUL_MASK %xmm10
155 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
158 * Input: A and B (128-bits each, bit-reflected)
159 * Output: C = A*B*x mod poly, (i.e. >>1 )
160 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
161 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
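*
* For reference, a plain C model of the GF(2^128) multiplication that GHASH
* performs, following NIST SP 800-38D, Algorithm 1. This is a sketch of the
* spec-level operation only; the macro below instead works on byte-reflected
* data with HashKey<<1 (mod poly), using PCLMULQDQ and a two-phase shift
* reduction. The be128 type and function name are illustrative, not from this
* file.
*
*	#include <stdint.h>
*
*	struct be128 { uint64_t hi, lo; };   // hi = first 8 bytes of the block
*
*	static struct be128 gf128_mul(struct be128 x, struct be128 y)
*	{
*		struct be128 z = { 0, 0 }, v = y;
*
*		for (int i = 0; i < 128; i++) {
*			// bit i of x, counting from the leftmost bit
*			uint64_t bit = (i < 64) ? (x.hi >> (63 - i)) & 1
*						: (x.lo >> (127 - i)) & 1;
*			if (bit) {
*				z.hi ^= v.hi;
*				z.lo ^= v.lo;
*			}
*			// V = V >> 1; if a bit fell off, reduce with R = 0xE1 || 0^120
*			uint64_t lsb = v.lo & 1;
*			v.lo = (v.lo >> 1) | (v.hi << 63);
*			v.hi >>= 1;
*			if (lsb)
*				v.hi ^= 0xE100000000000000ULL;
*		}
*		return z;
*	}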
164 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
166 pshufd $78, \GH, \TMP2
167 pshufd $78, \HK, \TMP3
168 pxor \GH, \TMP2 # TMP2 = a1+a0
169 pxor \HK, \TMP3 # TMP3 = b1+b0
170 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
171 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
172 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
174 pxor \TMP1, \TMP2 # TMP2 = a0*b1 + a1*b0 (Karatsuba middle term)
176 pslldq $8, \TMP3 # left shift TMP3 2 DWs
177 psrldq $8, \TMP2 # right shift TMP2 2 DWs
179 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
181 # first phase of the reduction
185 movdqa \GH, \TMP4 # copy GH into TMP2, TMP3 and TMP4
186 # in order to perform
188 pslld $31, \TMP2 # packed left shift << 31
189 pslld $30, \TMP3 # packed left shift << 30
190 pslld $25, \TMP4 # packed left shift << 25
191 pxor \TMP3, \TMP2 # xor the shifted versions
194 psrldq $4, \TMP5 # right shift TMP5 1 DW
195 pslldq $12, \TMP2 # left shift TMP2 3 DWs
198 # second phase of the reduction
200 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
201 # in order to perform
205 psrld $1,\TMP2 # packed right shift >> 1
206 psrld $2,\TMP3 # packed right shift >> 2
207 psrld $7,\TMP4 # packed right shift >> 7
208 pxor \TMP3,\TMP2 # xor the shifted versions
212 pxor \TMP1, \GH # result is in GH
216 * if a = number of total plaintext bytes
218 * num_initial_blocks = b mod 4, where b = number of complete 16-byte blocks
219 * encrypt the initial num_initial_blocks blocks and apply ghash on
221 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
223 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
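*
* Illustrative arithmetic only (names not from this file): how the total byte
* count maps to the block counts used below, assuming plaintext_len is the
* length in bytes:
*
*	u64 full_blocks        = plaintext_len / 16;
*	u64 num_initial_blocks = full_blocks % 4;          // 0..3, handled here
*	u64 parallel_blocks    = full_blocks - num_initial_blocks; // 4 at a time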
227 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
228 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
229 MOVADQ SHUF_MASK(%rip), %xmm14
230 mov arg7, %r10 # %r10 = AAD
231 mov arg8, %r12 # %r12 = aadLen
235 _get_AAD_loop\num_initial_blocks\operation:
242 jne _get_AAD_loop\num_initial_blocks\operation
245 je _get_AAD_loop2_done\num_initial_blocks\operation
248 _get_AAD_loop2\num_initial_blocks\operation:
252 jne _get_AAD_loop2\num_initial_blocks\operation
254 _get_AAD_loop2_done\num_initial_blocks\operation:
255 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
257 xor %r11, %r11 # initialise the data pointer offset as zero
259 # start AES for num_initial_blocks blocks
261 mov %arg5, %rax # %rax = *Y0
262 movdqu (%rax), \XMM0 # XMM0 = Y0
263 PSHUFB_XMM %xmm14, \XMM0
265 .if (\i == 5) || (\i == 6) || (\i == 7)
266 MOVADQ ONE(%RIP),\TMP1
269 paddd \TMP1, \XMM0 # INCR Y0
270 movdqa \XMM0, %xmm\index
271 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
272 pxor \TMP2, %xmm\index
276 shr $2,%eax # 128->4, 192->6, 256->8
277 add $5,%eax # 128->9, 192->11, 256->13
279 aes_loop_initial_dec\num_initial_blocks:
282 AESENC \TMP1, %xmm\index
286 jnz aes_loop_initial_dec\num_initial_blocks
290 AESENCLAST \TMP1, %xmm\index # Last Round
293 movdqu (%arg3 , %r11, 1), \TMP1
294 pxor \TMP1, %xmm\index
295 movdqu %xmm\index, (%arg2 , %r11, 1)
296 # write back plaintext/ciphertext for num_initial_blocks
299 movdqa \TMP1, %xmm\index
300 PSHUFB_XMM %xmm14, %xmm\index
301 # prepare plaintext/ciphertext for GHASH computation
304 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
305 # apply GHASH on num_initial_blocks blocks
309 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
311 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
313 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
324 jl _initial_blocks_done\num_initial_blocks\operation
325 # no need for precomputed values
328 * Precomputations for HashKey parallel with encryption of first 4 blocks.
329 * HashKey_i_k holds the XOR of the low and high parts of HashKey_i
331 MOVADQ ONE(%rip), \TMP1
332 paddd \TMP1, \XMM0 # INCR Y0
334 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
336 paddd \TMP1, \XMM0 # INCR Y0
338 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
340 paddd \TMP1, \XMM0 # INCR Y0
342 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
344 paddd \TMP1, \XMM0 # INCR Y0
346 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
348 MOVADQ 0(%arg1),\TMP1
354 pshufd $78, \TMP3, \TMP1
356 movdqa \TMP1, HashKey_k(%rsp)
357 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
358 # TMP5 = HashKey^2<<1 (mod poly)
359 movdqa \TMP5, HashKey_2(%rsp)
360 # HashKey_2 = HashKey^2<<1 (mod poly)
361 pshufd $78, \TMP5, \TMP1
363 movdqa \TMP1, HashKey_2_k(%rsp)
364 .irpc index, 1234 # do 4 rounds
365 movaps 0x10*\index(%arg1), \TMP1
371 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
372 # TMP5 = HashKey^3<<1 (mod poly)
373 movdqa \TMP5, HashKey_3(%rsp)
374 pshufd $78, \TMP5, \TMP1
376 movdqa \TMP1, HashKey_3_k(%rsp)
377 .irpc index, 56789 # do next 5 rounds
378 movaps 0x10*\index(%arg1), \TMP1
384 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
385 # TMP5 = HashKey^4<<1 (mod poly)
386 movdqa \TMP5, HashKey_4(%rsp)
387 pshufd $78, \TMP5, \TMP1
389 movdqa \TMP1, HashKey_4_k(%rsp)
392 shr $2,%eax # 128->4, 192->6, 256->8
393 sub $4,%eax # 128->0, 192->2, 256->4
394 jz aes_loop_pre_dec_done\num_initial_blocks
396 aes_loop_pre_dec\num_initial_blocks:
399 AESENC \TMP2, %xmm\index
403 jnz aes_loop_pre_dec\num_initial_blocks
405 aes_loop_pre_dec_done\num_initial_blocks:
407 AESENCLAST \TMP2, \XMM1
408 AESENCLAST \TMP2, \XMM2
409 AESENCLAST \TMP2, \XMM3
410 AESENCLAST \TMP2, \XMM4
411 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
413 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
415 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
417 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
419 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
421 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
423 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
425 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
428 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
430 # combine GHASHed value with the corresponding ciphertext
431 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
432 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
433 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
435 _initial_blocks_done\num_initial_blocks\operation:
441 * if a = number of total plaintext bytes
443 * num_initial_blocks = b mod 4, where b = number of complete 16-byte blocks
444 * encrypt the initial num_initial_blocks blocks and apply ghash on
446 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
448 * arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
452 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
453 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
454 MOVADQ SHUF_MASK(%rip), %xmm14
455 mov arg7, %r10 # %r10 = AAD
456 mov arg8, %r12 # %r12 = aadLen
459 _get_AAD_loop\num_initial_blocks\operation:
466 jne _get_AAD_loop\num_initial_blocks\operation
468 je _get_AAD_loop2_done\num_initial_blocks\operation
470 _get_AAD_loop2\num_initial_blocks\operation:
474 jne _get_AAD_loop2\num_initial_blocks\operation
475 _get_AAD_loop2_done\num_initial_blocks\operation:
476 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
478 xor %r11, %r11 # initialise the data pointer offset as zero
480 # start AES for num_initial_blocks blocks
482 mov %arg5, %rax # %rax = *Y0
483 movdqu (%rax), \XMM0 # XMM0 = Y0
484 PSHUFB_XMM %xmm14, \XMM0
486 .if (\i == 5) || (\i == 6) || (\i == 7)
488 MOVADQ ONE(%RIP),\TMP1
489 MOVADQ 0(%arg1),\TMP2
491 paddd \TMP1, \XMM0 # INCR Y0
492 MOVADQ \XMM0, %xmm\index
493 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
494 pxor \TMP2, %xmm\index
498 shr $2,%eax # 128->4, 192->6, 256->8
499 add $5,%eax # 128->9, 192->11, 256->13
501 aes_loop_initial_enc\num_initial_blocks:
504 AESENC \TMP1, %xmm\index
508 jnz aes_loop_initial_enc\num_initial_blocks
512 AESENCLAST \TMP1, %xmm\index # Last Round
515 movdqu (%arg3 , %r11, 1), \TMP1
516 pxor \TMP1, %xmm\index
517 movdqu %xmm\index, (%arg2 , %r11, 1)
518 # write back plaintext/ciphertext for num_initial_blocks
520 PSHUFB_XMM %xmm14, %xmm\index
522 # prepare plaintext/ciphertext for GHASH computation
525 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
526 # apply GHASH on num_initial_blocks blocks
530 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
532 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
534 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
537 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
539 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
542 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
545 jl _initial_blocks_done\num_initial_blocks\operation
546 # no need for precomputed values
549 * Precomputations for HashKey parallel with encryption of first 4 blocks.
550 * HashKey_i_k holds the XOR of the low and high parts of HashKey_i
552 MOVADQ ONE(%RIP),\TMP1
553 paddd \TMP1, \XMM0 # INCR Y0
555 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
557 paddd \TMP1, \XMM0 # INCR Y0
559 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
561 paddd \TMP1, \XMM0 # INCR Y0
563 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
565 paddd \TMP1, \XMM0 # INCR Y0
567 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
569 MOVADQ 0(%arg1),\TMP1
575 pshufd $78, \TMP3, \TMP1
577 movdqa \TMP1, HashKey_k(%rsp)
578 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
579 # TMP5 = HashKey^2<<1 (mod poly)
580 movdqa \TMP5, HashKey_2(%rsp)
581 # HashKey_2 = HashKey^2<<1 (mod poly)
582 pshufd $78, \TMP5, \TMP1
584 movdqa \TMP1, HashKey_2_k(%rsp)
585 .irpc index, 1234 # do 4 rounds
586 movaps 0x10*\index(%arg1), \TMP1
592 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
593 # TMP5 = HashKey^3<<1 (mod poly)
594 movdqa \TMP5, HashKey_3(%rsp)
595 pshufd $78, \TMP5, \TMP1
597 movdqa \TMP1, HashKey_3_k(%rsp)
598 .irpc index, 56789 # do next 5 rounds
599 movaps 0x10*\index(%arg1), \TMP1
605 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
606 # TMP5 = HashKey^4<<1 (mod poly)
607 movdqa \TMP5, HashKey_4(%rsp)
608 pshufd $78, \TMP5, \TMP1
610 movdqa \TMP1, HashKey_4_k(%rsp)
613 shr $2,%eax # 128->4, 192->6, 256->8
614 sub $4,%eax # 128->0, 192->2, 256->4
615 jz aes_loop_pre_enc_done\num_initial_blocks
617 aes_loop_pre_enc\num_initial_blocks:
620 AESENC \TMP2, %xmm\index
624 jnz aes_loop_pre_enc\num_initial_blocks
626 aes_loop_pre_enc_done\num_initial_blocks:
628 AESENCLAST \TMP2, \XMM1
629 AESENCLAST \TMP2, \XMM2
630 AESENCLAST \TMP2, \XMM3
631 AESENCLAST \TMP2, \XMM4
632 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
634 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
636 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
638 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
640 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
641 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
642 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
643 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
646 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
648 # combine GHASHed value with the corresponding ciphertext
649 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
650 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
651 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
653 _initial_blocks_done\num_initial_blocks\operation:
658 * encrypt 4 blocks at a time
659 * ghash the 4 previously encrypted ciphertext blocks
660 * arg1, %arg2, %arg3 are used as pointers only, not modified
661 * %r11 is the data offset value
663 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
664 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
671 movdqa SHUF_MASK(%rip), %xmm15
672 # multiply TMP5 * HashKey using Karatsuba
675 pshufd $78, \XMM5, \TMP6
677 paddd ONE(%rip), \XMM0 # INCR CNT
678 movdqa HashKey_4(%rsp), \TMP5
679 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
681 paddd ONE(%rip), \XMM0 # INCR CNT
683 paddd ONE(%rip), \XMM0 # INCR CNT
685 paddd ONE(%rip), \XMM0 # INCR CNT
687 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
688 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
689 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
690 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
691 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
697 movdqa HashKey_4_k(%rsp), \TMP5
698 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
699 movaps 0x10(%arg1), \TMP1
700 AESENC \TMP1, \XMM1 # Round 1
704 movaps 0x20(%arg1), \TMP1
705 AESENC \TMP1, \XMM1 # Round 2
710 pshufd $78, \XMM6, \TMP2
712 movdqa HashKey_3(%rsp), \TMP5
713 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
714 movaps 0x30(%arg1), \TMP3
715 AESENC \TMP3, \XMM1 # Round 3
719 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
720 movaps 0x40(%arg1), \TMP3
721 AESENC \TMP3, \XMM1 # Round 4
725 movdqa HashKey_3_k(%rsp), \TMP5
726 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
727 movaps 0x50(%arg1), \TMP3
728 AESENC \TMP3, \XMM1 # Round 5
733 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
737 pshufd $78, \XMM7, \TMP2
739 movdqa HashKey_2(%rsp ), \TMP5
741 # Multiply TMP5 * HashKey using Karatsuba
743 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
744 movaps 0x60(%arg1), \TMP3
745 AESENC \TMP3, \XMM1 # Round 6
749 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
750 movaps 0x70(%arg1), \TMP3
751 AESENC \TMP3, \XMM1 # Round 7
755 movdqa HashKey_2_k(%rsp), \TMP5
756 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
757 movaps 0x80(%arg1), \TMP3
758 AESENC \TMP3, \XMM1 # Round 8
763 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
767 # Multiply XMM8 * HashKey
768 # XMM8 and TMP5 hold the values for the two operands
771 pshufd $78, \XMM8, \TMP2
773 movdqa HashKey(%rsp), \TMP5
774 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
775 movaps 0x90(%arg1), \TMP3
776 AESENC \TMP3, \XMM1 # Round 9
780 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
783 shr $2,%eax # 128->4, 192->6, 256->8
784 sub $4,%eax # 128->0, 192->2, 256->4
785 jz aes_loop_par_enc_done
790 AESENC \TMP3, %xmm\index
796 aes_loop_par_enc_done:
798 AESENCLAST \TMP3, \XMM1 # Round 10
799 AESENCLAST \TMP3, \XMM2
800 AESENCLAST \TMP3, \XMM3
801 AESENCLAST \TMP3, \XMM4
802 movdqa HashKey_k(%rsp), \TMP5
803 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
804 movdqu (%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
806 movdqu 16(%arg3,%r11,1), \TMP3
807 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
808 movdqu 32(%arg3,%r11,1), \TMP3
809 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
810 movdqu 48(%arg3,%r11,1), \TMP3
811 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
812 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
813 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
814 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
815 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
816 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
817 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
818 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
819 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
827 pslldq $8, \TMP3 # left shift TMP3 2 DWs
828 psrldq $8, \TMP2 # right shift TMP2 2 DWs
830 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
832 # first phase of reduction
837 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
838 pslld $31, \TMP2 # packed left shift << 31
839 pslld $30, \TMP3 # packed left shift << 30
840 pslld $25, \TMP4 # packed left shift << 25
841 pxor \TMP3, \TMP2 # xor the shifted versions
844 psrldq $4, \TMP5 # right shift TMP5 1 DW
845 pslldq $12, \TMP2 # left shift TMP2 3 DWs
848 # second phase of reduction
850 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
853 psrld $1, \TMP2 # packed right shift >> 1
854 psrld $2, \TMP3 # packed right shift >> 2
855 psrld $7, \TMP4 # packed right shift >> 7
856 pxor \TMP3,\TMP2 # xor the shifted versions
860 pxor \TMP1, \XMM5 # result is in XMM5
866 * decrypt 4 blocks at a time
867 * ghash the 4 previously decrypted ciphertext blocks
868 * arg1, %arg2, %arg3 are used as pointers only, not modified
869 * %r11 is the data offset value
871 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
872 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
879 movdqa SHUF_MASK(%rip), %xmm15
880 # multiply TMP5 * HashKey using Karatsuba
883 pshufd $78, \XMM5, \TMP6
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa HashKey_4(%rsp), \TMP5
887 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
889 paddd ONE(%rip), \XMM0 # INCR CNT
891 paddd ONE(%rip), \XMM0 # INCR CNT
893 paddd ONE(%rip), \XMM0 # INCR CNT
895 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
896 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
897 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
898 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
899 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
905 movdqa HashKey_4_k(%rsp), \TMP5
906 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
907 movaps 0x10(%arg1), \TMP1
908 AESENC \TMP1, \XMM1 # Round 1
912 movaps 0x20(%arg1), \TMP1
913 AESENC \TMP1, \XMM1 # Round 2
918 pshufd $78, \XMM6, \TMP2
920 movdqa HashKey_3(%rsp), \TMP5
921 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
922 movaps 0x30(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 3
927 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
928 movaps 0x40(%arg1), \TMP3
929 AESENC \TMP3, \XMM1 # Round 4
933 movdqa HashKey_3_k(%rsp), \TMP5
934 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
935 movaps 0x50(%arg1), \TMP3
936 AESENC \TMP3, \XMM1 # Round 5
941 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
945 pshufd $78, \XMM7, \TMP2
947 movdqa HashKey_2(%rsp ), \TMP5
949 # Multiply TMP5 * HashKey using Karatsuba
951 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
952 movaps 0x60(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 6
957 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
958 movaps 0x70(%arg1), \TMP3
959 AESENC \TMP3, \XMM1 # Round 7
963 movdqa HashKey_2_k(%rsp), \TMP5
964 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
965 movaps 0x80(%arg1), \TMP3
966 AESENC \TMP3, \XMM1 # Round 8
971 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
975 # Multiply XMM8 * HashKey
976 # XMM8 and TMP5 hold the values for the two operands
979 pshufd $78, \XMM8, \TMP2
981 movdqa HashKey(%rsp), \TMP5
982 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
983 movaps 0x90(%arg1), \TMP3
984 AESENC \TMP3, \XMM1 # Round 9
988 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
991 shr $2,%eax # 128->4, 192->6, 256->8
992 sub $4,%eax # 128->0, 192->2, 256->4
993 jz aes_loop_par_dec_done
998 AESENC \TMP3, %xmm\index
1002 jnz aes_loop_par_dec
1004 aes_loop_par_dec_done:
1005 MOVADQ (%r10), \TMP3
1006 AESENCLAST \TMP3, \XMM1 # last round
1007 AESENCLAST \TMP3, \XMM2
1008 AESENCLAST \TMP3, \XMM3
1009 AESENCLAST \TMP3, \XMM4
1010 movdqa HashKey_k(%rsp), \TMP5
1011 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1012 movdqu (%arg3,%r11,1), \TMP3
1013 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1014 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1016 movdqu 16(%arg3,%r11,1), \TMP3
1017 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1018 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1020 movdqu 32(%arg3,%r11,1), \TMP3
1021 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1022 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1024 movdqu 48(%arg3,%r11,1), \TMP3
1025 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1026 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1028 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1029 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1030 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1031 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1039 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1040 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1042 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1044 # first phase of reduction
1049 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1050 pslld $31, \TMP2 # packed left shift << 31
1051 pslld $30, \TMP3 # packed left shift << 30
1052 pslld $25, \TMP4 # packed left shift << 25
1053 pxor \TMP3, \TMP2 # xor the shifted versions
1056 psrldq $4, \TMP5 # right shift TMP5 1 DW
1057 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1060 # second phase of reduction
1062 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1065 psrld $1, \TMP2 # packed right shift >> 1
1066 psrld $2, \TMP3 # packed right shift >> 2
1067 psrld $7, \TMP4 # packed right shift >> 7
1068 pxor \TMP3,\TMP2 # xor the shifted versions
1072 pxor \TMP1, \XMM5 # result is in XMM5
1077 /* GHASH the last 4 ciphertext blocks. */
1078 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1079 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1081 # Multiply TMP6 * HashKey (using Karatsuba)
1084 pshufd $78, \XMM1, \TMP2
1086 movdqa HashKey_4(%rsp), \TMP5
1087 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1088 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1089 movdqa HashKey_4_k(%rsp), \TMP4
1090 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1091 movdqa \XMM1, \XMMDst
1092 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1094 # Multiply TMP1 * HashKey (using Karatsuba)
1097 pshufd $78, \XMM2, \TMP2
1099 movdqa HashKey_3(%rsp), \TMP5
1100 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1101 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1102 movdqa HashKey_3_k(%rsp), \TMP4
1103 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1107 # results accumulated in TMP6, XMMDst, XMM1
1109 # Multiply TMP1 * HashKey (using Karatsuba)
1112 pshufd $78, \XMM3, \TMP2
1114 movdqa HashKey_2(%rsp), \TMP5
1115 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1116 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1117 movdqa HashKey_2_k(%rsp), \TMP4
1118 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1121 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1123 # Multiply TMP1 * HashKey (using Karatsuba)
1125 pshufd $78, \XMM4, \TMP2
1127 movdqa HashKey(%rsp), \TMP5
1128 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1129 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1130 movdqa HashKey_k(%rsp), \TMP4
1131 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1137 # middle section of the temp results combined as in the Karatsuba algorithm
1139 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1140 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1143 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1144 # first phase of the reduction
1145 movdqa \XMMDst, \TMP2
1146 movdqa \XMMDst, \TMP3
1147 movdqa \XMMDst, \TMP4
1148 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1149 pslld $31, \TMP2 # packed left shifting << 31
1150 pslld $30, \TMP3 # packed left shifting << 30
1151 pslld $25, \TMP4 # packed left shifting << 25
1152 pxor \TMP3, \TMP2 # xor the shifted versions
1155 psrldq $4, \TMP7 # right shift TMP7 1 DW
1156 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1159 # second phase of the reduction
1160 movdqa \XMMDst, \TMP2
1161 # make 3 copies of XMMDst for doing 3 shift operations
1162 movdqa \XMMDst, \TMP3
1163 movdqa \XMMDst, \TMP4
1164 psrld $1, \TMP2 # packed right shift >> 1
1165 psrld $2, \TMP3 # packed right shift >> 2
1166 psrld $7, \TMP4 # packed right shift >> 7
1167 pxor \TMP3, \TMP2 # xor the shifted versions
1171 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1175 /* Encryption of a single block
1179 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1183 shr $2,%eax # 128->4, 192->6, 256->8
1184 add $5,%eax # 128->9, 192->11, 256->13
1185 lea 16(%arg1), %r10 # get first expanded key address
1195 AESENCLAST \TMP1,\XMM0
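/*
 * The shr/add sequence above derives the AES round count from the key length
 * stored in the context. A C sketch of the same mapping (key_length in bytes;
 * illustrative helper, not part of this file):
 *
 *	static int aes_nrounds(unsigned int key_length)
 *	{
 *		return key_length / 4 + 5;	// 16 -> 9, 24 -> 11, 32 -> 13
 *	}
 */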
1197 /*****************************************************************************
1198 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1199 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1200 * const u8 *in, // Ciphertext input
1201 * u64 plaintext_len, // Length of data in bytes for decryption.
1202 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1203 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1204 * // concatenated with 0x00000001. 16-byte aligned pointer.
1205 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1206 * const u8 *aad, // Additional Authentication Data (AAD)
1207 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1208 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1209 * // given authentication tag and only return the plaintext if they match.
1210 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1211 * // (most likely), 12 or 8.
1216 * Keys are pre-expanded and aligned to 16 bytes. We are using the first
1217 * set of 11 keys in the data structure void *aes_ctx.
1221 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1222 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1223 * | Salt (From the SA) |
1224 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1225 * | Initialization Vector |
1226 * | (This is the sequence number from IPSec header) |
1227 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1229 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1234 * AAD padded to 128 bits with 0
1235 * for example, assume AAD is a u32 vector
1237 * if AAD is 8 bytes:
1238 * AAD[2] = {A0, A1};
1239 * padded AAD in xmm register = {A1 A0 0 0}
1242 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1245 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246 * | 32-bit Sequence Number (A0) |
1247 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251 * AAD Format with 32-bit Sequence Number
1253 * if AAD is 12 bytes:
1254 * AAD[3] = {A0, A1, A2};
1255 * padded AAD in xmm register = {A2 A1 A0 0}
1258 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1259 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1260 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1261 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1263 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1264 * | 64-bit Extended Sequence Number {A1,A0} |
1266 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1268 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1270 * AAD Format with 64-bit Extended Sequence Number
1273 * From the definition of the spec, aadLen can only be 8 or 12 bytes.
1274 * The code also supports an aadLen of 16; for any other size it will fail.
1277 * From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1278 * For any other size, the code will fail.
1280 * poly = x^128 + x^127 + x^126 + x^121 + 1
1282 *****************************************************************************/
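/*
 * A C sketch of the pre-counter block j0 described above (the helper name,
 * 'salt' and 'seq_iv' are illustrative, not from this file; assumes kernel
 * u8 and memcpy):
 *
 *	static void build_rfc4106_j0(u8 j0[16], const u8 salt[4], const u8 seq_iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// 4-byte salt from the SA
 *		memcpy(j0 + 4, seq_iv, 8);	// 8-byte IV from the ESP payload
 *		j0[12] = j0[13] = j0[14] = 0;	// 32-bit big-endian block counter
 *		j0[15] = 1;			// initialised to 0x00000001
 *	}
 */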
1283 ENTRY(aesni_gcm_dec)
1289 * states of %xmm registers %xmm6:%xmm15 not saved
1290 * all %xmm registers are clobbered
1292 sub $VARIABLE_OFFSET, %rsp
1293 and $~63, %rsp # align rsp to 64 bytes
1295 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1296 movdqa SHUF_MASK(%rip), %xmm2
1297 PSHUFB_XMM %xmm2, %xmm13
1300 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1302 movdqa %xmm13, %xmm2
1312 pshufd $0x24, %xmm1, %xmm2
1313 pcmpeqd TWOONE(%rip), %xmm2
1314 pand POLY(%rip), %xmm2
1315 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1318 # Decrypt first few blocks
1320 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1321 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1322 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1325 jz _initial_num_blocks_is_0_decrypt
1327 jb _initial_num_blocks_is_1_decrypt
1328 je _initial_num_blocks_is_2_decrypt
1329 _initial_num_blocks_is_3_decrypt:
1330 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1331 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1333 jmp _initial_blocks_decrypted
1334 _initial_num_blocks_is_2_decrypt:
1335 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1336 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1338 jmp _initial_blocks_decrypted
1339 _initial_num_blocks_is_1_decrypt:
1340 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1341 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1343 jmp _initial_blocks_decrypted
1344 _initial_num_blocks_is_0_decrypt:
1345 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1346 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1347 _initial_blocks_decrypted:
1349 je _zero_cipher_left_decrypt
1351 je _four_cipher_left_decrypt
1353 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1354 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1358 _four_cipher_left_decrypt:
1359 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1360 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1361 _zero_cipher_left_decrypt:
1363 and $15, %r13 # %r13 = arg4 (mod 16)
1364 je _multiple_of_16_bytes_decrypt
1366 # Handle the last <16 byte block separately
1368 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1369 movdqa SHUF_MASK(%rip), %xmm10
1370 PSHUFB_XMM %xmm10, %xmm0
1372 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1375 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1376 lea SHIFT_MASK+16(%rip), %r12
1378 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1379 # (%r13 is the number of bytes in plaintext mod 16)
1380 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1381 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1384 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1385 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1386 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1387 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1389 movdqa SHUF_MASK(%rip), %xmm10
1390 PSHUFB_XMM %xmm10, %xmm2
1393 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1394 # GHASH computation for the last <16 byte block
1399 MOVQ_R64_XMM %xmm0, %rax
1401 jle _less_than_8_bytes_left_decrypt
1402 mov %rax, (%arg2 , %r11, 1)
1405 MOVQ_R64_XMM %xmm0, %rax
1407 _less_than_8_bytes_left_decrypt:
1408 mov %al, (%arg2, %r11, 1)
1412 jne _less_than_8_bytes_left_decrypt
1413 _multiple_of_16_bytes_decrypt:
1414 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1415 shl $3, %r12 # convert into number of bits
1416 movd %r12d, %xmm15 # len(A) in %xmm15
1417 shl $3, %arg4 # len(C) in bits (*8)
1418 MOVQ_R64_XMM %arg4, %xmm1
1419 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1420 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
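# At the spec level, the block just built is [len(A)]64 || [len(C)]64 with both
# lengths in bits, big-endian. Because every GHASH input here is kept
# byte-reflected, %xmm15 holds the byte-reflected image of that block
# (len(C) little-endian in the low qword, len(A) in the high qword).
# Illustrative C for the spec-level block (names not from this file):
#
#	static void build_len_block(u8 blk[16], u64 aad_len, u64 text_len)
#	{
#		u64 abits = aad_len * 8, cbits = text_len * 8;
#		for (int i = 0; i < 8; i++) {
#			blk[7 - i]  = (u8)(abits >> (8 * i));	// len(A), big-endian
#			blk[15 - i] = (u8)(cbits >> (8 * i));	// len(C), big-endian
#		}
#	}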
1422 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1423 # final GHASH computation
1424 movdqa SHUF_MASK(%rip), %xmm10
1425 PSHUFB_XMM %xmm10, %xmm8
1427 mov %arg5, %rax # %rax = *Y0
1428 movdqu (%rax), %xmm0 # %xmm0 = Y0
1429 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1432 mov arg9, %r10 # %r10 = authTag
1433 mov arg10, %r11 # %r11 = auth_tag_len
1439 MOVQ_R64_XMM %xmm0, %rax
1441 jmp _return_T_done_decrypt
1443 MOVQ_R64_XMM %xmm0, %rax
1448 jmp _return_T_done_decrypt
1450 movdqu %xmm0, (%r10)
1451 _return_T_done_decrypt:
1457 ENDPROC(aesni_gcm_dec)
1460 /*****************************************************************************
1461 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1462 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1463 * const u8 *in, // Plaintext input
1464 * u64 plaintext_len, // Length of data in bytes for encryption.
1465 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1466 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1467 * // concatenated with 0x00000001. 16-byte aligned pointer.
1468 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1469 * const u8 *aad, // Additional Authentication Data (AAD)
1470 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1471 * u8 *auth_tag, // Authenticated Tag output.
1472 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1478 * Keys are pre-expanded and aligned to 16 bytes. We are using the
1479 * first set of 11 keys in the data structure void *aes_ctx.
1484 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1485 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1486 * | Salt (From the SA) |
1487 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1488 * | Initialization Vector |
1489 * | (This is the sequence number from IPSec header) |
1490 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1492 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1497 * AAD padded to 128 bits with 0
1498 * for example, assume AAD is a u32 vector
1500 * if AAD is 8 bytes:
1501 * AAD[2] = {A0, A1};
1502 * padded AAD in xmm register = {A1 A0 0 0}
1505 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1506 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1508 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1509 * | 32-bit Sequence Number (A0) |
1510 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1514 * AAD Format with 32-bit Sequence Number
1516 * if AAD is 12 bytes:
1517 * AAD[3] = {A0, A1, A2};
1518 * padded AAD in xmm register = {A2 A1 A0 0}
1521 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1522 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1524 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1525 * | 64-bit Extended Sequence Number {A1,A0} |
1527 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1529 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1531 * AAD Format with 64-bit Extended Sequence Number
1534 * From the definition of the spec, aadLen can only be 8 or 12 bytes.
1535 * The code also supports an aadLen of 16; for any other size it will fail.
1538 * From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1539 * For any other size, the code will fail.
1541 * poly = x^128 + x^127 + x^126 + x^121 + 1
1542 ***************************************************************************/
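/*
 * A C sketch of the AAD padding described above, for the 8- and 12-byte cases
 * RFC4106 allows (illustrative helper, not from this file; assumes kernel u8,
 * memset and memcpy):
 *
 *	static void pad_aad_block(u8 blk[16], const u8 *aad, u64 aad_len)
 *	{
 *		memset(blk, 0, 16);		// pad to 128 bits with zeros
 *		memcpy(blk, aad, aad_len);	// aad_len is 8 or 12 here
 *	}
 */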
1543 ENTRY(aesni_gcm_enc)
1549 # states of %xmm registers %xmm6:%xmm15 not saved
1550 # all %xmm registers are clobbered
1552 sub $VARIABLE_OFFSET, %rsp
1555 movdqu (%r12), %xmm13
1556 movdqa SHUF_MASK(%rip), %xmm2
1557 PSHUFB_XMM %xmm2, %xmm13
1560 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1562 movdqa %xmm13, %xmm2
1572 pshufd $0x24, %xmm1, %xmm2
1573 pcmpeqd TWOONE(%rip), %xmm2
1574 pand POLY(%rip), %xmm2
1576 movdqa %xmm13, HashKey(%rsp)
1577 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1581 # Encrypt first few blocks
1584 jz _initial_num_blocks_is_0_encrypt
1586 jb _initial_num_blocks_is_1_encrypt
1587 je _initial_num_blocks_is_2_encrypt
1588 _initial_num_blocks_is_3_encrypt:
1589 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1590 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1592 jmp _initial_blocks_encrypted
1593 _initial_num_blocks_is_2_encrypt:
1594 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1595 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1597 jmp _initial_blocks_encrypted
1598 _initial_num_blocks_is_1_encrypt:
1599 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1600 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1602 jmp _initial_blocks_encrypted
1603 _initial_num_blocks_is_0_encrypt:
1604 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1605 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1606 _initial_blocks_encrypted:
1608 # Main loop - Encrypt remaining blocks
1611 je _zero_cipher_left_encrypt
1613 je _four_cipher_left_encrypt
1614 _encrypt_by_4_encrypt:
1615 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1616 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1619 jne _encrypt_by_4_encrypt
1620 _four_cipher_left_encrypt:
1621 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1622 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1623 _zero_cipher_left_encrypt:
1625 and $15, %r13 # %r13 = arg4 (mod 16)
1626 je _multiple_of_16_bytes_encrypt
1628 # Handle the last <16 Byte block separately
1629 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10, %xmm0
1634 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1637 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1638 lea SHIFT_MASK+16(%rip), %r12
1640 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1641 # (%r13 is the number of bytes in plaintext mod 16)
1642 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1643 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 bytes
1644 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1645 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1646 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1647 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1648 movdqa SHUF_MASK(%rip), %xmm10
1649 PSHUFB_XMM %xmm10,%xmm0
1652 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1653 # GHASH computation for the last <16 byte block
1657 movdqa SHUF_MASK(%rip), %xmm10
1658 PSHUFB_XMM %xmm10, %xmm0
1660 # shuffle xmm0 back to output as ciphertext
1663 MOVQ_R64_XMM %xmm0, %rax
1665 jle _less_than_8_bytes_left_encrypt
1666 mov %rax, (%arg2 , %r11, 1)
1669 MOVQ_R64_XMM %xmm0, %rax
1671 _less_than_8_bytes_left_encrypt:
1672 mov %al, (%arg2, %r11, 1)
1676 jne _less_than_8_bytes_left_encrypt
1677 _multiple_of_16_bytes_encrypt:
1678 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1680 movd %r12d, %xmm15 # len(A) in %xmm15
1681 shl $3, %arg4 # len(C) in bits (*8)
1682 MOVQ_R64_XMM %arg4, %xmm1
1683 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1684 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1686 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1687 # final GHASH computation
1688 movdqa SHUF_MASK(%rip), %xmm10
1689 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1691 mov %arg5, %rax # %rax = *Y0
1692 movdqu (%rax), %xmm0 # %xmm0 = Y0
1693 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1696 mov arg9, %r10 # %r10 = authTag
1697 mov arg10, %r11 # %r11 = auth_tag_len
1703 MOVQ_R64_XMM %xmm0, %rax
1705 jmp _return_T_done_encrypt
1707 MOVQ_R64_XMM %xmm0, %rax
1712 jmp _return_T_done_encrypt
1714 movdqu %xmm0, (%r10)
1715 _return_T_done_encrypt:
1721 ENDPROC(aesni_gcm_enc)
1728 _key_expansion_256a:
1729 pshufd $0b11111111, %xmm1, %xmm1
1730 shufps $0b00010000, %xmm0, %xmm4
1732 shufps $0b10001100, %xmm0, %xmm4
1735 movaps %xmm0, (TKEYP)
1738 ENDPROC(_key_expansion_128)
1739 ENDPROC(_key_expansion_256a)
1742 _key_expansion_192a:
1743 pshufd $0b01010101, %xmm1, %xmm1
1744 shufps $0b00010000, %xmm0, %xmm4
1746 shufps $0b10001100, %xmm0, %xmm4
1753 pshufd $0b11111111, %xmm0, %xmm3
1758 shufps $0b01000100, %xmm0, %xmm6
1759 movaps %xmm6, (TKEYP)
1760 shufps $0b01001110, %xmm2, %xmm1
1761 movaps %xmm1, 0x10(TKEYP)
1764 ENDPROC(_key_expansion_192a)
1767 _key_expansion_192b:
1768 pshufd $0b01010101, %xmm1, %xmm1
1769 shufps $0b00010000, %xmm0, %xmm4
1771 shufps $0b10001100, %xmm0, %xmm4
1777 pshufd $0b11111111, %xmm0, %xmm3
1781 movaps %xmm0, (TKEYP)
1784 ENDPROC(_key_expansion_192b)
1787 _key_expansion_256b:
1788 pshufd $0b10101010, %xmm1, %xmm1
1789 shufps $0b00010000, %xmm2, %xmm4
1791 shufps $0b10001100, %xmm2, %xmm4
1794 movaps %xmm2, (TKEYP)
1797 ENDPROC(_key_expansion_256b)
1800 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1801 * unsigned int key_len)
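*
* The 480(KEYP) accesses below line up with the crypto_aes_ctx layout: two
* 240-byte expanded key schedules followed by the key length (a sketch
* matching include/crypto/aes.h; shown for reference only):
*
*	struct crypto_aes_ctx {
*		u32 key_enc[60];	// 15 round keys * 16 bytes = 240
*		u32 key_dec[60];	// 240 bytes, so key_length is at offset 480
*		u32 key_length;		// 16, 24 or 32
*	};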
1803 ENTRY(aesni_set_key)
1807 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1808 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1809 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1811 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1812 movaps %xmm0, (KEYP)
1813 lea 0x10(KEYP), TKEYP # key addr
1814 movl %edx, 480(KEYP)
1815 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1819 movups 0x10(UKEYP), %xmm2 # other user key
1820 movaps %xmm2, (TKEYP)
1822 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1823 call _key_expansion_256a
1824 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1825 call _key_expansion_256b
1826 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1827 call _key_expansion_256a
1828 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1829 call _key_expansion_256b
1830 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1831 call _key_expansion_256a
1832 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1833 call _key_expansion_256b
1834 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1835 call _key_expansion_256a
1836 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1837 call _key_expansion_256b
1838 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1839 call _key_expansion_256a
1840 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1841 call _key_expansion_256b
1842 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1843 call _key_expansion_256a
1844 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1845 call _key_expansion_256b
1846 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1847 call _key_expansion_256a
1850 movq 0x10(UKEYP), %xmm2 # other user key
1851 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1852 call _key_expansion_192a
1853 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1854 call _key_expansion_192b
1855 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1856 call _key_expansion_192a
1857 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1858 call _key_expansion_192b
1859 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1860 call _key_expansion_192a
1861 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1862 call _key_expansion_192b
1863 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1864 call _key_expansion_192a
1865 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1866 call _key_expansion_192b
1869 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1870 call _key_expansion_128
1871 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1872 call _key_expansion_128
1873 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1874 call _key_expansion_128
1875 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1876 call _key_expansion_128
1877 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1878 call _key_expansion_128
1879 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1880 call _key_expansion_128
1881 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1882 call _key_expansion_128
1883 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1884 call _key_expansion_128
1885 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1886 call _key_expansion_128
1887 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1888 call _key_expansion_128
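# The immediates passed to AESKEYGENASSIST in the expansion paths above are
# the AES round constants rcon[i] = x^(i-1) in GF(2^8) (sketch):
#
#	static const u8 rcon[10] = {
#		0x01, 0x02, 0x04, 0x08, 0x10,
#		0x20, 0x40, 0x80, 0x1b, 0x36,
#	};
#
# AES-128 uses all ten, AES-192 the first eight, and AES-256 the first seven
# (its alternate steps reuse the same immediates but take only the SubWord part).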
1891 movaps (KEYP), %xmm0
1892 movaps (TKEYP), %xmm1
1893 movaps %xmm0, 240(TKEYP)
1894 movaps %xmm1, 240(KEYP)
1896 lea 240-16(TKEYP), UKEYP
1899 movaps (KEYP), %xmm0
1901 movaps %xmm1, (UKEYP)
1912 ENDPROC(aesni_set_key)
1915 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1922 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1923 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1924 movl (FRAME_OFFSET+20)(%esp), INP # src
1926 movl 480(KEYP), KLEN # key length
1927 movups (INP), STATE # input
1929 movups STATE, (OUTP) # output
1939 * _aesni_enc1: internal ABI
1941 * KEYP: key struct pointer
1943 * STATE: initial state (input)
1945 * STATE: final state (output)
1952 movaps (KEYP), KEY # key
1954 pxor KEY, STATE # round 0
1958 lea 0x20(TKEYP), TKEYP
1961 movaps -0x60(TKEYP), KEY
1963 movaps -0x50(TKEYP), KEY
1967 movaps -0x40(TKEYP), KEY
1969 movaps -0x30(TKEYP), KEY
1973 movaps -0x20(TKEYP), KEY
1975 movaps -0x10(TKEYP), KEY
1979 movaps 0x10(TKEYP), KEY
1981 movaps 0x20(TKEYP), KEY
1983 movaps 0x30(TKEYP), KEY
1985 movaps 0x40(TKEYP), KEY
1987 movaps 0x50(TKEYP), KEY
1989 movaps 0x60(TKEYP), KEY
1991 movaps 0x70(TKEYP), KEY
1992 AESENCLAST KEY STATE
1994 ENDPROC(_aesni_enc1)
1997 * _aesni_enc4: internal ABI
1999 * KEYP: key struct pointer
2001 * STATE1: initial state (input)
2006 * STATE1: final state (output)
2016 movaps (KEYP), KEY # key
2018 pxor KEY, STATE1 # round 0
2025 lea 0x20(TKEYP), TKEYP
2028 movaps -0x60(TKEYP), KEY
2033 movaps -0x50(TKEYP), KEY
2040 movaps -0x40(TKEYP), KEY
2045 movaps -0x30(TKEYP), KEY
2052 movaps -0x20(TKEYP), KEY
2057 movaps -0x10(TKEYP), KEY
2067 movaps 0x10(TKEYP), KEY
2072 movaps 0x20(TKEYP), KEY
2077 movaps 0x30(TKEYP), KEY
2082 movaps 0x40(TKEYP), KEY
2087 movaps 0x50(TKEYP), KEY
2092 movaps 0x60(TKEYP), KEY
2097 movaps 0x70(TKEYP), KEY
2098 AESENCLAST KEY STATE1 # last round
2099 AESENCLAST KEY STATE2
2100 AESENCLAST KEY STATE3
2101 AESENCLAST KEY STATE4
2103 ENDPROC(_aesni_enc4)
2106 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2113 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2114 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2115 movl (FRAME_OFFSET+20)(%esp), INP # src
2117 mov 480(KEYP), KLEN # key length
2119 movups (INP), STATE # input
2121 movups STATE, (OUTP) #output
2131 * _aesni_dec1: internal ABI
2133 * KEYP: key struct pointer
2135 * STATE: initial state (input)
2137 * STATE: final state (output)
2144 movaps (KEYP), KEY # key
2146 pxor KEY, STATE # round 0
2150 lea 0x20(TKEYP), TKEYP
2153 movaps -0x60(TKEYP), KEY
2155 movaps -0x50(TKEYP), KEY
2159 movaps -0x40(TKEYP), KEY
2161 movaps -0x30(TKEYP), KEY
2165 movaps -0x20(TKEYP), KEY
2167 movaps -0x10(TKEYP), KEY
2171 movaps 0x10(TKEYP), KEY
2173 movaps 0x20(TKEYP), KEY
2175 movaps 0x30(TKEYP), KEY
2177 movaps 0x40(TKEYP), KEY
2179 movaps 0x50(TKEYP), KEY
2181 movaps 0x60(TKEYP), KEY
2183 movaps 0x70(TKEYP), KEY
2184 AESDECLAST KEY STATE
2186 ENDPROC(_aesni_dec1)
2189 * _aesni_dec4: internal ABI
2191 * KEYP: key struct pointer
2193 * STATE1: initial state (input)
2198 * STATE1: final state (output)
2208 movaps (KEYP), KEY # key
2210 pxor KEY, STATE1 # round 0
2217 lea 0x20(TKEYP), TKEYP
2220 movaps -0x60(TKEYP), KEY
2225 movaps -0x50(TKEYP), KEY
2232 movaps -0x40(TKEYP), KEY
2237 movaps -0x30(TKEYP), KEY
2244 movaps -0x20(TKEYP), KEY
2249 movaps -0x10(TKEYP), KEY
2259 movaps 0x10(TKEYP), KEY
2264 movaps 0x20(TKEYP), KEY
2269 movaps 0x30(TKEYP), KEY
2274 movaps 0x40(TKEYP), KEY
2279 movaps 0x50(TKEYP), KEY
2284 movaps 0x60(TKEYP), KEY
2289 movaps 0x70(TKEYP), KEY
2290 AESDECLAST KEY STATE1 # last round
2291 AESDECLAST KEY STATE2
2292 AESDECLAST KEY STATE3
2293 AESDECLAST KEY STATE4
2295 ENDPROC(_aesni_dec4)
2298 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2301 ENTRY(aesni_ecb_enc)
2307 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2308 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2309 movl (FRAME_OFFSET+24)(%esp), INP # src
2310 movl (FRAME_OFFSET+28)(%esp), LEN # len
2312 test LEN, LEN # check length
2321 movups (INP), STATE1
2322 movups 0x10(INP), STATE2
2323 movups 0x20(INP), STATE3
2324 movups 0x30(INP), STATE4
2326 movups STATE1, (OUTP)
2327 movups STATE2, 0x10(OUTP)
2328 movups STATE3, 0x20(OUTP)
2329 movups STATE4, 0x30(OUTP)
2339 movups (INP), STATE1
2341 movups STATE1, (OUTP)
2355 ENDPROC(aesni_ecb_enc)
2358 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2361 ENTRY(aesni_ecb_dec)
2367 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2368 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2369 movl (FRAME_OFFSET+24)(%esp), INP # src
2370 movl (FRAME_OFFSET+28)(%esp), LEN # len
2382 movups (INP), STATE1
2383 movups 0x10(INP), STATE2
2384 movups 0x20(INP), STATE3
2385 movups 0x30(INP), STATE4
2387 movups STATE1, (OUTP)
2388 movups STATE2, 0x10(OUTP)
2389 movups STATE3, 0x20(OUTP)
2390 movups STATE4, 0x30(OUTP)
2400 movups (INP), STATE1
2402 movups STATE1, (OUTP)
2416 ENDPROC(aesni_ecb_dec)
2419 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2420 * size_t len, u8 *iv)
2422 ENTRY(aesni_cbc_enc)
2429 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2430 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2431 movl (FRAME_OFFSET+28)(%esp), INP # src
2432 movl (FRAME_OFFSET+32)(%esp), LEN # len
2433 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2438 movups (IVP), STATE # load iv as initial state
2441 movups (INP), IN # load input
2444 movups STATE, (OUTP) # store output
2460 ENDPROC(aesni_cbc_enc)
2463 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2464 * size_t len, u8 *iv)
2466 ENTRY(aesni_cbc_dec)
2473 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2474 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2475 movl (FRAME_OFFSET+28)(%esp), INP # src
2476 movl (FRAME_OFFSET+32)(%esp), LEN # len
2477 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2480 jb .Lcbc_dec_just_ret
2490 movups 0x10(INP), IN2
2493 movups 0x20(INP), IN3
2495 movups 0x30(INP), IN4
2498 movups 0x20(INP), IN1
2500 movups 0x30(INP), IN2
2515 movups 0x10(INP), IN2
2518 movups STATE1, (OUTP)
2519 movups STATE2, 0x10(OUTP)
2520 movups STATE3, 0x20(OUTP)
2521 movups STATE4, 0x30(OUTP)
2535 movups STATE, (OUTP)
2553 ENDPROC(aesni_cbc_dec)
2556 .pushsection .rodata
2559 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2563 * _aesni_inc_init: internal ABI
2564 * set up registers used by _aesni_inc
2568 * CTR: == IV, in little endian
2569 * TCTR_LOW: == lower qword of CTR
2570 * INC: == 1, in little endian
2571 * BSWAP_MASK == endian swapping mask
2575 movaps .Lbswap_mask, BSWAP_MASK
2577 PSHUFB_XMM BSWAP_MASK CTR
2579 MOVQ_R64_XMM TCTR_LOW INC
2580 MOVQ_R64_XMM CTR TCTR_LOW
2582 ENDPROC(_aesni_inc_init)
2585 * _aesni_inc: internal ABI
2586 * Increase IV by 1, IV is in big endian
2589 * CTR: == IV, in little endian
2590 * TCTR_LOW: == lower qword of CTR
2591 * INC: == 1, in little endian
2592 * BSWAP_MASK == endian swapping mask
2596 * CTR: == output IV, in little endian
2597 * TCTR_LOW: == lower qword of CTR
2609 PSHUFB_XMM BSWAP_MASK IV
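/*
 * Net effect of _aesni_inc on the IV, as plain C (illustrative; the code keeps
 * a little-endian copy in CTR and only byte-swaps when producing IV):
 *
 *	static void be128_inc(u8 iv[16])
 *	{
 *		for (int i = 15; i >= 0; i--)	// 128-bit big-endian counter
 *			if (++iv[i])		// stop once a byte does not wrap
 *				break;
 *	}
 */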
2614 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2615 * size_t len, u8 *iv)
2617 ENTRY(aesni_ctr_enc)
2620 jb .Lctr_enc_just_ret
2623 call _aesni_inc_init
2633 movups 0x10(INP), IN2
2636 movups 0x20(INP), IN3
2639 movups 0x30(INP), IN4
2642 movups STATE1, (OUTP)
2644 movups STATE2, 0x10(OUTP)
2646 movups STATE3, 0x20(OUTP)
2648 movups STATE4, 0x30(OUTP)
2663 movups STATE, (OUTP)
2674 ENDPROC(aesni_ctr_enc)
2677 * _aesni_gf128mul_x_ble: internal ABI
2678 * Multiply in GF(2^128) for XTS IVs
2681 * GF128MUL_MASK == mask with 0x87 and 0x01
2685 * CTR: == temporary value
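*
* In C terms, this is the standard XTS multiply-by-x on a tweak stored as 16
* little-endian bytes (a sketch; the helper name mirrors the generic kernel
* helper but the code here is illustrative):
*
*	static void gf128mul_x_ble(u8 t[16])
*	{
*		u8 carry = t[15] >> 7;			// old bit 127
*		for (int i = 15; i > 0; i--)
*			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
*		t[0] <<= 1;
*		if (carry)
*			t[0] ^= 0x87;			// x^128 = x^7 + x^2 + x + 1
*	}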
2687 #define _aesni_gf128mul_x_ble() \
2688 pshufd $0x13, IV, CTR; \
2691 pand GF128MUL_MASK, CTR; \
2695 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2698 ENTRY(aesni_xts_crypt8)
2703 leaq _aesni_enc4, %r11
2704 leaq _aesni_dec4, %rax
2708 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2715 movdqu 0x00(INP), INC
2717 movdqu IV, 0x00(OUTP)
2719 _aesni_gf128mul_x_ble()
2721 movdqu 0x10(INP), INC
2723 movdqu IV, 0x10(OUTP)
2725 _aesni_gf128mul_x_ble()
2727 movdqu 0x20(INP), INC
2729 movdqu IV, 0x20(OUTP)
2731 _aesni_gf128mul_x_ble()
2733 movdqu 0x30(INP), INC
2735 movdqu IV, 0x30(OUTP)
2739 movdqu 0x00(OUTP), INC
2741 movdqu STATE1, 0x00(OUTP)
2743 _aesni_gf128mul_x_ble()
2745 movdqu 0x40(INP), INC
2747 movdqu IV, 0x40(OUTP)
2749 movdqu 0x10(OUTP), INC
2751 movdqu STATE2, 0x10(OUTP)
2753 _aesni_gf128mul_x_ble()
2755 movdqu 0x50(INP), INC
2757 movdqu IV, 0x50(OUTP)
2759 movdqu 0x20(OUTP), INC
2761 movdqu STATE3, 0x20(OUTP)
2763 _aesni_gf128mul_x_ble()
2765 movdqu 0x60(INP), INC
2767 movdqu IV, 0x60(OUTP)
2769 movdqu 0x30(OUTP), INC
2771 movdqu STATE4, 0x30(OUTP)
2773 _aesni_gf128mul_x_ble()
2775 movdqu 0x70(INP), INC
2777 movdqu IV, 0x70(OUTP)
2779 _aesni_gf128mul_x_ble()
2784 movdqu 0x40(OUTP), INC
2786 movdqu STATE1, 0x40(OUTP)
2788 movdqu 0x50(OUTP), INC
2790 movdqu STATE2, 0x50(OUTP)
2792 movdqu 0x60(OUTP), INC
2794 movdqu STATE3, 0x60(OUTP)
2796 movdqu 0x70(OUTP), INC
2798 movdqu STATE4, 0x70(OUTP)
2802 ENDPROC(aesni_xts_crypt8)