2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
36 * The following macros are used to move an (un)aligned 16 byte value to/from
37 * an XMM register. This can be done for either FP or integer values; for FP use
38 * movaps (move aligned packed single) or integer use movdqa (move double quad
39 * aligned). It doesn't make a performance difference which instruction is used
40 * since Nehalem (original Core i7) was released. However, the movaps is a byte
41 * shorter, so that is the one we'll use for now. (same for unaligned).
50 .Lgf128mul_x_ble_mask:
51 .octa 0x00000000000000010000000000000087
52 POLY: .octa 0xC2000000000000000000000000000001
53 TWOONE: .octa 0x00000001000000000000000000000001
55 # order of these constants should not change.
56 # more specifically, ALL_F should follow SHIFT_MASK,
57 # and ZERO should follow ALL_F
59 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
60 MASK1: .octa 0x0000000000000000ffffffffffffffff
61 MASK2: .octa 0xffffffffffffffff0000000000000000
62 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
63 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
64 ZERO: .octa 0x00000000000000000000000000000000
65 ONE: .octa 0x00000000000000000000000000000001
66 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
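/*
 * Illustrative note, not part of the original listing: PSHUFB with the
 * SHUF_MASK constant above byte-reverses a 16-byte block, i.e. it is the
 * SSSE3 equivalent of this plain-C helper:
 *
 *	#include <stdint.h>
 *
 *	static void byteswap16(uint8_t b[16])
 *	{
 *		int i;
 *
 *		for (i = 0; i < 8; i++) {
 *			uint8_t t = b[i];
 *
 *			b[i] = b[15 - i];
 *			b[15 - i] = t;
 *		}
 *	}
 */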
74 #define STACK_OFFSET 8*3
75 #define HashKey 16*0 // store HashKey <<1 mod poly here
76 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
77 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
78 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
79 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
80 // bits of HashKey <<1 mod poly here
81 //(for Karatsuba purposes)
82 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
83 // bits of HashKey^2 <<1 mod poly here
84 // (for Karatsuba purposes)
85 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
86 // bits of HashKey^3 <<1 mod poly here
87 // (for Karatsuba purposes)
88 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
89 // bits of HashKey^4 <<1 mod poly here
90 // (for Karatsuba purposes)
91 #define VARIABLE_OFFSET 16*8
99 #define arg7 STACK_OFFSET+8(%r14)
100 #define arg8 STACK_OFFSET+16(%r14)
101 #define arg9 STACK_OFFSET+24(%r14)
102 #define arg10 STACK_OFFSET+32(%r14)
103 #define keysize 2*15*16(%arg1)
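/*
 * Illustrative note, not part of the original listing: keysize above is the
 * key_length member of struct crypto_aes_ctx (offset 480 = 2*15*16), in
 * bytes.  The "shr $2 / add $5" sequences used throughout derive the number
 * of AESENC rounds (excluding the final AESENCLAST) from it, roughly:
 *
 *	static int aesenc_rounds(unsigned int key_length)
 *	{
 *		return (key_length >> 2) + 5;	// 16 -> 9, 24 -> 11, 32 -> 13
 *	}
 */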
120 #define BSWAP_MASK %xmm10
124 #define GF128MUL_MASK %xmm10
154 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
157 * Input: A and B (128-bits each, bit-reflected)
158 * Output: C = A*B*x mod poly, (i.e. >>1 )
159 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
160 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
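/*
 * Illustrative reference sketch, not part of the original listing: the macro
 * below uses PCLMULQDQ plus Karatsuba on bit-reflected operands, but the
 * value it computes is the GF(2^128) product defined for GHASH (NIST SP
 * 800-38D, Algorithm 1).  A bit-serial C version, for comparison only:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	// z = x * y; bytes in network order, bit 0 = MSB of byte 0
 *	static void gf128_mul(uint8_t z[16], const uint8_t x[16],
 *			      const uint8_t y[16])
 *	{
 *		uint8_t v[16];
 *		int i, j, carry;
 *
 *		memcpy(v, y, 16);
 *		memset(z, 0, 16);
 *		for (i = 0; i < 128; i++) {
 *			if (x[i / 8] & (0x80 >> (i % 8)))
 *				for (j = 0; j < 16; j++)
 *					z[j] ^= v[j];
 *			// v = v * x: shift the bit string right by one and
 *			// reduce with R = 0xE1 || 0^120 on carry out
 *			carry = v[15] & 1;
 *			for (j = 15; j > 0; j--)
 *				v[j] = (v[j] >> 1) | (v[j - 1] << 7);
 *			v[0] >>= 1;
 *			if (carry)
 *				v[0] ^= 0xE1;
 *		}
 *	}
 */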
163 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
165 pshufd $78, \GH, \TMP2
166 pshufd $78, \HK, \TMP3
167 pxor \GH, \TMP2 # TMP2 = a1+a0
168 pxor \HK, \TMP3 # TMP3 = b1+b0
169 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
170 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
171 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
173 pxor \TMP1, \TMP2 # TMP2 = a0*b1 + a1*b0 (Karatsuba middle term)
175 pslldq $8, \TMP3 # left shift TMP3 2 DWs
176 psrldq $8, \TMP2 # right shift TMP2 2 DWs
178 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
180 # first phase of the reduction
184 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
185 # in order to perform three shifts independently
187 pslld $31, \TMP2 # packed left shift <<31
188 pslld $30, \TMP3 # packed left shift <<30
189 pslld $25, \TMP4 # packed left shift <<25
190 pxor \TMP3, \TMP2 # xor the shifted versions
193 psrldq $4, \TMP5 # right shift TMP5 1 DW
194 pslldq $12, \TMP2 # left shift TMP2 3 DWs
197 # second phase of the reduction
199 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
200 # in order to perform three shifts independently
204 psrld $1,\TMP2 # packed right shift >>1
205 psrld $2,\TMP3 # packed right shift >>2
206 psrld $7,\TMP4 # packed right shift >>7
207 pxor \TMP3,\TMP2 # xor the shifted versions
211 pxor \TMP1, \GH # result is in GH
215 * if a = number of total plaintext bytes and b = floor(a/16), then
217 * num_initial_blocks = b mod 4;
218 * encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
220 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers are clobbered
222 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
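/*
 * Illustrative sketch, not part of the original listing: how the callers
 * below (_initial_num_blocks_is_*) arrive at num_initial_blocks, assuming
 * plaintext_len is the byte count passed in %arg4:
 *
 *	size_t blocks = plaintext_len / 16;
 *	int num_initial_blocks = blocks % 4;	// 0..3 blocks handled here;
 *						// the rest is done 4 at a time
 */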
226 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
227 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
228 MOVADQ SHUF_MASK(%rip), %xmm14
229 mov arg7, %r10 # %r10 = AAD
230 mov arg8, %r12 # %r12 = aadLen
234 _get_AAD_loop\num_initial_blocks\operation:
241 jne _get_AAD_loop\num_initial_blocks\operation
244 je _get_AAD_loop2_done\num_initial_blocks\operation
247 _get_AAD_loop2\num_initial_blocks\operation:
251 jne _get_AAD_loop2\num_initial_blocks\operation
253 _get_AAD_loop2_done\num_initial_blocks\operation:
254 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
256 xor %r11, %r11 # initialise the data pointer offset as zero
258 # start AES for num_initial_blocks blocks
260 mov %arg5, %rax # %rax = *Y0
261 movdqu (%rax), \XMM0 # XMM0 = Y0
262 PSHUFB_XMM %xmm14, \XMM0
264 .if (\i == 5) || (\i == 6) || (\i == 7)
265 MOVADQ ONE(%RIP),\TMP1
268 paddd \TMP1, \XMM0 # INCR Y0
269 movdqa \XMM0, %xmm\index
270 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
271 pxor \TMP2, %xmm\index
275 shr $2,%eax # 128->4, 192->6, 256->8
276 add $5,%eax # 128->9, 192->11, 256->13
278 aes_loop_initial_dec\num_initial_blocks:
281 AESENC \TMP1, %xmm\index
285 jnz aes_loop_initial_dec\num_initial_blocks
289 AESENCLAST \TMP1, %xmm\index # Last Round
292 movdqu (%arg3 , %r11, 1), \TMP1
293 pxor \TMP1, %xmm\index
294 movdqu %xmm\index, (%arg2 , %r11, 1)
295 # write back plaintext/ciphertext for num_initial_blocks
298 movdqa \TMP1, %xmm\index
299 PSHUFB_XMM %xmm14, %xmm\index
300 # prepare plaintext/ciphertext for GHASH computation
303 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
304 # apply GHASH on num_initial_blocks blocks
308 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
310 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
312 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
320 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323 jl _initial_blocks_done\num_initial_blocks\operation
324 # no need for precomputed values
327 * Precomputations for HashKey parallel with encryption of first 4 blocks.
328 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
330 MOVADQ ONE(%rip), \TMP1
331 paddd \TMP1, \XMM0 # INCR Y0
333 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
335 paddd \TMP1, \XMM0 # INCR Y0
337 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
339 paddd \TMP1, \XMM0 # INCR Y0
341 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
343 paddd \TMP1, \XMM0 # INCR Y0
345 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
347 MOVADQ 0(%arg1),\TMP1
353 pshufd $78, \TMP3, \TMP1
355 movdqa \TMP1, HashKey_k(%rsp)
356 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
357 # TMP5 = HashKey^2<<1 (mod poly)
358 movdqa \TMP5, HashKey_2(%rsp)
359 # HashKey_2 = HashKey^2<<1 (mod poly)
360 pshufd $78, \TMP5, \TMP1
362 movdqa \TMP1, HashKey_2_k(%rsp)
363 .irpc index, 1234 # do 4 rounds
364 movaps 0x10*\index(%arg1), \TMP1
370 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
371 # TMP5 = HashKey^3<<1 (mod poly)
372 movdqa \TMP5, HashKey_3(%rsp)
373 pshufd $78, \TMP5, \TMP1
375 movdqa \TMP1, HashKey_3_k(%rsp)
376 .irpc index, 56789 # do next 5 rounds
377 movaps 0x10*\index(%arg1), \TMP1
383 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
384 # TMP5 = HashKey^4<<1 (mod poly)
385 movdqa \TMP5, HashKey_4(%rsp)
386 pshufd $78, \TMP5, \TMP1
388 movdqa \TMP1, HashKey_4_k(%rsp)
391 shr $2,%eax # 128->4, 192->6, 256->8
392 sub $4,%eax # 128->0, 192->2, 256->4
393 jz aes_loop_pre_dec_done\num_initial_blocks
395 aes_loop_pre_dec\num_initial_blocks:
398 AESENC \TMP2, %xmm\index
402 jnz aes_loop_pre_dec\num_initial_blocks
404 aes_loop_pre_dec_done\num_initial_blocks:
406 AESENCLAST \TMP2, \XMM1
407 AESENCLAST \TMP2, \XMM2
408 AESENCLAST \TMP2, \XMM3
409 AESENCLAST \TMP2, \XMM4
410 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
412 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
414 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
416 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
418 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
420 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
422 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
424 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
427 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
429 # combine GHASHed value with the corresponding ciphertext
430 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
431 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
432 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
434 _initial_blocks_done\num_initial_blocks\operation:
440 * if a = number of total plaintext bytes and b = floor(a/16), then
442 * num_initial_blocks = b mod 4;
443 * encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
445 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers are clobbered
447 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
451 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
452 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
453 MOVADQ SHUF_MASK(%rip), %xmm14
454 mov arg7, %r10 # %r10 = AAD
455 mov arg8, %r12 # %r12 = aadLen
458 _get_AAD_loop\num_initial_blocks\operation:
465 jne _get_AAD_loop\num_initial_blocks\operation
467 je _get_AAD_loop2_done\num_initial_blocks\operation
469 _get_AAD_loop2\num_initial_blocks\operation:
473 jne _get_AAD_loop2\num_initial_blocks\operation
474 _get_AAD_loop2_done\num_initial_blocks\operation:
475 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
477 xor %r11, %r11 # initialise the data pointer offset as zero
479 # start AES for num_initial_blocks blocks
481 mov %arg5, %rax # %rax = *Y0
482 movdqu (%rax), \XMM0 # XMM0 = Y0
483 PSHUFB_XMM %xmm14, \XMM0
485 .if (\i == 5) || (\i == 6) || (\i == 7)
487 MOVADQ ONE(%RIP),\TMP1
488 MOVADQ 0(%arg1),\TMP2
490 paddd \TMP1, \XMM0 # INCR Y0
491 MOVADQ \XMM0, %xmm\index
492 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
493 pxor \TMP2, %xmm\index
497 shr $2,%eax # 128->4, 192->6, 256->8
498 add $5,%eax # 128->9, 192->11, 256->13
500 aes_loop_initial_enc\num_initial_blocks:
503 AESENC \TMP1, %xmm\index
507 jnz aes_loop_initial_enc\num_initial_blocks
511 AESENCLAST \TMP1, %xmm\index # Last Round
514 movdqu (%arg3 , %r11, 1), \TMP1
515 pxor \TMP1, %xmm\index
516 movdqu %xmm\index, (%arg2 , %r11, 1)
517 # write back plaintext/ciphertext for num_initial_blocks
519 PSHUFB_XMM %xmm14, %xmm\index
521 # prepare plaintext/ciphertext for GHASH computation
524 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
525 # apply GHASH on num_initial_blocks blocks
529 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
531 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
533 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
536 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
538 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
541 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
544 jl _initial_blocks_done\num_initial_blocks\operation
545 # no need for precomputed values
548 * Precomputations for HashKey parallel with encryption of first 4 blocks.
549 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
551 MOVADQ ONE(%RIP),\TMP1
552 paddd \TMP1, \XMM0 # INCR Y0
554 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
556 paddd \TMP1, \XMM0 # INCR Y0
558 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
560 paddd \TMP1, \XMM0 # INCR Y0
562 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
564 paddd \TMP1, \XMM0 # INCR Y0
566 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
568 MOVADQ 0(%arg1),\TMP1
574 pshufd $78, \TMP3, \TMP1
576 movdqa \TMP1, HashKey_k(%rsp)
577 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
578 # TMP5 = HashKey^2<<1 (mod poly)
579 movdqa \TMP5, HashKey_2(%rsp)
580 # HashKey_2 = HashKey^2<<1 (mod poly)
581 pshufd $78, \TMP5, \TMP1
583 movdqa \TMP1, HashKey_2_k(%rsp)
584 .irpc index, 1234 # do 4 rounds
585 movaps 0x10*\index(%arg1), \TMP1
591 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
592 # TMP5 = HashKey^3<<1 (mod poly)
593 movdqa \TMP5, HashKey_3(%rsp)
594 pshufd $78, \TMP5, \TMP1
596 movdqa \TMP1, HashKey_3_k(%rsp)
597 .irpc index, 56789 # do next 5 rounds
598 movaps 0x10*\index(%arg1), \TMP1
604 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
605 # TMP5 = HashKey^4<<1 (mod poly)
606 movdqa \TMP5, HashKey_4(%rsp)
607 pshufd $78, \TMP5, \TMP1
609 movdqa \TMP1, HashKey_4_k(%rsp)
612 shr $2,%eax # 128->4, 192->6, 256->8
613 sub $4,%eax # 128->0, 192->2, 256->4
614 jz aes_loop_pre_enc_done\num_initial_blocks
616 aes_loop_pre_enc\num_initial_blocks:
619 AESENC \TMP2, %xmm\index
623 jnz aes_loop_pre_enc\num_initial_blocks
625 aes_loop_pre_enc_done\num_initial_blocks:
627 AESENCLAST \TMP2, \XMM1
628 AESENCLAST \TMP2, \XMM2
629 AESENCLAST \TMP2, \XMM3
630 AESENCLAST \TMP2, \XMM4
631 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
633 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
635 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
637 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
639 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
640 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
641 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
642 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
645 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
647 # combine GHASHed value with the corresponding ciphertext
648 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
649 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
650 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
652 _initial_blocks_done\num_initial_blocks\operation:
657 * encrypt 4 blocks at a time
658 * ghash the 4 previously encrypted ciphertext blocks
659 * arg1, %arg2, %arg3 are used as pointers only, not modified
660 * %r11 is the data offset value
662 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
663 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
670 movdqa SHUF_MASK(%rip), %xmm15
671 # multiply TMP5 * HashKey using Karatsuba
674 pshufd $78, \XMM5, \TMP6
676 paddd ONE(%rip), \XMM0 # INCR CNT
677 movdqa HashKey_4(%rsp), \TMP5
678 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
680 paddd ONE(%rip), \XMM0 # INCR CNT
682 paddd ONE(%rip), \XMM0 # INCR CNT
684 paddd ONE(%rip), \XMM0 # INCR CNT
686 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
687 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
688 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
689 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
690 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
696 movdqa HashKey_4_k(%rsp), \TMP5
697 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
698 movaps 0x10(%arg1), \TMP1
699 AESENC \TMP1, \XMM1 # Round 1
703 movaps 0x20(%arg1), \TMP1
704 AESENC \TMP1, \XMM1 # Round 2
709 pshufd $78, \XMM6, \TMP2
711 movdqa HashKey_3(%rsp), \TMP5
712 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
713 movaps 0x30(%arg1), \TMP3
714 AESENC \TMP3, \XMM1 # Round 3
718 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
719 movaps 0x40(%arg1), \TMP3
720 AESENC \TMP3, \XMM1 # Round 4
724 movdqa HashKey_3_k(%rsp), \TMP5
725 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
726 movaps 0x50(%arg1), \TMP3
727 AESENC \TMP3, \XMM1 # Round 5
732 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
736 pshufd $78, \XMM7, \TMP2
738 movdqa HashKey_2(%rsp ), \TMP5
740 # Multiply TMP5 * HashKey using Karatsuba
742 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
743 movaps 0x60(%arg1), \TMP3
744 AESENC \TMP3, \XMM1 # Round 6
748 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
749 movaps 0x70(%arg1), \TMP3
750 AESENC \TMP3, \XMM1 # Round 7
754 movdqa HashKey_2_k(%rsp), \TMP5
755 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
756 movaps 0x80(%arg1), \TMP3
757 AESENC \TMP3, \XMM1 # Round 8
762 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
766 # Multiply XMM8 * HashKey
767 # XMM8 and TMP5 hold the values for the two operands
770 pshufd $78, \XMM8, \TMP2
772 movdqa HashKey(%rsp), \TMP5
773 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
774 movaps 0x90(%arg1), \TMP3
775 AESENC \TMP3, \XMM1 # Round 9
779 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
782 shr $2,%eax # 128->4, 192->6, 256->8
783 sub $4,%eax # 128->0, 192->2, 256->4
784 jz aes_loop_par_enc_done
789 AESENC \TMP3, %xmm\index
795 aes_loop_par_enc_done:
797 AESENCLAST \TMP3, \XMM1 # last round
798 AESENCLAST \TMP3, \XMM2
799 AESENCLAST \TMP3, \XMM3
800 AESENCLAST \TMP3, \XMM4
801 movdqa HashKey_k(%rsp), \TMP5
802 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
803 movdqu (%arg3,%r11,1), \TMP3
804 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
805 movdqu 16(%arg3,%r11,1), \TMP3
806 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
807 movdqu 32(%arg3,%r11,1), \TMP3
808 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
809 movdqu 48(%arg3,%r11,1), \TMP3
810 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
811 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
812 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
813 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
814 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
815 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
816 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
817 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
818 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
826 pslldq $8, \TMP3 # left shift TMP3 2 DWs
827 psrldq $8, \TMP2 # right shift TMP2 2 DWs
829 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
831 # first phase of reduction
836 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
837 pslld $31, \TMP2 # packed left shift << 31
838 pslld $30, \TMP3 # packed left shift << 30
839 pslld $25, \TMP4 # packed left shift << 25
840 pxor \TMP3, \TMP2 # xor the shifted versions
843 psrldq $4, \TMP5 # right shift T5 1 DW
844 pslldq $12, \TMP2 # left shift T2 3 DWs
847 # second phase of reduction
849 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
852 psrld $1, \TMP2 # packed right shift >>1
853 psrld $2, \TMP3 # packed right shift >>2
854 psrld $7, \TMP4 # packed right shift >>7
855 pxor \TMP3,\TMP2 # xor the shifted versions
859 pxor \TMP1, \XMM5 # result is in XMM5
865 * decrypt 4 blocks at a time
866 * ghash the 4 previously decrypted ciphertext blocks
867 * arg1, %arg2, %arg3 are used as pointers only, not modified
868 * %r11 is the data offset value
870 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
871 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
878 movdqa SHUF_MASK(%rip), %xmm15
879 # multiply TMP5 * HashKey using Karatsuba
882 pshufd $78, \XMM5, \TMP6
884 paddd ONE(%rip), \XMM0 # INCR CNT
885 movdqa HashKey_4(%rsp), \TMP5
886 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
888 paddd ONE(%rip), \XMM0 # INCR CNT
890 paddd ONE(%rip), \XMM0 # INCR CNT
892 paddd ONE(%rip), \XMM0 # INCR CNT
894 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
895 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
896 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
897 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
898 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
904 movdqa HashKey_4_k(%rsp), \TMP5
905 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
906 movaps 0x10(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 1
911 movaps 0x20(%arg1), \TMP1
912 AESENC \TMP1, \XMM1 # Round 2
917 pshufd $78, \XMM6, \TMP2
919 movdqa HashKey_3(%rsp), \TMP5
920 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
921 movaps 0x30(%arg1), \TMP3
922 AESENC \TMP3, \XMM1 # Round 3
926 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
927 movaps 0x40(%arg1), \TMP3
928 AESENC \TMP3, \XMM1 # Round 4
932 movdqa HashKey_3_k(%rsp), \TMP5
933 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
934 movaps 0x50(%arg1), \TMP3
935 AESENC \TMP3, \XMM1 # Round 5
940 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
944 pshufd $78, \XMM7, \TMP2
946 movdqa HashKey_2(%rsp ), \TMP5
948 # Multiply TMP5 * HashKey using Karatsuba
950 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
951 movaps 0x60(%arg1), \TMP3
952 AESENC \TMP3, \XMM1 # Round 6
956 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
957 movaps 0x70(%arg1), \TMP3
958 AESENC \TMP3, \XMM1 # Round 7
962 movdqa HashKey_2_k(%rsp), \TMP5
963 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
964 movaps 0x80(%arg1), \TMP3
965 AESENC \TMP3, \XMM1 # Round 8
970 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
974 # Multiply XMM8 * HashKey
975 # XMM8 and TMP5 hold the values for the two operands
978 pshufd $78, \XMM8, \TMP2
980 movdqa HashKey(%rsp), \TMP5
981 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
982 movaps 0x90(%arg1), \TMP3
983 AESENC \TMP3, \XMM1 # Round 9
987 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
990 shr $2,%eax # 128->4, 192->6, 256->8
991 sub $4,%eax # 128->0, 192->2, 256->4
992 jz aes_loop_par_dec_done
997 AESENC \TMP3, %xmm\index
1001 jnz aes_loop_par_dec
1003 aes_loop_par_dec_done:
1004 MOVADQ (%r10), \TMP3
1005 AESENCLAST \TMP3, \XMM1 # last round
1006 AESENCLAST \TMP3, \XMM2
1007 AESENCLAST \TMP3, \XMM3
1008 AESENCLAST \TMP3, \XMM4
1009 movdqa HashKey_k(%rsp), \TMP5
1010 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1011 movdqu (%arg3,%r11,1), \TMP3
1012 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1013 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
1015 movdqu 16(%arg3,%r11,1), \TMP3
1016 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1017 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
1019 movdqu 32(%arg3,%r11,1), \TMP3
1020 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1021 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1023 movdqu 48(%arg3,%r11,1), \TMP3
1024 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1025 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1027 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1028 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1029 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1030 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1038 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1039 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1041 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1043 # first phase of reduction
1048 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1049 pslld $31, \TMP2 # packed left shift << 31
1050 pslld $30, \TMP3 # packed left shift << 30
1051 pslld $25, \TMP4 # packed left shift << 25
1052 pxor \TMP3, \TMP2 # xor the shifted versions
1055 psrldq $4, \TMP5 # right shift T5 1 DW
1056 pslldq $12, \TMP2 # left shift T2 3 DWs
1059 # second phase of reduction
1061 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1064 psrld $1, \TMP2 # packed right shift >>1
1065 psrld $2, \TMP3 # packed right shift >>2
1066 psrld $7, \TMP4 # packed right shift >>7
1067 pxor \TMP3,\TMP2 # xor the shifted versions
1071 pxor \TMP1, \XMM5 # result is in XMM5
1076 /* GHASH the last 4 ciphertext blocks. */
1077 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1078 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1080 # Multiply TMP6 * HashKey (using Karatsuba)
1083 pshufd $78, \XMM1, \TMP2
1085 movdqa HashKey_4(%rsp), \TMP5
1086 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1087 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1088 movdqa HashKey_4_k(%rsp), \TMP4
1089 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1090 movdqa \XMM1, \XMMDst
1091 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1093 # Multiply TMP1 * HashKey (using Karatsuba)
1096 pshufd $78, \XMM2, \TMP2
1098 movdqa HashKey_3(%rsp), \TMP5
1099 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1100 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1101 movdqa HashKey_3_k(%rsp), \TMP4
1102 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1106 # results accumulated in TMP6, XMMDst, XMM1
1108 # Multiply TMP1 * HashKey (using Karatsuba)
1111 pshufd $78, \XMM3, \TMP2
1113 movdqa HashKey_2(%rsp), \TMP5
1114 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1115 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1116 movdqa HashKey_2_k(%rsp), \TMP4
1117 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1120 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1122 # Multiply TMP1 * HashKey (using Karatsuba)
1124 pshufd $78, \XMM4, \TMP2
1126 movdqa HashKey(%rsp), \TMP5
1127 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1128 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1129 movdqa HashKey_k(%rsp), \TMP4
1130 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1136 # middle section of the temp results combined as in the Karatsuba algorithm
1138 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1139 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1142 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1143 # first phase of the reduction
1144 movdqa \XMMDst, \TMP2
1145 movdqa \XMMDst, \TMP3
1146 movdqa \XMMDst, \TMP4
1147 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1148 pslld $31, \TMP2 # packed left shifting << 31
1149 pslld $30, \TMP3 # packed left shifting << 30
1150 pslld $25, \TMP4 # packed left shifting << 25
1151 pxor \TMP3, \TMP2 # xor the shifted versions
1154 psrldq $4, \TMP7 # right shift TMP7 1 DW
1155 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1158 # second phase of the reduction
1159 movdqa \XMMDst, \TMP2
1160 # make 3 copies of XMMDst for doing 3 shift operations
1161 movdqa \XMMDst, \TMP3
1162 movdqa \XMMDst, \TMP4
1163 psrld $1, \TMP2 # packed right shift >> 1
1164 psrld $2, \TMP3 # packed right shift >> 2
1165 psrld $7, \TMP4 # packed right shift >> 7
1166 pxor \TMP3, \TMP2 # xor the shifted versions
1170 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1174 /* Encryption of a single block
1178 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1182 shr $2,%eax # 128->4, 192->6, 256->8
1183 add $5,%eax # 128->9, 192->11, 256->13
1184 lea 16(%arg1), %r10 # get first expanded key address
1194 AESENCLAST \TMP1,\XMM0
1196 /*****************************************************************************
1197 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1198 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1199 * const u8 *in, // Ciphertext input
1200 * u64 plaintext_len, // Length of data in bytes for decryption.
1201 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1202 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1203 * // concatenated with 0x00000001. 16-byte aligned pointer.
1204 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1205 * const u8 *aad, // Additional Authentication Data (AAD)
1206 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1207 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1208 * // given authentication tag and only return the plaintext if they match.
1209 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1210 * // (most likely), 12 or 8.
1215 * keys are pre-expanded and aligned to 16 bytes. We are using the first
1216 * set of 11 keys in the data structure void *aes_ctx
1220 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1221 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1222 * | Salt (From the SA) |
1223 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1224 * | Initialization Vector |
1225 * | (This is the sequence number from IPSec header) |
1226 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1233 * AAD padded to 128 bits with 0
1234 * for example, assume AAD is a u32 vector
1236 * if AAD is 8 bytes:
1237 * AAD[3] = {A0, A1};
1238 * padded AAD in xmm register = {A1 A0 0 0}
1241 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1242 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1245 * | 32-bit Sequence Number (A0) |
1246 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1248 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1250 * AAD Format with 32-bit Sequence Number
1252 * if AAD is 12 bytes:
1253 * AAD[3] = {A0, A1, A2};
1254 * padded AAD in xmm register = {A2 A1 A0 0}
1257 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1258 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1259 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1260 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1262 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1263 * | 64-bit Extended Sequence Number {A1,A0} |
1265 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1267 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1269 * AAD Format with 64-bit Extended Sequence Number
1272 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1273 * The code also supports 16, but will fail for any other size.
1276 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1277 * For other sizes, the code will fail.
1279 * poly = x^128 + x^127 + x^126 + x^121 + 1
1281 *****************************************************************************/
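/*
 * Illustrative call sketch, not part of the original listing; the buffer
 * names are assumptions for the example.  A caller drives the routine per
 * the prototype above and then checks the tag itself:
 *
 *	u8 iv[16];	// salt || 8-byte IV || 0x00000001, built by the caller
 *	u8 tag[16];
 *
 *	aesni_gcm_dec(aes_ctx, plaintext_out, ciphertext_in, ciphertext_len,
 *		      iv, hash_subkey, aad, aad_len, tag, 16);
 *	// compare 'tag' with the tag received on the wire (constant time)
 *	// and discard the plaintext on mismatch
 */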
1282 ENTRY(aesni_gcm_dec)
1288 * states of %xmm registers %xmm6:%xmm15 not saved
1289 * all %xmm registers are clobbered
1291 sub $VARIABLE_OFFSET, %rsp
1292 and $~63, %rsp # align rsp to 64 bytes
1294 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1295 movdqa SHUF_MASK(%rip), %xmm2
1296 PSHUFB_XMM %xmm2, %xmm13
1299 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1301 movdqa %xmm13, %xmm2
1311 pshufd $0x24, %xmm1, %xmm2
1312 pcmpeqd TWOONE(%rip), %xmm2
1313 pand POLY(%rip), %xmm2
1314 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1317 # Decrypt first few blocks
1319 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1320 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1321 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1324 jz _initial_num_blocks_is_0_decrypt
1326 jb _initial_num_blocks_is_1_decrypt
1327 je _initial_num_blocks_is_2_decrypt
1328 _initial_num_blocks_is_3_decrypt:
1329 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1330 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1332 jmp _initial_blocks_decrypted
1333 _initial_num_blocks_is_2_decrypt:
1334 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1335 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1337 jmp _initial_blocks_decrypted
1338 _initial_num_blocks_is_1_decrypt:
1339 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1340 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1342 jmp _initial_blocks_decrypted
1343 _initial_num_blocks_is_0_decrypt:
1344 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1345 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1346 _initial_blocks_decrypted:
1348 je _zero_cipher_left_decrypt
1350 je _four_cipher_left_decrypt
1352 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1353 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1357 _four_cipher_left_decrypt:
1358 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1359 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1360 _zero_cipher_left_decrypt:
1362 and $15, %r13 # %r13 = arg4 (mod 16)
1363 je _multiple_of_16_bytes_decrypt
1365 # Handle the last <16 byte block separately
1367 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1368 movdqa SHUF_MASK(%rip), %xmm10
1369 PSHUFB_XMM %xmm10, %xmm0
1371 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1374 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1375 lea SHIFT_MASK+16(%rip), %r12
1377 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1378 # (%r13 is the number of bytes in plaintext mod 16)
1379 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1380 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1383 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1384 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1385 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1386 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1388 movdqa SHUF_MASK(%rip), %xmm10
1389 PSHUFB_XMM %xmm10, %xmm2
1392 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1393 # GHASH computation for the last <16 byte block
1398 MOVQ_R64_XMM %xmm0, %rax
1400 jle _less_than_8_bytes_left_decrypt
1401 mov %rax, (%arg2 , %r11, 1)
1404 MOVQ_R64_XMM %xmm0, %rax
1406 _less_than_8_bytes_left_decrypt:
1407 mov %al, (%arg2, %r11, 1)
1411 jne _less_than_8_bytes_left_decrypt
1412 _multiple_of_16_bytes_decrypt:
1413 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1414 shl $3, %r12 # convert into number of bits
1415 movd %r12d, %xmm15 # len(A) in %xmm15
1416 shl $3, %arg4 # len(C) in bits (*8)
1417 MOVQ_R64_XMM %arg4, %xmm1
1418 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1419 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1421 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1422 # final GHASH computation
1423 movdqa SHUF_MASK(%rip), %xmm10
1424 PSHUFB_XMM %xmm10, %xmm8
1426 mov %arg5, %rax # %rax = *Y0
1427 movdqu (%rax), %xmm0 # %xmm0 = Y0
1428 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1431 mov arg9, %r10 # %r10 = authTag
1432 mov arg10, %r11 # %r11 = auth_tag_len
1438 MOVQ_R64_XMM %xmm0, %rax
1440 jmp _return_T_done_decrypt
1442 MOVQ_R64_XMM %xmm0, %rax
1447 jmp _return_T_done_decrypt
1449 movdqu %xmm0, (%r10)
1450 _return_T_done_decrypt:
1456 ENDPROC(aesni_gcm_dec)
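/*
 * Illustrative sketch, not part of the original listing: the final GHASH
 * input assembled above (len(A) || len(C), both in bits) corresponds to the
 * standard GCM length block, e.g.:
 *
 *	#include <stdint.h>
 *
 *	static void gcm_len_block(uint8_t s[16], uint64_t aad_len,
 *				  uint64_t text_len)
 *	{
 *		uint64_t abits = aad_len * 8, cbits = text_len * 8;
 *		int i;
 *
 *		for (i = 0; i < 8; i++) {
 *			s[7 - i] = (uint8_t)(abits >> (8 * i));		// len(A), big endian
 *			s[15 - i] = (uint8_t)(cbits >> (8 * i));	// len(C), big endian
 *		}
 *	}
 */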
1459 /*****************************************************************************
1460 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1461 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1462 * const u8 *in, // Plaintext input
1463 * u64 plaintext_len, // Length of data in bytes for encryption.
1464 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1465 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1466 * // concatenated with 0x00000001. 16-byte aligned pointer.
1467 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1468 * const u8 *aad, // Additional Authentication Data (AAD)
1469 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1470 * u8 *auth_tag, // Authenticated Tag output.
1471 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
1477 * keys are pre-expanded and aligned to 16 bytes. We are using the
1478 * first set of 11 keys in the data structure void *aes_ctx
1483 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1484 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1485 * | Salt (From the SA) |
1486 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1487 * | Initialization Vector |
1488 * | (This is the sequence number from IPSec header) |
1489 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1496 * AAD padded to 128 bits with 0
1497 * for example, assume AAD is a u32 vector
1499 * if AAD is 8 bytes:
1500 * AAD[3] = {A0, A1};
1501 * padded AAD in xmm register = {A1 A0 0 0}
1504 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1505 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1508 * | 32-bit Sequence Number (A0) |
1509 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1511 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513 * AAD Format with 32-bit Sequence Number
1515 * if AAD is 12 bytes:
1516 * AAD[3] = {A0, A1, A2};
1517 * padded AAD in xmm register = {A2 A1 A0 0}
1520 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1521 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1523 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1524 * | 64-bit Extended Sequence Number {A1,A0} |
1526 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1528 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1530 * AAD Format with 64-bit Extended Sequence Number
1533 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1534 * The code also supports 16, but will fail for any other size.
1537 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1538 * For other sizes, the code will fail.
1540 * poly = x^128 + x^127 + x^126 + x^121 + 1
1541 ***************************************************************************/
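/*
 * Illustrative sketch, not part of the original listing: how a caller could
 * build the 16-byte pre-counter block j0 described above from the RFC 4106
 * salt and the per-packet IV (names are assumptions for the example):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static void rfc4106_build_j0(uint8_t j0[16], const uint8_t salt[4],
 *				     const uint8_t iv[8])
 *	{
 *		memcpy(j0, salt, 4);		// 4-byte salt from the SA
 *		memcpy(j0 + 4, iv, 8);		// 8-byte IV from the ESP payload
 *		j0[12] = 0;
 *		j0[13] = 0;
 *		j0[14] = 0;
 *		j0[15] = 1;			// trailing 32-bit counter = 1
 *	}
 */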
1542 ENTRY(aesni_gcm_enc)
1548 # states of %xmm registers %xmm6:%xmm15 not saved
1549 # all %xmm registers are clobbered
1551 sub $VARIABLE_OFFSET, %rsp
1554 movdqu (%r12), %xmm13
1555 movdqa SHUF_MASK(%rip), %xmm2
1556 PSHUFB_XMM %xmm2, %xmm13
1559 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1561 movdqa %xmm13, %xmm2
1571 pshufd $0x24, %xmm1, %xmm2
1572 pcmpeqd TWOONE(%rip), %xmm2
1573 pand POLY(%rip), %xmm2
1575 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1576 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1580 # Encrypt first few blocks
1583 jz _initial_num_blocks_is_0_encrypt
1585 jb _initial_num_blocks_is_1_encrypt
1586 je _initial_num_blocks_is_2_encrypt
1587 _initial_num_blocks_is_3_encrypt:
1588 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1589 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1591 jmp _initial_blocks_encrypted
1592 _initial_num_blocks_is_2_encrypt:
1593 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1594 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1596 jmp _initial_blocks_encrypted
1597 _initial_num_blocks_is_1_encrypt:
1598 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1599 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1601 jmp _initial_blocks_encrypted
1602 _initial_num_blocks_is_0_encrypt:
1603 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1604 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1605 _initial_blocks_encrypted:
1607 # Main loop - Encrypt remaining blocks
1610 je _zero_cipher_left_encrypt
1612 je _four_cipher_left_encrypt
1613 _encrypt_by_4_encrypt:
1614 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1615 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1618 jne _encrypt_by_4_encrypt
1619 _four_cipher_left_encrypt:
1620 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1621 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1622 _zero_cipher_left_encrypt:
1624 and $15, %r13 # %r13 = arg4 (mod 16)
1625 je _multiple_of_16_bytes_encrypt
1627 # Handle the last <16 Byte block separately
1628 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1629 movdqa SHUF_MASK(%rip), %xmm10
1630 PSHUFB_XMM %xmm10, %xmm0
1633 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1636 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1637 lea SHIFT_MASK+16(%rip), %r12
1639 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1640 # (%r13 is the number of bytes in plaintext mod 16)
1641 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1642 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-%r13 bytes
1643 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1644 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1645 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1646 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1647 movdqa SHUF_MASK(%rip), %xmm10
1648 PSHUFB_XMM %xmm10,%xmm0
1651 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1652 # GHASH computation for the last <16 byte block
1656 movdqa SHUF_MASK(%rip), %xmm10
1657 PSHUFB_XMM %xmm10, %xmm0
1659 # shuffle xmm0 back to output as ciphertext
1662 MOVQ_R64_XMM %xmm0, %rax
1664 jle _less_than_8_bytes_left_encrypt
1665 mov %rax, (%arg2 , %r11, 1)
1668 MOVQ_R64_XMM %xmm0, %rax
1670 _less_than_8_bytes_left_encrypt:
1671 mov %al, (%arg2, %r11, 1)
1675 jne _less_than_8_bytes_left_encrypt
1676 _multiple_of_16_bytes_encrypt:
1677 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1679 movd %r12d, %xmm15 # len(A) in %xmm15
1680 shl $3, %arg4 # len(C) in bits (*8)
1681 MOVQ_R64_XMM %arg4, %xmm1
1682 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1683 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1685 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1686 # final GHASH computation
1687 movdqa SHUF_MASK(%rip), %xmm10
1688 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1690 mov %arg5, %rax # %rax = *Y0
1691 movdqu (%rax), %xmm0 # %xmm0 = Y0
1692 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1695 mov arg9, %r10 # %r10 = authTag
1696 mov arg10, %r11 # %r11 = auth_tag_len
1702 MOVQ_R64_XMM %xmm0, %rax
1704 jmp _return_T_done_encrypt
1706 MOVQ_R64_XMM %xmm0, %rax
1711 jmp _return_T_done_encrypt
1713 movdqu %xmm0, (%r10)
1714 _return_T_done_encrypt:
1720 ENDPROC(aesni_gcm_enc)
1727 _key_expansion_256a:
1728 pshufd $0b11111111, %xmm1, %xmm1
1729 shufps $0b00010000, %xmm0, %xmm4
1731 shufps $0b10001100, %xmm0, %xmm4
1734 movaps %xmm0, (TKEYP)
1737 ENDPROC(_key_expansion_128)
1738 ENDPROC(_key_expansion_256a)
1741 _key_expansion_192a:
1742 pshufd $0b01010101, %xmm1, %xmm1
1743 shufps $0b00010000, %xmm0, %xmm4
1745 shufps $0b10001100, %xmm0, %xmm4
1752 pshufd $0b11111111, %xmm0, %xmm3
1757 shufps $0b01000100, %xmm0, %xmm6
1758 movaps %xmm6, (TKEYP)
1759 shufps $0b01001110, %xmm2, %xmm1
1760 movaps %xmm1, 0x10(TKEYP)
1763 ENDPROC(_key_expansion_192a)
1766 _key_expansion_192b:
1767 pshufd $0b01010101, %xmm1, %xmm1
1768 shufps $0b00010000, %xmm0, %xmm4
1770 shufps $0b10001100, %xmm0, %xmm4
1776 pshufd $0b11111111, %xmm0, %xmm3
1780 movaps %xmm0, (TKEYP)
1783 ENDPROC(_key_expansion_192b)
1786 _key_expansion_256b:
1787 pshufd $0b10101010, %xmm1, %xmm1
1788 shufps $0b00010000, %xmm2, %xmm4
1790 shufps $0b10001100, %xmm2, %xmm4
1793 movaps %xmm2, (TKEYP)
1796 ENDPROC(_key_expansion_256b)
1799 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1800 * unsigned int key_len)
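/*
 * Illustrative note, not part of the original listing: the offsets used
 * below (0x10 per round key, key_length at 480 = 2*15*16, the 240-byte
 * decryption key area) assume the usual struct crypto_aes_ctx layout,
 * roughly:
 *
 *	struct crypto_aes_ctx {
 *		u32 key_enc[60];	// expanded encryption round keys
 *		u32 key_dec[60];	// expanded decryption round keys
 *		u32 key_length;		// 16, 24 or 32 (bytes)
 *	};
 */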
1802 ENTRY(aesni_set_key)
1805 movl 8(%esp), KEYP # ctx
1806 movl 12(%esp), UKEYP # in_key
1807 movl 16(%esp), %edx # key_len
1809 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1810 movaps %xmm0, (KEYP)
1811 lea 0x10(KEYP), TKEYP # key addr
1812 movl %edx, 480(KEYP)
1813 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1817 movups 0x10(UKEYP), %xmm2 # other user key
1818 movaps %xmm2, (TKEYP)
1820 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1821 call _key_expansion_256a
1822 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1823 call _key_expansion_256b
1824 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1825 call _key_expansion_256a
1826 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1827 call _key_expansion_256b
1828 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1829 call _key_expansion_256a
1830 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1831 call _key_expansion_256b
1832 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1833 call _key_expansion_256a
1834 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1835 call _key_expansion_256b
1836 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1837 call _key_expansion_256a
1838 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1839 call _key_expansion_256b
1840 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1841 call _key_expansion_256a
1842 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1843 call _key_expansion_256b
1844 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1845 call _key_expansion_256a
1848 movq 0x10(UKEYP), %xmm2 # other user key
1849 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1850 call _key_expansion_192a
1851 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1852 call _key_expansion_192b
1853 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1854 call _key_expansion_192a
1855 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1856 call _key_expansion_192b
1857 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1858 call _key_expansion_192a
1859 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1860 call _key_expansion_192b
1861 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1862 call _key_expansion_192a
1863 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1864 call _key_expansion_192b
1867 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1868 call _key_expansion_128
1869 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1870 call _key_expansion_128
1871 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1872 call _key_expansion_128
1873 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1874 call _key_expansion_128
1875 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1876 call _key_expansion_128
1877 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1878 call _key_expansion_128
1879 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1880 call _key_expansion_128
1881 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1882 call _key_expansion_128
1883 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1884 call _key_expansion_128
1885 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1886 call _key_expansion_128
1889 movaps (KEYP), %xmm0
1890 movaps (TKEYP), %xmm1
1891 movaps %xmm0, 240(TKEYP)
1892 movaps %xmm1, 240(KEYP)
1894 lea 240-16(TKEYP), UKEYP
1897 movaps (KEYP), %xmm0
1899 movaps %xmm1, (UKEYP)
1909 ENDPROC(aesni_set_key)
1912 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1922 movl 480(KEYP), KLEN # key length
1923 movups (INP), STATE # input
1925 movups STATE, (OUTP) # output
1934 * _aesni_enc1: internal ABI
1936 * KEYP: key struct pointer
1938 * STATE: initial state (input)
1940 * STATE: final state (output)
1947 movaps (KEYP), KEY # key
1949 pxor KEY, STATE # round 0
1953 lea 0x20(TKEYP), TKEYP
1956 movaps -0x60(TKEYP), KEY
1958 movaps -0x50(TKEYP), KEY
1962 movaps -0x40(TKEYP), KEY
1964 movaps -0x30(TKEYP), KEY
1968 movaps -0x20(TKEYP), KEY
1970 movaps -0x10(TKEYP), KEY
1974 movaps 0x10(TKEYP), KEY
1976 movaps 0x20(TKEYP), KEY
1978 movaps 0x30(TKEYP), KEY
1980 movaps 0x40(TKEYP), KEY
1982 movaps 0x50(TKEYP), KEY
1984 movaps 0x60(TKEYP), KEY
1986 movaps 0x70(TKEYP), KEY
1987 AESENCLAST KEY STATE
1989 ENDPROC(_aesni_enc1)
1992 * _aesni_enc4: internal ABI
1994 * KEYP: key struct pointer
1996 * STATE1: initial state (input)
2001 * STATE1: final state (output)
2011 movaps (KEYP), KEY # key
2013 pxor KEY, STATE1 # round 0
2020 lea 0x20(TKEYP), TKEYP
2023 movaps -0x60(TKEYP), KEY
2028 movaps -0x50(TKEYP), KEY
2035 movaps -0x40(TKEYP), KEY
2040 movaps -0x30(TKEYP), KEY
2047 movaps -0x20(TKEYP), KEY
2052 movaps -0x10(TKEYP), KEY
2062 movaps 0x10(TKEYP), KEY
2067 movaps 0x20(TKEYP), KEY
2072 movaps 0x30(TKEYP), KEY
2077 movaps 0x40(TKEYP), KEY
2082 movaps 0x50(TKEYP), KEY
2087 movaps 0x60(TKEYP), KEY
2092 movaps 0x70(TKEYP), KEY
2093 AESENCLAST KEY STATE1 # last round
2094 AESENCLAST KEY STATE2
2095 AESENCLAST KEY STATE3
2096 AESENCLAST KEY STATE4
2098 ENDPROC(_aesni_enc4)
2101 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2111 mov 480(KEYP), KLEN # key length
2113 movups (INP), STATE # input
2115 movups STATE, (OUTP) # output
2124 * _aesni_dec1: internal ABI
2126 * KEYP: key struct pointer
2128 * STATE: initial state (input)
2130 * STATE: final state (output)
2137 movaps (KEYP), KEY # key
2139 pxor KEY, STATE # round 0
2143 lea 0x20(TKEYP), TKEYP
2146 movaps -0x60(TKEYP), KEY
2148 movaps -0x50(TKEYP), KEY
2152 movaps -0x40(TKEYP), KEY
2154 movaps -0x30(TKEYP), KEY
2158 movaps -0x20(TKEYP), KEY
2160 movaps -0x10(TKEYP), KEY
2164 movaps 0x10(TKEYP), KEY
2166 movaps 0x20(TKEYP), KEY
2168 movaps 0x30(TKEYP), KEY
2170 movaps 0x40(TKEYP), KEY
2172 movaps 0x50(TKEYP), KEY
2174 movaps 0x60(TKEYP), KEY
2176 movaps 0x70(TKEYP), KEY
2177 AESDECLAST KEY STATE
2179 ENDPROC(_aesni_dec1)
2182 * _aesni_dec4: internal ABI
2184 * KEYP: key struct pointer
2186 * STATE1: initial state (input)
2191 * STATE1: final state (output)
2201 movaps (KEYP), KEY # key
2203 pxor KEY, STATE1 # round 0
2210 lea 0x20(TKEYP), TKEYP
2213 movaps -0x60(TKEYP), KEY
2218 movaps -0x50(TKEYP), KEY
2225 movaps -0x40(TKEYP), KEY
2230 movaps -0x30(TKEYP), KEY
2237 movaps -0x20(TKEYP), KEY
2242 movaps -0x10(TKEYP), KEY
2252 movaps 0x10(TKEYP), KEY
2257 movaps 0x20(TKEYP), KEY
2262 movaps 0x30(TKEYP), KEY
2267 movaps 0x40(TKEYP), KEY
2272 movaps 0x50(TKEYP), KEY
2277 movaps 0x60(TKEYP), KEY
2282 movaps 0x70(TKEYP), KEY
2283 AESDECLAST KEY STATE1 # last round
2284 AESDECLAST KEY STATE2
2285 AESDECLAST KEY STATE3
2286 AESDECLAST KEY STATE4
2288 ENDPROC(_aesni_dec4)
2291 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2294 ENTRY(aesni_ecb_enc)
2304 test LEN, LEN # check length
2313 movups (INP), STATE1
2314 movups 0x10(INP), STATE2
2315 movups 0x20(INP), STATE3
2316 movups 0x30(INP), STATE4
2318 movups STATE1, (OUTP)
2319 movups STATE2, 0x10(OUTP)
2320 movups STATE3, 0x20(OUTP)
2321 movups STATE4, 0x30(OUTP)
2331 movups (INP), STATE1
2333 movups STATE1, (OUTP)
2346 ENDPROC(aesni_ecb_enc)
2349 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2352 ENTRY(aesni_ecb_dec)
2372 movups (INP), STATE1
2373 movups 0x10(INP), STATE2
2374 movups 0x20(INP), STATE3
2375 movups 0x30(INP), STATE4
2377 movups STATE1, (OUTP)
2378 movups STATE2, 0x10(OUTP)
2379 movups STATE3, 0x20(OUTP)
2380 movups STATE4, 0x30(OUTP)
2390 movups (INP), STATE1
2392 movups STATE1, (OUTP)
2405 ENDPROC(aesni_ecb_dec)
2408 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2409 * size_t len, u8 *iv)
2411 ENTRY(aesni_cbc_enc)
2426 movups (IVP), STATE # load iv as initial state
2429 movups (INP), IN # load input
2432 movups STATE, (OUTP) # store output
2447 ENDPROC(aesni_cbc_enc)
2450 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2451 * size_t len, u8 *iv)
2453 ENTRY(aesni_cbc_dec)
2466 jb .Lcbc_dec_just_ret
2476 movups 0x10(INP), IN2
2479 movups 0x20(INP), IN3
2481 movups 0x30(INP), IN4
2484 movups 0x20(INP), IN1
2486 movups 0x30(INP), IN2
2501 movups 0x10(INP), IN2
2504 movups STATE1, (OUTP)
2505 movups STATE2, 0x10(OUTP)
2506 movups STATE3, 0x20(OUTP)
2507 movups STATE4, 0x30(OUTP)
2521 movups STATE, (OUTP)
2538 ENDPROC(aesni_cbc_dec)
2543 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2546 * _aesni_inc_init: internal ABI
2547 * setup registers used by _aesni_inc
2551 * CTR: == IV, in little endian
2552 * TCTR_LOW: == lower qword of CTR
2553 * INC: == 1, in little endian
2554 * BSWAP_MASK == endian swapping mask
2558 movaps .Lbswap_mask, BSWAP_MASK
2560 PSHUFB_XMM BSWAP_MASK CTR
2562 MOVQ_R64_XMM TCTR_LOW INC
2563 MOVQ_R64_XMM CTR TCTR_LOW
2565 ENDPROC(_aesni_inc_init)
2568 * _aesni_inc: internal ABI
2569 * Increase IV by 1; IV is in big endian
2572 * CTR: == IV, in little endian
2573 * TCTR_LOW: == lower qword of CTR
2574 * INC: == 1, in little endian
2575 * BSWAP_MASK == endian swapping mask
2579 * CTR: == output IV, in little endian
2580 * TCTR_LOW: == lower qword of CTR
2592 PSHUFB_XMM BSWAP_MASK IV
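/*
 * Illustrative sketch, not part of the original listing: the effect of
 * _aesni_inc on the big-endian IV, expressed in C:
 *
 *	#include <stdint.h>
 *
 *	static void ctr_inc_be(uint8_t iv[16])
 *	{
 *		int i;
 *
 *		for (i = 15; i >= 0; i--)	// 128-bit big-endian increment
 *			if (++iv[i] != 0)
 *				break;
 *	}
 */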
2597 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2598 * size_t len, u8 *iv)
2600 ENTRY(aesni_ctr_enc)
2602 jb .Lctr_enc_just_ret
2605 call _aesni_inc_init
2615 movups 0x10(INP), IN2
2618 movups 0x20(INP), IN3
2621 movups 0x30(INP), IN4
2624 movups STATE1, (OUTP)
2626 movups STATE2, 0x10(OUTP)
2628 movups STATE3, 0x20(OUTP)
2630 movups STATE4, 0x30(OUTP)
2645 movups STATE, (OUTP)
2655 ENDPROC(aesni_ctr_enc)
2658 * _aesni_gf128mul_x_ble: internal ABI
2659 * Multiply in GF(2^128) for XTS IVs
2662 * GF128MUL_MASK == mask with 0x87 and 0x01
2666 * CTR: == temporary value
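/*
 * Illustrative sketch, not part of the original listing: the tweak update
 * _aesni_gf128mul_x_ble() performs, i.e. multiplication by x in GF(2^128)
 * on a little-endian 128-bit XTS tweak, expressed in C:
 *
 *	#include <stdint.h>
 *
 *	static void gf128mul_x_ble(uint64_t t[2])	// t[0] = low, t[1] = high
 *	{
 *		uint64_t carry = t[1] >> 63;		// bit shifted out of the top
 *
 *		t[1] = (t[1] << 1) | (t[0] >> 63);
 *		t[0] = (t[0] << 1) ^ (carry * 0x87);	// x^128 = x^7 + x^2 + x + 1
 *	}
 */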
2668 #define _aesni_gf128mul_x_ble() \
2669 pshufd $0x13, IV, CTR; \
2672 pand GF128MUL_MASK, CTR; \
2676 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2679 ENTRY(aesni_xts_crypt8)
2683 leaq _aesni_enc4, %r11
2684 leaq _aesni_dec4, %rax
2688 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2695 movdqu 0x00(INP), INC
2697 movdqu IV, 0x00(OUTP)
2699 _aesni_gf128mul_x_ble()
2701 movdqu 0x10(INP), INC
2703 movdqu IV, 0x10(OUTP)
2705 _aesni_gf128mul_x_ble()
2707 movdqu 0x20(INP), INC
2709 movdqu IV, 0x20(OUTP)
2711 _aesni_gf128mul_x_ble()
2713 movdqu 0x30(INP), INC
2715 movdqu IV, 0x30(OUTP)
2719 movdqu 0x00(OUTP), INC
2721 movdqu STATE1, 0x00(OUTP)
2723 _aesni_gf128mul_x_ble()
2725 movdqu 0x40(INP), INC
2727 movdqu IV, 0x40(OUTP)
2729 movdqu 0x10(OUTP), INC
2731 movdqu STATE2, 0x10(OUTP)
2733 _aesni_gf128mul_x_ble()
2735 movdqu 0x50(INP), INC
2737 movdqu IV, 0x50(OUTP)
2739 movdqu 0x20(OUTP), INC
2741 movdqu STATE3, 0x20(OUTP)
2743 _aesni_gf128mul_x_ble()
2745 movdqu 0x60(INP), INC
2747 movdqu IV, 0x60(OUTP)
2749 movdqu 0x30(OUTP), INC
2751 movdqu STATE4, 0x30(OUTP)
2753 _aesni_gf128mul_x_ble()
2755 movdqu 0x70(INP), INC
2757 movdqu IV, 0x70(OUTP)
2759 _aesni_gf128mul_x_ble()
2764 movdqu 0x40(OUTP), INC
2766 movdqu STATE1, 0x40(OUTP)
2768 movdqu 0x50(OUTP), INC
2770 movdqu STATE2, 0x50(OUTP)
2772 movdqu 0x60(OUTP), INC
2774 movdqu STATE3, 0x60(OUTP)
2776 movdqu 0x70(OUTP), INC
2778 movdqu STATE4, 0x70(OUTP)
2781 ENDPROC(aesni_xts_crypt8)