2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
37 POLY: .octa 0xC2000000000000000000000000000001
38 TWOONE: .octa 0x00000001000000000000000000000001
40 # order of these constants should not change.
41 # more specifically, ALL_F should follow SHIFT_MASK,
42 # and ZERO should follow ALL_F
43 # (ALL_F is loaded as ALL_F-SHIFT_MASK(%r12) from a SHIFT_MASK-based pointer)
44 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F # PSHUFB control: reverses all 16 bytes
45 MASK1: .octa 0x0000000000000000ffffffffffffffff
46 MASK2: .octa 0xffffffffffffffff0000000000000000
47 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49 ZERO: .octa 0x00000000000000000000000000000000
50 ONE: .octa 0x00000000000000000000000000000001 # added with paddd to step the counter block
51 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 # NOTE(review): 31 hex digits, top nibble is implicitly 0 - confirm intended
59 #define STACK_OFFSET 8*3 // NOTE(review): presumably the bytes pushed before %r14 snapshots %rsp - confirm against the prologue
60 #define HashKey 16*0 // store HashKey <<1 mod poly here
61 #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62 #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63 #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64 #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 // (for Karatsuba purposes)
67 #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70 #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73 #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76 #define VARIABLE_OFFSET 16*8 // bytes of %rsp-relative scratch: the eight 16-byte slots above
84 #define arg7 STACK_OFFSET+8(%r14) // stack-passed argument 7 (loaded as the AAD pointer)
85 #define arg8 STACK_OFFSET+16(%r14) // stack-passed argument 8 (loaded as aadLen)
86 #define arg9 STACK_OFFSET+24(%r14) // stack-passed argument 9
87 #define arg10 STACK_OFFSET+32(%r14) // stack-passed argument 10
104 #define BSWAP_MASK %xmm10
136 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
139 * Input: A and B (128-bits each, bit-reflected)
140 * Output: C = A*B*x mod poly, (i.e. >>1 )
141 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 * GH is overwritten with the result; TMP1-TMP5 are used as scratch.
145 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
147 pshufd $78, \GH, \TMP2 # TMP2 = GH with its 64-bit halves swapped
148 pshufd $78, \HK, \TMP3 # TMP3 = HK with its 64-bit halves swapped
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
155 pxor \TMP1, \TMP2 # TMP2 ^= a1*b1 (Karatsuba middle-term fold)
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
160 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
162 # first phase of the reduction
166 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167 # in order to shift each copy independently
169 pslld $31, \TMP2 # packed left shift <<31
170 pslld $30, \TMP3 # packed left shift <<30
171 pslld $25, \TMP4 # packed left shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
179 # second phase of the reduction
181 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182 # in order to shift each copy independently
186 psrld $1,\TMP2 # packed right shift >>1
187 psrld $2,\TMP3 # packed right shift >>2
188 psrld $7,\TMP4 # packed right shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
193 pxor \TMP1, \GH # result is in GH
197 * if a = number of total plaintext bytes
199 * num_initial_blocks = b mod 4
200 * encrypt the initial num_initial_blocks blocks and apply ghash on
202 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
204 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
208 .macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
214 _get_AAD_loop\num_initial_blocks\operation:
221 jne _get_AAD_loop\num_initial_blocks\operation
223 je _get_AAD_loop2_done\num_initial_blocks\operation
225 _get_AAD_loop2\num_initial_blocks\operation:
229 jne _get_AAD_loop2\num_initial_blocks\operation
230 _get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
234 xor %r11, %r11 # initialise the data pointer offset as zero
236 # start AES for num_initial_blocks blocks
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
243 .if (\i == 5) || (\i == 6) || (\i == 7)
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
252 pxor 16*0(%arg1), %xmm\index # XOR with the key block at offset 0
255 movaps 0x10(%rdi), \TMP1 # NOTE(review): %rdi here, %arg1 in every other round - confirm they alias
256 AESENC \TMP1, %xmm\index # Round 1
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
263 movaps 0x30(%arg1), \TMP1
264 AESENC \TMP1, %xmm\index # Round 3
267 movaps 0x40(%arg1), \TMP1
268 AESENC \TMP1, %xmm\index # Round 4
271 movaps 0x50(%arg1), \TMP1
272 AESENC \TMP1, %xmm\index # Round 5
275 movaps 0x60(%arg1), \TMP1
276 AESENC \TMP1, %xmm\index # Round 6
279 movaps 0x70(%arg1), \TMP1
280 AESENC \TMP1, %xmm\index # Round 7
283 movaps 0x80(%arg1), \TMP1
284 AESENC \TMP1, %xmm\index # Round 8
287 movaps 0x90(%arg1), \TMP1
288 AESENC \TMP1, %xmm\index # Round 9
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
301 movdqa \TMP1, %xmm\index # keep the input block just read: it is what gets byte-swapped below
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
305 # prepare plaintext/ciphertext for GHASH computation
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
332 * Precomputations for HashKey parallel with encryption of first 4 blocks.
333 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
335 paddd ONE(%rip), \XMM0 # INCR Y0
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
340 paddd ONE(%rip), \XMM0 # INCR Y0
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
345 paddd ONE(%rip), \XMM0 # INCR Y0
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
350 paddd ONE(%rip), \XMM0 # INCR Y0
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
360 pshufd $78, \TMP3, \TMP1 # TMP1 = TMP3 with its 64-bit halves swapped
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364 # TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366 # HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370 .irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378 # TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383 .irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391 # TMP5 = HashKey^4<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1 # Round 10 for all four blocks
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
421 # combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
429 _initial_blocks_done\num_initial_blocks\operation:
435 * if a = number of total plaintext bytes
437 * num_initial_blocks = b mod 4
438 * encrypt the initial num_initial_blocks blocks and apply ghash on
440 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
442 * %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
446 .macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
452 _get_AAD_loop\num_initial_blocks\operation:
459 jne _get_AAD_loop\num_initial_blocks\operation
461 je _get_AAD_loop2_done\num_initial_blocks\operation
463 _get_AAD_loop2\num_initial_blocks\operation:
467 jne _get_AAD_loop2\num_initial_blocks\operation
468 _get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
472 xor %r11, %r11 # initialise the data pointer offset as zero
474 # start AES for num_initial_blocks blocks
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
481 .if (\i == 5) || (\i == 6) || (\i == 7)
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
490 pxor 16*0(%arg1), %xmm\index # XOR with the key block at offset 0
493 movaps 0x10(%rdi), \TMP1 # NOTE(review): %rdi here, %arg1 in every other round - confirm they alias
494 AESENC \TMP1, %xmm\index # Round 1
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
501 movaps 0x30(%arg1), \TMP1
502 AESENC \TMP1, %xmm\index # Round 3
505 movaps 0x40(%arg1), \TMP1
506 AESENC \TMP1, %xmm\index # Round 4
509 movaps 0x50(%arg1), \TMP1
510 AESENC \TMP1, %xmm\index # Round 5
513 movaps 0x60(%arg1), \TMP1
514 AESENC \TMP1, %xmm\index # Round 6
517 movaps 0x70(%arg1), \TMP1
518 AESENC \TMP1, %xmm\index # Round 7
521 movaps 0x80(%arg1), \TMP1
522 AESENC \TMP1, %xmm\index # Round 8
525 movaps 0x90(%arg1), \TMP1
526 AESENC \TMP1, %xmm\index # Round 9
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index # byte-swap the block just written out
542 # prepare plaintext/ciphertext for GHASH computation
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
569 * Precomputations for HashKey parallel with encryption of first 4 blocks.
570 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
572 paddd ONE(%rip), \XMM0 # INCR Y0
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
577 paddd ONE(%rip), \XMM0 # INCR Y0
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
582 paddd ONE(%rip), \XMM0 # INCR Y0
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
587 paddd ONE(%rip), \XMM0 # INCR Y0
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
597 pshufd $78, \TMP3, \TMP1 # TMP1 = TMP3 with its 64-bit halves swapped
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601 # TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603 # HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607 .irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615 # TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620 .irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628 # TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1 # Round 10 for all four blocks
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
655 # combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
663 _initial_blocks_done\num_initial_blocks\operation:
668 * encrypt 4 blocks at a time
669 * ghash the 4 previously encrypted ciphertext blocks
670 * %arg1, %arg2, %arg3 are used as pointers only, not modified
671 * %r11 is the data offset value
673 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
681 movdqa SHUF_MASK(%rip), %xmm15 # xmm15 = byte-swap mask used by every PSHUFB_XMM in this macro
682 # multiply XMM5 * HashKey^4 (loaded into TMP5) using karatsuba
685 pshufd $78, \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
691 paddd ONE(%rip), \XMM0 # INCR CNT
693 paddd ONE(%rip), \XMM0 # INCR CNT
695 paddd ONE(%rip), \XMM0 # INCR CNT
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
720 pshufd $78, \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
743 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
747 pshufd $78, \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
751 # Multiply XMM7 * HashKey^2 (loaded into TMP5) using karatsuba
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
773 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
781 pshufd $78, \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
826 # first phase of reduction
831 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832 pslld $31, \TMP2 # packed left shift << 31
833 pslld $30, \TMP3 # packed left shift << 30
834 pslld $25, \TMP4 # packed left shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
842 # second phase of reduction
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
847 psrld $1, \TMP2 # packed right shift >>1
848 psrld $2, \TMP3 # packed right shift >>2
849 psrld $7, \TMP4 # packed right shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
854 pxor \TMP1, \XMM5 # result is in XMM5
860 * decrypt 4 blocks at a time
861 * ghash the 4 previously decrypted ciphertext blocks
862 * %arg1, %arg2, %arg3 are used as pointers only, not modified
863 * %r11 is the data offset value
865 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
873 movdqa SHUF_MASK(%rip), %xmm15 # xmm15 = byte-swap mask used by every PSHUFB_XMM in this macro
874 # multiply XMM5 * HashKey^4 (loaded into TMP5) using karatsuba
877 pshufd $78, \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
883 paddd ONE(%rip), \XMM0 # INCR CNT
885 paddd ONE(%rip), \XMM0 # INCR CNT
887 paddd ONE(%rip), \XMM0 # INCR CNT
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
912 pshufd $78, \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
935 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
939 pshufd $78, \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
943 # Multiply XMM7 * HashKey^2 (loaded into TMP5) using karatsuba
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
965 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
973 pshufd $78, \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1022 # first phase of reduction
1027 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028 pslld $31, \TMP2 # packed left shift << 31
1029 pslld $30, \TMP3 # packed left shift << 30
1030 pslld $25, \TMP4 # packed left shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1038 # second phase of reduction
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1043 psrld $1, \TMP2 # packed right shift >>1
1044 psrld $2, \TMP3 # packed right shift >>2
1045 psrld $7, \TMP4 # packed right shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1050 pxor \TMP1, \XMM5 # result is in XMM5
1055 /* GHASH the last 4 ciphertext blocks. */
1056 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1059 # Multiply XMM1 * HashKey^4 (using Karatsuba)
1062 pshufd $78, \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1072 # Multiply XMM2 * HashKey^3 (using Karatsuba)
1075 pshufd $78, \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1085 # results accumulated in TMP6, XMMDst, XMM1
1087 # Multiply XMM3 * HashKey^2 (using Karatsuba)
1090 pshufd $78, \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1101 # Multiply XMM4 * HashKey (using Karatsuba)
1103 pshufd $78, \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1115 # middle section of the temp results combined as in karatsuba algorithm
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1121 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127 pslld $31, \TMP2 # packed left shift << 31
1128 pslld $30, \TMP3 # packed left shift << 30
1129 pslld $25, \TMP4 # packed left shift << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
1142 psrld $1, \TMP2 # packed right shift >> 1
1143 psrld $2, \TMP3 # packed right shift >> 2
1144 psrld $7, \TMP4 # packed right shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1152 /* Encrypt a single block: XMM0 is transformed in place, TMP1 is scratch for the 16-byte key blocks read from %arg1 */
1153 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1156 movaps 16(%arg1), \TMP1 # key block for round 1
1158 movaps 32(%arg1), \TMP1 # key block for round 2
1160 movaps 48(%arg1), \TMP1 # key block for round 3
1162 movaps 64(%arg1), \TMP1 # key block for round 4
1164 movaps 80(%arg1), \TMP1 # key block for round 5
1166 movaps 96(%arg1), \TMP1 # key block for round 6
1168 movaps 112(%arg1), \TMP1 # key block for round 7
1170 movaps 128(%arg1), \TMP1 # key block for round 8
1172 movaps 144(%arg1), \TMP1 # key block for round 9
1174 movaps 160(%arg1), \TMP1 # key block for round 10 (final)
1175 AESENCLAST \TMP1, \XMM0 # final round, result left in XMM0
1179 /*****************************************************************************
1180 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182 * const u8 *in, // Ciphertext input
1183 * u64 plaintext_len, // Length of data in bytes for decryption.
1184 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186 * // concatenated with 0x00000001. 16-byte aligned pointer.
1187 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188 * const u8 *aad, // Additional Authentication Data (AAD)
1189 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191 * // given authentication tag and only return the plaintext if they match.
1192 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193 * // (most likely), 12 or 8.
1198 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1199 * set of 11 keys in the data structure void *aes_ctx
1203 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205 * | Salt (From the SA) |
1206 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207 * | Initialization Vector |
1208 * | (This is the sequence number from IPSec header) |
1209 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1211 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1216 * AAD padded to 128 bits with 0
1217 * for example, assume AAD is a u32 vector
1219 * if AAD is 8 bytes:
1220 * AAD[3] = {A0, A1};
1221 * padded AAD in xmm register = {A1 A0 0 0}
1224 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1227 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228 * | 32-bit Sequence Number (A0) |
1229 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1231 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1233 * AAD Format with 32-bit Sequence Number
1235 * if AAD is 12 bytes:
1236 * AAD[3] = {A0, A1, A2};
1237 * padded AAD in xmm register = {A2 A1 A0 0}
1240 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1242 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1245 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246 * | 64-bit Extended Sequence Number {A1,A0} |
1248 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1250 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1252 * AAD Format with 64-bit Extended Sequence Number
1255 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256 * The code supports 16 too but for other sizes, the code will fail.
1259 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260 * For other sizes, the code will fail.
1262 * poly = x^128 + x^127 + x^126 + x^121 + 1
1264 *****************************************************************************/
1265 ENTRY(aesni_gcm_dec)
1271 * states of %xmm registers %xmm6:%xmm15 not saved
1272 * all %xmm registers are clobbered
1274 sub $VARIABLE_OFFSET, %rsp # reserve stack scratch space; HashKey(%rsp) is written below
1275 and $~63, %rsp # align rsp to 64 bytes
1277 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1278 movdqa SHUF_MASK(%rip), %xmm2 # SHUF_MASK is a full 16-byte reversal control for PSHUFB
1279 PSHUFB_XMM %xmm2, %xmm13 # byte-swap the HashKey
1282 # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284 movdqa %xmm13, %xmm2 # working copy of the byte-swapped HashKey
1294 pshufd $0x24, %xmm1, %xmm2 # %xmm2 = dwords {0,1,2,0} of %xmm1 (low to high)
1295 pcmpeqd TWOONE(%rip), %xmm2 # per-dword compare with TWOONE: all-ones where equal, else zero
1296 pand POLY(%rip), %xmm2 # keep the POLY bits only in the lanes that matched
1297 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1300 # Decrypt first few blocks
1302 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1303 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1304 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
# Dispatch to the INITIAL_BLOCKS_DEC expansion for 0, 1, 2 or 3 leading blocks
# (the first macro argument is that block count).
1307 jz _initial_num_blocks_is_0_decrypt
1309 jb _initial_num_blocks_is_1_decrypt
1310 je _initial_num_blocks_is_2_decrypt
1311 _initial_num_blocks_is_3_decrypt:
1312 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1313 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 jmp _initial_blocks_decrypted
1316 _initial_num_blocks_is_2_decrypt:
1317 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1318 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 jmp _initial_blocks_decrypted
1321 _initial_num_blocks_is_1_decrypt:
1322 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1323 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 jmp _initial_blocks_decrypted
1326 _initial_num_blocks_is_0_decrypt:
1327 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1328 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1329 _initial_blocks_decrypted:
1331 je _zero_cipher_left_decrypt # bypass both the 4-block step and GHASH_LAST_4
1333 je _four_cipher_left_decrypt # bypass the 4-block step, go straight to GHASH_LAST_4
1335 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1336 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1340 _four_cipher_left_decrypt:
1341 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1342 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1343 _zero_cipher_left_decrypt:
1345 and $15, %r13 # %r13 = arg4 (mod 16)
1346 je _multiple_of_16_bytes_decrypt
1348 # Handle the last <16 byte block separately
1350 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1351 movdqa SHUF_MASK(%rip), %xmm10
1352 PSHUFB_XMM %xmm10, %xmm0 # byte-swap the counter block before encrypting it
1354 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1357 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1358 lea SHIFT_MASK+16(%rip), %r12 # mask lookup base; relies on ALL_F directly following SHIFT_MASK
1360 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1361 # (%r13 is the number of bytes in plaintext mod 16)
1362 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1363 PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 bytes
1366 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1367 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # ALL_F-SHIFT_MASK is the distance between the two tables
1368 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1369 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 movdqa SHUF_MASK(%rip), %xmm10
1372 PSHUFB_XMM %xmm10 ,%xmm2 # byte-swap %xmm2 with SHUF_MASK
1375 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1376 # GHASH computation for the last <16 byte block
# Write the partial block held in %xmm0 to the output buffer at offset %r11:
# an 8-byte store first when enough bytes remain, then single bytes.
1381 MOVQ_R64_XMM %xmm0, %rax # low qword of %xmm0 -> %rax
1383 jle _less_than_8_bytes_left_decrypt
1384 mov %rax, (%arg2 , %r11, 1) # store 8 bytes at %arg2 + %r11
1387 MOVQ_R64_XMM %xmm0, %rax # refill %rax from %xmm0 for the byte-wise tail
1389 _less_than_8_bytes_left_decrypt:
1390 mov %al, (%arg2, %r11, 1) # store one byte at %arg2 + %r11
1394 jne _less_than_8_bytes_left_decrypt # repeat for each remaining byte
1395 _multiple_of_16_bytes_decrypt:
1396 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1397 shl $3, %r12 # convert into number of bits
1398 movd %r12d, %xmm15 # len(A) in %xmm15
1399 shl $3, %arg4 # len(C) in bits (bytes * 8)
1400 MOVQ_R64_XMM %arg4, %xmm1 # len(C) in bits -> low qword of %xmm1
1401 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1402 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1405 # final GHASH computation
1406 movdqa SHUF_MASK(%rip), %xmm10
1407 PSHUFB_XMM %xmm10, %xmm8 # byte-swap the GHASH accumulator
1409 mov %arg5, %rax # %rax = *Y0
1410 movdqu (%rax), %xmm0 # %xmm0 = Y0
1411 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1414 mov arg9, %r10 # %r10 = authTag
1415 mov arg10, %r11 # %r11 = auth_tag_len
# Write %xmm0 to authTag (%r10). NOTE(review): presumably one path per
# supported tag length (8, 12 or 16 bytes); confirm against the full routine.
1421 MOVQ_R64_XMM %xmm0, %rax
1423 jmp _return_T_done_decrypt
1425 MOVQ_R64_XMM %xmm0, %rax
1430 jmp _return_T_done_decrypt
1432 movdqu %xmm0, (%r10) # store all 16 bytes of %xmm0 at authTag
1433 _return_T_done_decrypt:
1439 ENDPROC(aesni_gcm_dec)
1442 /*****************************************************************************
1443 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445 * const u8 *in, // Plaintext input
1446 * u64 plaintext_len, // Length of data in bytes for encryption.
1447 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449 * // concatenated with 0x00000001. 16-byte aligned pointer.
1450 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451 * const u8 *aad, // Additional Authentication Data (AAD)
1452 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453 * u8 *auth_tag, // Authenticated Tag output.
1454 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1460 * keys are pre-expanded and aligned to 16 bytes. we are using the
1461 * first set of 11 keys in the data structure void *aes_ctx
1466 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468 * | Salt (From the SA) |
1469 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470 * | Initialization Vector |
1471 * | (This is the sequence number from IPSec header) |
1472 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1474 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1479 * AAD padded to 128 bits with 0
1480 * for example, assume AAD is a u32 vector
1482 * if AAD is 8 bytes:
1483 * AAD[3] = {A0, A1};
1484 * padded AAD in xmm register = {A1 A0 0 0}
1487 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1490 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491 * | 32-bit Sequence Number (A0) |
1492 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1494 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1496 * AAD Format with 32-bit Sequence Number
1498 * if AAD is 12 bytes:
1499 * AAD[3] = {A0, A1, A2};
1500 * padded AAD in xmm register = {A2 A1 A0 0}
1503 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1506 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507 * | 64-bit Extended Sequence Number {A1,A0} |
1509 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1511 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1513 * AAD Format with 64-bit Extended Sequence Number
1516 * from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517 * The code supports 16 too but for other sizes, the code will fail.
1520 * from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521 * For other sizes, the code will fail.
1523 * poly = x^128 + x^127 + x^126 + x^121 + 1
1524 ***************************************************************************/
1525 ENTRY(aesni_gcm_enc)
1531 # states of %xmm registers %xmm6:%xmm15 not saved
1532 # all %xmm registers are clobbered
1534 sub $VARIABLE_OFFSET, %rsp # reserve stack scratch space; HashKey(%rsp) is written below
1537 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1538 movdqa SHUF_MASK(%rip), %xmm2 # SHUF_MASK is a full 16-byte reversal control for PSHUFB
1539 PSHUFB_XMM %xmm2, %xmm13 # byte-swap the HashKey
1542 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1544 movdqa %xmm13, %xmm2 # working copy of the byte-swapped HashKey
1554 pshufd $0x24, %xmm1, %xmm2 # %xmm2 = dwords {0,1,2,0} of %xmm1 (low to high)
1555 pcmpeqd TWOONE(%rip), %xmm2 # per-dword compare with TWOONE: all-ones where equal, else zero
1556 pand POLY(%rip), %xmm2 # keep the POLY bits only in the lanes that matched
1558 movdqa %xmm13, HashKey(%rsp) # %xmm13 holds HashKey<<1 (mod poly); store it
1559 mov %arg4, %r13 # %r13 = plaintext_len in bytes (fourth argument)
1563 # Encrypt first few blocks
# Dispatch to the INITIAL_BLOCKS_ENC expansion for 0, 1, 2 or 3 leading blocks
# (the first macro argument is that block count).
1566 jz _initial_num_blocks_is_0_encrypt
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570 _initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1574 jmp _initial_blocks_encrypted
1575 _initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1579 jmp _initial_blocks_encrypted
1580 _initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1584 jmp _initial_blocks_encrypted
1585 _initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588 _initial_blocks_encrypted:
1590 # Main loop - Encrypt remaining blocks
1593 je _zero_cipher_left_encrypt # bypass both the 4-block loop and GHASH_LAST_4
1595 je _four_cipher_left_encrypt # bypass the 4-block loop, go straight to GHASH_LAST_4
1596 _encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598 %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1601 jne _encrypt_by_4_encrypt # loop back for the next group of four blocks
1602 _four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605 _zero_cipher_left_encrypt:
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0 # byte-swap the counter block before encrypting it
1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1619 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1620 lea SHIFT_MASK+16(%rip), %r12 # mask lookup base; relies on ALL_F directly following SHIFT_MASK
1622 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623 # (%r13 is the number of bytes in plaintext mod 16)
1624 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1625 PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
1626 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1627 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # ALL_F-SHIFT_MASK is the distance between the two tables
1628 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10,%xmm0 # byte-swap the ciphertext block for hashing
1634 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635 # GHASH computation for the last <16 byte block
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1642 # shuffle xmm0 back to output as ciphertext
# Write the partial block held in %xmm0 to the output buffer at offset %r11:
# an 8-byte store first when enough bytes remain, then single bytes.
1645 MOVQ_R64_XMM %xmm0, %rax # low qword of %xmm0 -> %rax
1647 jle _less_than_8_bytes_left_encrypt
1648 mov %rax, (%arg2 , %r11, 1) # store 8 bytes at out + %r11
1651 MOVQ_R64_XMM %xmm0, %rax # refill %rax from %xmm0 for the byte-wise tail
1653 _less_than_8_bytes_left_encrypt:
1654 mov %al, (%arg2, %r11, 1) # store one byte at out + %r11
1658 jne _less_than_8_bytes_left_encrypt # repeat for each remaining byte
1659 _multiple_of_16_bytes_encrypt:
1660 mov arg8, %r12 # %r12 = aadLen (number of bytes)
1662 movd %r12d, %xmm15 # len(A) in %xmm15
1663 shl $3, %arg4 # len(C) in bits (bytes * 8)
1664 MOVQ_R64_XMM %arg4, %xmm1 # len(C) in bits -> low qword of %xmm1
1665 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1666 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1668 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669 # final GHASH computation
1670 movdqa SHUF_MASK(%rip), %xmm10
1671 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1673 mov %arg5, %rax # %rax = *Y0
1674 movdqu (%rax), %xmm0 # %xmm0 = Y0
1675 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1678 mov arg9, %r10 # %r10 = authTag
1679 mov arg10, %r11 # %r11 = auth_tag_len
# Write %xmm0 to authTag (%r10). NOTE(review): presumably one path per
# supported tag length (8, 12 or 16 bytes); confirm against the full routine.
1685 MOVQ_R64_XMM %xmm0, %rax
1687 jmp _return_T_done_encrypt
1689 MOVQ_R64_XMM %xmm0, %rax
1694 jmp _return_T_done_encrypt
1696 movdqu %xmm0, (%r10) # store all 16 bytes of %xmm0 at authTag
1697 _return_T_done_encrypt:
1703 ENDPROC(aesni_gcm_enc)
# Key-expansion helper: derives the next 16-byte key in %xmm0 from the
# key-generation-assist value in %xmm1 and stores it at TKEYP.
# %xmm4 is used as scratch and is expected to be zero on entry.
1710 _key_expansion_256a:
1711 pshufd $0b11111111, %xmm1, %xmm1 # broadcast dword 3 of %xmm1 to all four lanes
1712 shufps $0b00010000, %xmm0, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[0], %xmm0[1], %xmm0[0]} (low to high)
1714 shufps $0b10001100, %xmm0, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[3], %xmm0[0], %xmm0[2]} (low to high)
1717 movaps %xmm0, (TKEYP) # aligned 16-byte store of the key at TKEYP
1720 ENDPROC(_key_expansion_128)
1721 ENDPROC(_key_expansion_256a)
# Key-expansion helper used on the 192-bit path: works on %xmm0/%xmm2 with
# the key-generation-assist value in %xmm1 and writes two 16-byte keys,
# at TKEYP and at TKEYP+0x10.
1724 _key_expansion_192a:
1725 pshufd $0b01010101, %xmm1, %xmm1 # broadcast dword 1 of %xmm1 to all four lanes
1726 shufps $0b00010000, %xmm0, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[0], %xmm0[1], %xmm0[0]} (low to high)
1728 shufps $0b10001100, %xmm0, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[3], %xmm0[0], %xmm0[2]} (low to high)
1735 pshufd $0b11111111, %xmm0, %xmm3 # %xmm3 = dword 3 of %xmm0 in all four lanes
1740 shufps $0b01000100, %xmm0, %xmm6 # %xmm6 = {%xmm6[0], %xmm6[1], %xmm0[0], %xmm0[1]} (low to high)
1741 movaps %xmm6, (TKEYP) # first 16-byte key stored at TKEYP
1742 shufps $0b01001110, %xmm2, %xmm1 # %xmm1 = {%xmm1[2], %xmm1[3], %xmm2[0], %xmm2[1]} (low to high)
1743 movaps %xmm1, 0x10(TKEYP) # second 16-byte key stored at TKEYP+0x10
1746 ENDPROC(_key_expansion_192a)
# Key-expansion helper used on the 192-bit path, alternating with
# _key_expansion_192a; writes a single 16-byte key (%xmm0) at TKEYP.
1749 _key_expansion_192b:
1750 pshufd $0b01010101, %xmm1, %xmm1 # broadcast dword 1 of %xmm1 to all four lanes
1751 shufps $0b00010000, %xmm0, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[0], %xmm0[1], %xmm0[0]} (low to high)
1753 shufps $0b10001100, %xmm0, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[3], %xmm0[0], %xmm0[2]} (low to high)
1759 pshufd $0b11111111, %xmm0, %xmm3 # %xmm3 = dword 3 of %xmm0 in all four lanes
1763 movaps %xmm0, (TKEYP) # aligned 16-byte store of the key at TKEYP
1766 ENDPROC(_key_expansion_192b)
# Key-expansion helper used on the 256-bit path, alternating with
# _key_expansion_256a; works on %xmm2 and stores it at TKEYP.
1769 _key_expansion_256b:
1770 pshufd $0b10101010, %xmm1, %xmm1 # broadcast dword 2 of %xmm1 to all four lanes
1771 shufps $0b00010000, %xmm2, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[0], %xmm2[1], %xmm2[0]} (low to high)
1773 shufps $0b10001100, %xmm2, %xmm4 # %xmm4 = {%xmm4[0], %xmm4[3], %xmm2[0], %xmm2[2]} (low to high)
1776 movaps %xmm2, (TKEYP) # aligned 16-byte store of the key at TKEYP
1779 ENDPROC(_key_expansion_256b)
1782 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1783 * unsigned int key_len)
1785 ENTRY(aesni_set_key)
1788 movl 8(%esp), KEYP # ctx
1789 movl 12(%esp), UKEYP # in_key
1790 movl 16(%esp), %edx # key_len
1792 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1793 movaps %xmm0, (KEYP) # first 16 key bytes become the first entry of the schedule
1794 lea 0x10(KEYP), TKEYP # key addr
1795 movl %edx, 480(KEYP) # record key_len at offset 480 of ctx (read back as KLEN elsewhere)
1796 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
# Expansion through the _key_expansion_256a/_256b helpers: a second
# 16 bytes of user key are loaded into %xmm2.
1800 movups 0x10(UKEYP), %xmm2 # other user key
1801 movaps %xmm2, (TKEYP) # second 16 key bytes stored at TKEYP
1803 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1804 call _key_expansion_256a
1805 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1806 call _key_expansion_256b
1807 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1808 call _key_expansion_256a
1809 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1810 call _key_expansion_256b
1811 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1812 call _key_expansion_256a
1813 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1814 call _key_expansion_256b
1815 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1816 call _key_expansion_256a
1817 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1818 call _key_expansion_256b
1819 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1820 call _key_expansion_256a
1821 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1822 call _key_expansion_256b
1823 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1824 call _key_expansion_256a
1825 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1826 call _key_expansion_256b
1827 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1828 call _key_expansion_256a
# Expansion through the _key_expansion_192a/_192b helpers: only 8 more
# bytes of user key are loaded (movq) into %xmm2.
1831 movq 0x10(UKEYP), %xmm2 # other user key
1832 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1833 call _key_expansion_192a
1834 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1835 call _key_expansion_192b
1836 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1837 call _key_expansion_192a
1838 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1839 call _key_expansion_192b
1840 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1841 call _key_expansion_192a
1842 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1843 call _key_expansion_192b
1844 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1845 call _key_expansion_192a
1846 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1847 call _key_expansion_192b
# Expansion through _key_expansion_128: ten rounds driven from %xmm0 only.
1850 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1851 call _key_expansion_128
1852 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1853 call _key_expansion_128
1854 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1855 call _key_expansion_128
1856 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1857 call _key_expansion_128
1858 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1859 call _key_expansion_128
1860 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1861 call _key_expansion_128
1862 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1863 call _key_expansion_128
1864 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1865 call _key_expansion_128
1866 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1867 call _key_expansion_128
1868 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1869 call _key_expansion_128
# NOTE(review): the stores at offset 240 and through UKEYP below presumably
# build the decryption key schedule; confirm against struct crypto_aes_ctx.
1872 movaps (KEYP), %xmm0 # key at the start of ctx
1873 movaps (TKEYP), %xmm1 # key at the current TKEYP position
1874 movaps %xmm0, 240(TKEYP) # copy of the ctx-start key, 240 bytes past TKEYP
1875 movaps %xmm1, 240(KEYP) # copy of the TKEYP key, at offset 240 of ctx
1877 lea 240-16(TKEYP), UKEYP # UKEYP is reused as a write pointer 224 bytes past TKEYP
1880 movaps (KEYP), %xmm0
1882 movaps %xmm1, (UKEYP) # store %xmm1 at the UKEYP write pointer
1892 ENDPROC(aesni_set_key)
1895 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
# Single-block wrapper: fetch the key length, load one 16-byte block from INP
# into STATE, and write STATE back to OUTP.
1905 movl 480(KEYP), KLEN # key length
1906 movups (INP), STATE # input
1908 movups STATE, (OUTP) # output
1917 * _aesni_enc1: internal ABI
1919 * KEYP: key struct pointer
1921 * STATE: initial state (input)
1923 * STATE: final state (output)
# Round keys are fetched into KEY with aligned loads. Round 0 is XORed into
# STATE; later keys are addressed relative to TKEYP, from -0x60 up to +0x70,
# and the key at +0x70 feeds AESENCLAST.
1930 movaps (KEYP), KEY # key
1932 pxor KEY, STATE # round 0
1936 lea 0x20(TKEYP), TKEYP # advance TKEYP by 32 bytes
1939 movaps -0x60(TKEYP), KEY
1941 movaps -0x50(TKEYP), KEY
1945 movaps -0x40(TKEYP), KEY
1947 movaps -0x30(TKEYP), KEY
1951 movaps -0x20(TKEYP), KEY
1953 movaps -0x10(TKEYP), KEY
1957 movaps 0x10(TKEYP), KEY
1959 movaps 0x20(TKEYP), KEY
1961 movaps 0x30(TKEYP), KEY
1963 movaps 0x40(TKEYP), KEY
1965 movaps 0x50(TKEYP), KEY
1967 movaps 0x60(TKEYP), KEY
1969 movaps 0x70(TKEYP), KEY # key for the last round
1970 AESENCLAST KEY STATE # last round
1972 ENDPROC(_aesni_enc1)
1975 * _aesni_enc4: internal ABI
1977 * KEYP: key struct pointer
1979 * STATE1: initial state (input)
1984 * STATE1: final state (output)
# Four-block variant of the single-block encrypt: each round key is loaded
# once into KEY; the last one is applied to STATE1..STATE4 with AESENCLAST.
1994 movaps (KEYP), KEY # key
1996 pxor KEY, STATE1 # round 0
2003 lea 0x20(TKEYP), TKEYP # advance TKEYP by 32 bytes
2006 movaps -0x60(TKEYP), KEY
2011 movaps -0x50(TKEYP), KEY
2018 movaps -0x40(TKEYP), KEY
2023 movaps -0x30(TKEYP), KEY
2030 movaps -0x20(TKEYP), KEY
2035 movaps -0x10(TKEYP), KEY
2045 movaps 0x10(TKEYP), KEY
2050 movaps 0x20(TKEYP), KEY
2055 movaps 0x30(TKEYP), KEY
2060 movaps 0x40(TKEYP), KEY
2065 movaps 0x50(TKEYP), KEY
2070 movaps 0x60(TKEYP), KEY
2075 movaps 0x70(TKEYP), KEY # key for the last round
2076 AESENCLAST KEY STATE1 # last round
2077 AESENCLAST KEY STATE2
2078 AESENCLAST KEY STATE3
2079 AESENCLAST KEY STATE4
2081 ENDPROC(_aesni_enc4)
2084 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
# Single-block wrapper: fetch the key length, load one 16-byte block from INP
# into STATE, and write STATE back to OUTP.
2094 mov 480(KEYP), KLEN # key length
2096 movups (INP), STATE # input
2098 movups STATE, (OUTP) #output
2107 * _aesni_dec1: internal ABI
2109 * KEYP: key struct pointer
2111 * STATE: initial state (input)
2113 * STATE: final state (output)
# Round keys are fetched into KEY with aligned loads. Round 0 is XORed into
# STATE; later keys are addressed relative to TKEYP, from -0x60 up to +0x70,
# and the key at +0x70 feeds AESDECLAST.
2120 movaps (KEYP), KEY # key
2122 pxor KEY, STATE # round 0
2126 lea 0x20(TKEYP), TKEYP # advance TKEYP by 32 bytes
2129 movaps -0x60(TKEYP), KEY
2131 movaps -0x50(TKEYP), KEY
2135 movaps -0x40(TKEYP), KEY
2137 movaps -0x30(TKEYP), KEY
2141 movaps -0x20(TKEYP), KEY
2143 movaps -0x10(TKEYP), KEY
2147 movaps 0x10(TKEYP), KEY
2149 movaps 0x20(TKEYP), KEY
2151 movaps 0x30(TKEYP), KEY
2153 movaps 0x40(TKEYP), KEY
2155 movaps 0x50(TKEYP), KEY
2157 movaps 0x60(TKEYP), KEY
2159 movaps 0x70(TKEYP), KEY # key for the last round
2160 AESDECLAST KEY STATE # last round
2162 ENDPROC(_aesni_dec1)
2165 * _aesni_dec4: internal ABI
2167 * KEYP: key struct pointer
2169 * STATE1: initial state (input)
2174 * STATE1: final state (output)
# Four-block variant of the single-block decrypt: each round key is loaded
# once into KEY; the last one is applied to STATE1..STATE4 with AESDECLAST.
2184 movaps (KEYP), KEY # key
2186 pxor KEY, STATE1 # round 0
2193 lea 0x20(TKEYP), TKEYP # advance TKEYP by 32 bytes
2196 movaps -0x60(TKEYP), KEY
2201 movaps -0x50(TKEYP), KEY
2208 movaps -0x40(TKEYP), KEY
2213 movaps -0x30(TKEYP), KEY
2220 movaps -0x20(TKEYP), KEY
2225 movaps -0x10(TKEYP), KEY
2235 movaps 0x10(TKEYP), KEY
2240 movaps 0x20(TKEYP), KEY
2245 movaps 0x30(TKEYP), KEY
2250 movaps 0x40(TKEYP), KEY
2255 movaps 0x50(TKEYP), KEY
2260 movaps 0x60(TKEYP), KEY
2265 movaps 0x70(TKEYP), KEY # key for the last round
2266 AESDECLAST KEY STATE1 # last round
2267 AESDECLAST KEY STATE2
2268 AESDECLAST KEY STATE3
2269 AESDECLAST KEY STATE4
2271 ENDPROC(_aesni_dec4)
2274 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2277 ENTRY(aesni_ecb_enc)
2287 test LEN, LEN # check length
# Four-block path: load 64 bytes from INP into STATE1..STATE4, then write
# the four states back to OUTP at the same offsets.
2296 movups (INP), STATE1
2297 movups 0x10(INP), STATE2
2298 movups 0x20(INP), STATE3
2299 movups 0x30(INP), STATE4
2301 movups STATE1, (OUTP)
2302 movups STATE2, 0x10(OUTP)
2303 movups STATE3, 0x20(OUTP)
2304 movups STATE4, 0x30(OUTP)
# Single-block path: one 16-byte block in, one out.
2314 movups (INP), STATE1
2316 movups STATE1, (OUTP)
2329 ENDPROC(aesni_ecb_enc)
2332 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2335 ENTRY(aesni_ecb_dec)
# Four-block path: load 64 bytes from INP into STATE1..STATE4, then write
# the four states back to OUTP at the same offsets.
2355 movups (INP), STATE1
2356 movups 0x10(INP), STATE2
2357 movups 0x20(INP), STATE3
2358 movups 0x30(INP), STATE4
2360 movups STATE1, (OUTP)
2361 movups STATE2, 0x10(OUTP)
2362 movups STATE3, 0x20(OUTP)
2363 movups STATE4, 0x30(OUTP)
# Single-block path: one 16-byte block in, one out.
2373 movups (INP), STATE1
2375 movups STATE1, (OUTP)
2388 ENDPROC(aesni_ecb_dec)
2391 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2392 * size_t len, u8 *iv)
2394 ENTRY(aesni_cbc_enc)
# STATE starts out as the IV; each input block is loaded into IN and STATE
# is written to OUTP as the output block.
2409 movups (IVP), STATE # load iv as initial state
2412 movups (INP), IN # load input
2415 movups STATE, (OUTP) # store output
2430 ENDPROC(aesni_cbc_enc)
2433 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2434 * size_t len, u8 *iv)
2436 ENTRY(aesni_cbc_dec)
2449 jb .Lcbc_dec_just_ret # early exit to the return-only label
# Four-block path: input blocks at offsets 0x10/0x20/0x30 are loaded into
# IN2..IN4.
2459 movups 0x10(INP), IN2
2462 movups 0x20(INP), IN3
2464 movups 0x30(INP), IN4
# NOTE(review): here the blocks at 0x20/0x30 go into IN1/IN2 instead;
# presumably an alternative build variant with fewer registers - confirm.
2467 movups 0x20(INP), IN1
2469 movups 0x30(INP), IN2
2484 movups 0x10(INP), IN2
2487 movups STATE1, (OUTP) # write the four output blocks
2488 movups STATE2, 0x10(OUTP)
2489 movups STATE3, 0x20(OUTP)
2490 movups STATE4, 0x30(OUTP)
2504 movups STATE, (OUTP) # single-block path output
2521 ENDPROC(aesni_cbc_dec)
# 16-byte table of indices 15..0; used as a PSHUFB control it reverses the
# byte order of a register. NOTE(review): presumably the data behind
# .Lbswap_mask - confirm the label.
2526 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2529 * _aesni_inc_init: internal ABI
2530 * setup registers used by _aesni_inc
2534 * CTR: == IV, in little endian
2535 * TCTR_LOW: == lower qword of CTR
2536 * INC: == 1, in little endian
2537 * BSWAP_MASK == endian swapping mask
2541 movaps .Lbswap_mask, BSWAP_MASK # load the endian-swapping shuffle control
2543 PSHUFB_XMM BSWAP_MASK CTR # byte-swap CTR with BSWAP_MASK
2545 MOVQ_R64_XMM TCTR_LOW INC # INC = value currently held in TCTR_LOW
2546 MOVQ_R64_XMM CTR TCTR_LOW # TCTR_LOW = lower qword of CTR
2548 ENDPROC(_aesni_inc_init)
2551 * _aesni_inc: internal ABI
2552 * Increase IV by 1, IV is in big endian
2555 * CTR: == IV, in little endian
2556 * TCTR_LOW: == lower qword of CTR
2557 * INC: == 1, in little endian
2558 * BSWAP_MASK == endian swapping mask
2562 * CTR: == output IV, in little endian
2563 * TCTR_LOW: == lower qword of CTR
2575 PSHUFB_XMM BSWAP_MASK IV # byte-swap IV with BSWAP_MASK
2580 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2581 * size_t len, u8 *iv)
2583 ENTRY(aesni_ctr_enc)
2585 jb .Lctr_enc_just_ret # early exit to the return-only label
2588 call _aesni_inc_init # set up CTR, TCTR_LOW, INC and BSWAP_MASK
# Four-block path: input blocks at 0x10/0x20/0x30 are loaded into IN2..IN4
# and STATE1..STATE4 are written to OUTP at the matching offsets.
2598 movups 0x10(INP), IN2
2601 movups 0x20(INP), IN3
2604 movups 0x30(INP), IN4
2607 movups STATE1, (OUTP)
2609 movups STATE2, 0x10(OUTP)
2611 movups STATE3, 0x20(OUTP)
2613 movups STATE4, 0x30(OUTP)
2628 movups STATE, (OUTP) # single-block path output
2638 ENDPROC(aesni_ctr_enc)