1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Implement the AES algorithm using Intel AES-NI instructions.
5 * The white paper of AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 * Aidan O'Mahony (aidan.o.mahony@intel.com)
17 * Adrian Hoban <adrian.hoban@intel.com>
18 * James Guilford (james.guilford@intel.com)
19 * Gabriele Paoloni <gabriele.paoloni@intel.com>
20 * Tadeusz Struk (tadeusz.struk@intel.com)
21 * Wajdi Feghali (wajdi.k.feghali@intel.com)
22 * Copyright (c) 2010, Intel Corporation.
24 * Ported x86_64 version to x86:
25 * Author: Mathias Krause <minipli@googlemail.com>
28 #include <linux/linkage.h>
29 #include <asm/frame.h>
30 #include <asm/nospec-branch.h>
33 * The following macros are used to move an (un)aligned 16 byte value to/from
34 * an XMM register. This can be done for either FP or integer values: for FP
35 * use movaps (move aligned packed single), for integer use movdqa (move
36 * double quadword aligned). Since Nehalem (the original Core i7) there is no
37 * performance difference between the two, but movaps is one byte shorter, so
38 * that is the one we'll use for now (same for the unaligned forms).
45 # constants in mergeable sections, linker can reorder and merge
46 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
48 .Lgf128mul_x_ble_mask:
49 .octa 0x00000000000000010000000000000087
50 .section .rodata.cst16.POLY, "aM", @progbits, 16
52 POLY: .octa 0xC2000000000000000000000000000001
53 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
55 TWOONE: .octa 0x00000001000000000000000000000001
57 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
59 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
60 .section .rodata.cst16.MASK1, "aM", @progbits, 16
62 MASK1: .octa 0x0000000000000000ffffffffffffffff
63 .section .rodata.cst16.MASK2, "aM", @progbits, 16
65 MASK2: .octa 0xffffffffffffffff0000000000000000
66 .section .rodata.cst16.ONE, "aM", @progbits, 16
68 ONE: .octa 0x00000000000000000000000000000001
69 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
71 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
72 .section .rodata.cst16.dec, "aM", @progbits, 16
75 .section .rodata.cst16.enc, "aM", @progbits, 16
79 # order of these constants should not change.
80 # more specifically, ALL_F should follow SHIFT_MASK,
81 # and zero should follow ALL_F
82 .section .rodata, "a", @progbits
84 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
85 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
86 .octa 0x00000000000000000000000000000000
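#
# The three constants above form one contiguous 48-byte table: SHIFT_MASK
# (bytes 0..15), ALL_F (16 bytes of 0xff) and 16 bytes of zero.  Loading 16
# bytes at a variable offset into that table yields, with a single movdqu,
# either a pshufb control that right-shifts a register by a byte count or a
# byte mask that keeps only the low bytes; that is why the order must not
# change.  A minimal C model of the trick (illustrative only, not part of
# the build; the names are made up):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static const uint8_t tbl[48] = {
#		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
#		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
#		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
#		/* the remaining 16 bytes stay zero-initialized */
#	};
#
#	/* pshufb control that shifts a block right by (16 - n) bytes, 0 < n < 16 */
#	static void shift_mask(uint8_t out[16], unsigned int n)
#	{
#		memcpy(out, tbl + (16 - n), 16);
#	}
#
#	/* byte mask keeping the low n bytes and clearing the top (16 - n) */
#	static void keep_mask(uint8_t out[16], unsigned int n)
#	{
#		memcpy(out, tbl + 16 + (16 - n), 16);
#	}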
91 #define STACK_OFFSET 8*3
95 #define InLen (16*1)+8
96 #define PBlockEncKey 16*2
99 #define PBlockLen 16*5
100 #define HashKey 16*6 // store HashKey <<1 mod poly here
101 #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
102 #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
103 #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
104 #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
105 // bits of HashKey <<1 mod poly here
106 //(for Karatsuba purposes)
107 #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
108 // bits of HashKey^2 <<1 mod poly here
109 // (for Karatsuba purposes)
110 #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
111 // bits of HashKey^3 <<1 mod poly here
112 // (for Karatsuba purposes)
113 #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
114 // bits of HashKey^4 <<1 mod poly here
115 // (for Karatsuba purposes)
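#
# A rough C view of the hash-key slots laid out by the offsets above
# (illustrative only; the field names are made up, only the 16-byte slot
# layout matters).  The *_k values exist because Karatsuba multiplication
# needs (a1 ^ a0) for the middle product, so it is precomputed per power:
#
#	struct gcm_hash_keys {			/* lives inside gcm_context_data */
#		unsigned char hash_key[16];	/* H   <<1 mod poly, offset 16*6  */
#		unsigned char hash_key_2[16];	/* H^2 <<1 mod poly, offset 16*7  */
#		unsigned char hash_key_3[16];	/* H^3 <<1 mod poly, offset 16*8  */
#		unsigned char hash_key_4[16];	/* H^4 <<1 mod poly, offset 16*9  */
#		unsigned char hash_key_k[16];	/* hi64(H)   ^ lo64(H),   16*10   */
#		unsigned char hash_key_2_k[16];	/* hi64(H^2) ^ lo64(H^2), 16*11   */
#		unsigned char hash_key_3_k[16];	/* hi64(H^3) ^ lo64(H^3), 16*12   */
#		unsigned char hash_key_4_k[16];	/* hi64(H^4) ^ lo64(H^4), 16*13   */
#	};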
123 #define arg7 STACK_OFFSET+8(%rsp)
124 #define arg8 STACK_OFFSET+16(%rsp)
125 #define arg9 STACK_OFFSET+24(%rsp)
126 #define arg10 STACK_OFFSET+32(%rsp)
127 #define arg11 STACK_OFFSET+40(%rsp)
128 #define keysize 2*15*16(%arg1)
145 #define BSWAP_MASK %xmm10
149 #define GF128MUL_MASK %xmm10
182 # states of %xmm registers %xmm6:%xmm15 not saved
183 # all %xmm registers are clobbered
194 # Precompute hashkeys.
195 # Input: Hash subkey.
196 # Output: HashKeys stored in gcm_context_data. Only needs to be called
198 # clobbers r12, and tmp xmm registers.
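#
# In C terms the macro computes roughly the following (illustrative sketch
# only; gf128_mul() stands in for the GHASH_MUL macro defined further down
# and is not a real kernel helper):
#
#	typedef struct { unsigned long long lo, hi; } be128_t;
#
#	be128_t gf128_mul(be128_t a, be128_t b);	/* GHASH_MUL equivalent */
#
#	void precompute(be128_t h1, be128_t key[4], be128_t key_k[4])
#	{
#		/* h1 is HashKey<<1 mod poly, derived from the raw subkey */
#		be128_t h[4];
#		int i;
#
#		h[0] = h1;
#		for (i = 1; i < 4; i++)
#			h[i] = gf128_mul(h[i - 1], h1);	/* H^2, H^3, H^4 */
#		for (i = 0; i < 4; i++) {
#			key[i] = h[i];			/* HashKey_<i+1>   */
#			key_k[i].lo = h[i].lo ^ h[i].hi;/* HashKey_<i+1>_k */
#			key_k[i].hi = h[i].lo ^ h[i].hi;
#		}
#	}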
199 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
202 movdqa SHUF_MASK(%rip), \TMP2
205 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
217 pshufd $0x24, \TMP1, \TMP2
218 pcmpeqd TWOONE(%rip), \TMP2
219 pand POLY(%rip), \TMP2
221 movdqu \TMP3, HashKey(%arg2)
224 pshufd $78, \TMP3, \TMP1
226 movdqu \TMP1, HashKey_k(%arg2)
228 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
229 # TMP5 = HashKey^2<<1 (mod poly)
230 movdqu \TMP5, HashKey_2(%arg2)
231 # HashKey_2 = HashKey^2<<1 (mod poly)
232 pshufd $78, \TMP5, \TMP1
234 movdqu \TMP1, HashKey_2_k(%arg2)
236 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
237 # TMP5 = HashKey^3<<1 (mod poly)
238 movdqu \TMP5, HashKey_3(%arg2)
239 pshufd $78, \TMP5, \TMP1
241 movdqu \TMP1, HashKey_3_k(%arg2)
243 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
244 # TMP5 = HashKey^4<<1 (mod poly)
245 movdqu \TMP5, HashKey_4(%arg2)
246 pshufd $78, \TMP5, \TMP1
248 movdqu \TMP1, HashKey_4_k(%arg2)
251 # GCM_INIT initializes a gcm_context_data struct to prepare for encryption/decryption.
252 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
253 .macro GCM_INIT Iv SUBKEY AAD AADLEN
255 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
257 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
258 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
259 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
262 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
264 movdqa SHUF_MASK(%rip), %xmm2
266 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
268 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
269 movdqu HashKey(%arg2), %xmm13
271 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
275 # GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm_context_data
276 # struct has been initialized by GCM_INIT.
277 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
278 # Clobbers rax, r10-r13, and xmm0-xmm15
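#
# How an update of "len" bytes is carved up by the code below, as a C sketch
# (illustrative only; the carried-over partial block has already been
# consumed by PARTIAL_BLOCK by the time the split applies):
#
#	struct gcm_split {
#		unsigned long initial_blocks;	/* 0-3, done by INITIAL_BLOCKS_ENC_DEC */
#		unsigned long parallel_blocks;	/* multiple of 4, main unrolled loop   */
#		unsigned long tail_bytes;	/* 1-15, stashed as the partial block  */
#	};
#
#	static struct gcm_split gcm_split(unsigned long len)
#	{
#		struct gcm_split s;
#		unsigned long full = len / 16;		/* whole 16-byte blocks   */
#
#		s.initial_blocks  = full % 4;		/* aligns the 4-wide loop */
#		s.parallel_blocks = full - s.initial_blocks;
#		s.tail_bytes      = len % 16;		/* saved in PBlockLen     */
#		return s;
#	}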
279 .macro GCM_ENC_DEC operation
280 movdqu AadHash(%arg2), %xmm8
281 movdqu HashKey(%arg2), %xmm13
282 add %arg5, InLen(%arg2)
284 xor %r11d, %r11d # initialise the data pointer offset as zero
285 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
287 sub %r11, %arg5 # sub partial block data used
288 mov %arg5, %r13 # save the number of bytes
290 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
292 # Encrypt/Decrypt first few blocks
295 jz _initial_num_blocks_is_0_\@
297 jb _initial_num_blocks_is_1_\@
298 je _initial_num_blocks_is_2_\@
299 _initial_num_blocks_is_3_\@:
300 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
301 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
303 jmp _initial_blocks_\@
304 _initial_num_blocks_is_2_\@:
305 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
306 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
308 jmp _initial_blocks_\@
309 _initial_num_blocks_is_1_\@:
310 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
311 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
313 jmp _initial_blocks_\@
314 _initial_num_blocks_is_0_\@:
315 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
316 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
319 # Main loop - Encrypt/Decrypt remaining blocks
322 je _zero_cipher_left_\@
324 je _four_cipher_left_\@
326 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
327 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
332 _four_cipher_left_\@:
333 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
334 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
335 _zero_cipher_left_\@:
336 movdqu %xmm8, AadHash(%arg2)
337 movdqu %xmm0, CurCount(%arg2)
340 and $15, %r13 # %r13 = arg5 (mod 16)
341 je _multiple_of_16_bytes_\@
343 mov %r13, PBlockLen(%arg2)
345 # Handle the last <16 Byte block separately
346 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
347 movdqu %xmm0, CurCount(%arg2)
348 movdqa SHUF_MASK(%rip), %xmm10
351 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
352 movdqu %xmm0, PBlockEncKey(%arg2)
355 jge _large_enough_update_\@
357 lea (%arg4,%r11,1), %r10
359 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
362 _large_enough_update_\@:
366 # receive the last <16 Byte block
367 movdqu (%arg4, %r11, 1), %xmm1
372 lea SHIFT_MASK+16(%rip), %r12
373 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
374 # (r13 is the number of bytes in plaintext mod 16)
376 # get the appropriate shuffle mask
378 # shift right 16-r13 bytes
382 lea ALL_F+16(%rip), %r12
388 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
390 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
391 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
394 movdqa SHUF_MASK(%rip), %xmm10
399 movdqa SHUF_MASK(%rip), %xmm10
405 movdqu %xmm8, AadHash(%arg2)
407 # GHASH computation for the last <16 byte block
408 movdqa SHUF_MASK(%rip), %xmm10
409 # shuffle xmm0 back to output as ciphertext
416 jle _less_than_8_bytes_left_\@
417 mov %rax, (%arg3 , %r11, 1)
422 _less_than_8_bytes_left_\@:
423 mov %al, (%arg3, %r11, 1)
427 jne _less_than_8_bytes_left_\@
428 _multiple_of_16_bytes_\@:
431 # GCM_COMPLETE Finishes the tag computation, folding in any final partial block
432 # Output: Authentication Tag (AUTH_TAG)
433 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
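#
# In formula form, roughly what the macro below computes (with S the running
# GHASH over AAD and ciphertext carried in AadHash, and Y0 the original
# counter block saved in OrigIV):
#
#	S = GHASH_MUL(S xor (len(A) || len(C)), HashKey)    # lengths in bits
#	T = E(K, Y0) xor byteswap(S)
#	auth_tag = first AUTHTAGLEN bytes of T              # 16, 12 or 8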
434 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
435 movdqu AadHash(%arg2), %xmm8
436 movdqu HashKey(%arg2), %xmm13
438 mov PBlockLen(%arg2), %r12
443 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
446 mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
447 shl $3, %r12 # convert into number of bits
448 movd %r12d, %xmm15 # len(A) in %xmm15
449 mov InLen(%arg2), %r12
450 shl $3, %r12 # len(C) in bits (*8)
453 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
454 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
456 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
457 # final GHASH computation
458 movdqa SHUF_MASK(%rip), %xmm10
461 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
462 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
465 mov \AUTHTAG, %r10 # %r10 = authTag
466 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
498 jmp _return_T_done_\@
505 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
508 * Input: A and B (128-bits each, bit-reflected)
509 * Output: C = A*B*x mod poly, (i.e. >>1 )
510 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
511 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
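*
* Karatsuba split used below: with A = a1*x^64 + a0 and B = b1*x^64 + b0
* (64-bit halves), carry-less multiplication satisfies
*
*	A*B = a1*b1*x^128 + (a1*b1 ^ a0*b0 ^ (a1^a0)*(b1^b0))*x^64 + a0*b0
*
* so only three pclmulqdq operations are needed instead of four; the middle
* term is rebuilt from the three partial products with xors, and the 256-bit
* product is then reduced mod the polynomial in the two shift/xor phases
* below.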
514 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
516 pshufd $78, \GH, \TMP2
517 pshufd $78, \HK, \TMP3
518 pxor \GH, \TMP2 # TMP2 = a1+a0
519 pxor \HK, \TMP3 # TMP3 = b1+b0
520 pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1
521 pclmulqdq $0x00, \HK, \GH # GH = a0*b0
522 pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
524 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
526 pslldq $8, \TMP3 # left shift TMP3 2 DWs
527 psrldq $8, \TMP2 # right shift TMP2 2 DWs
529 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
531 # first phase of the reduction
535 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
536 # in order to perform
538 pslld $31, \TMP2 # packed left shift <<31
539 pslld $30, \TMP3 # packed left shift <<30
540 pslld $25, \TMP4 # packed left shift <<25
541 pxor \TMP3, \TMP2 # xor the shifted versions
544 psrldq $4, \TMP5 # right shift TMP5 1 DW
545 pslldq $12, \TMP2 # left shift TMP2 3 DWs
548 # second phase of the reduction
550 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
551 # in order to perform
555 psrld $1,\TMP2 # packed right shift >>1
556 psrld $2,\TMP3 # packed right shift >>2
557 psrld $7,\TMP4 # packed right shift >>7
558 pxor \TMP3,\TMP2 # xor the shifted versions
562 pxor \TMP1, \GH # result is in GH
565 # Reads DLEN bytes starting at DPTR and stores in XMMDst
566 # where 0 < DLEN < 16
567 # Clobbers %rax, DLEN and XMM1
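#
# A C model of the access pattern (illustrative only, not a kernel helper):
# it never touches memory past DPTR + DLEN, which is why the callers can use
# it on the tail of an input buffer.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void read_partial_block(uint8_t dst[16], const uint8_t *src,
#				       unsigned int len)	/* 0 < len < 16 */
#	{
#		memset(dst, 0, 16);
#		if (len >= 8) {
#			memcpy(dst, src, 8);	/* one whole qword first */
#			src += 8;
#			dst += 8;
#			len -= 8;
#		}
#		while (len--)
#			dst[len] = src[len];	/* remaining bytes one at a time */
#	}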
568 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
574 jz _done_read_partial_block_\@
578 mov 7(\DPTR, \DLEN, 1), %al
580 jnz _read_next_byte_\@
584 jmp _done_read_partial_block_\@
587 _read_next_byte_lt8_\@:
589 mov -1(\DPTR, \DLEN, 1), %al
591 jnz _read_next_byte_lt8_\@
593 _done_read_partial_block_\@:
596 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
597 # clobbers r10-11, xmm14
598 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
600 MOVADQ SHUF_MASK(%rip), %xmm14
601 mov \AAD, %r10 # %r10 = AAD
602 mov \AADLEN, %r11 # %r11 = aadLen
610 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
612 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
616 jge _get_AAD_blocks\@
620 /* read the last <16B of AAD */
625 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
626 pshufb %xmm14, \TMP7 # byte-reflect the AAD data
628 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
632 movdqu \TMP6, AadHash(%arg2)
635 # PARTIAL_BLOCK: Handles encryption/decryption and hashing of the partial
636 # blocks carried between update calls.
637 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
638 # Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
639 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
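#
# In C terms the partial-block handling amounts to this (illustrative only;
# struct and helper names are made up):
#
#	struct pblock {
#		unsigned char enc_key[16];	/* E(K, Yn), i.e. PBlockEncKey   */
#		unsigned int len;		/* bytes already used, PBlockLen */
#	};
#
#	/* consume up to (16 - len) new bytes against the stashed key stream;
#	 * returns how many input bytes were handled here */
#	static unsigned int partial_block(struct pblock *pb, unsigned char *out,
#					  const unsigned char *in, unsigned int len)
#	{
#		unsigned int n, i;
#
#		if (!pb->len)
#			return 0;			/* nothing carried over */
#		n = 16 - pb->len;
#		if (n > len)
#			n = len;
#		for (i = 0; i < n; i++)
#			out[i] = in[i] ^ pb->enc_key[pb->len + i];
#		pb->len = (pb->len + n) & 15;		/* 0 once the block is full */
#		/* when the block completes, the macro also folds it into AadHash */
#		return n;
#	}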
640 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
642 mov PBlockLen(%arg2), %r13
644 je _partial_block_done_\@ # Leave Macro if no partial blocks
645 # Read in input data without over reading
646 cmp $16, \PLAIN_CYPH_LEN
647 jl _fewer_than_16_bytes_\@
648 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
651 _fewer_than_16_bytes_\@:
652 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
653 mov \PLAIN_CYPH_LEN, %r12
654 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
656 mov PBlockLen(%arg2), %r13
658 _data_read_\@: # Finished reading in data
660 movdqu PBlockEncKey(%arg2), %xmm9
661 movdqu HashKey(%arg2), %xmm13
663 lea SHIFT_MASK(%rip), %r12
665 # adjust the shuffle mask pointer to be able to shift r13 bytes
666 # (r13 is the number of bytes already processed in the partial block)
668 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
669 pshufb %xmm2, %xmm9 # shift right r13 bytes
673 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
675 mov \PLAIN_CYPH_LEN, %r10
677 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
679 # Determine if the partial block is not being filled and
680 # shift mask accordingly
681 jge _no_extra_mask_1_\@
685 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
686 # get the appropriate mask to mask out bottom r13 bytes of xmm9
687 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
690 movdqa SHUF_MASK(%rip), %xmm10
693 pxor %xmm3, \AAD_HASH
696 jl _partial_incomplete_1_\@
698 # GHASH computation for the last <16 Byte block
699 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
702 mov %rax, PBlockLen(%arg2)
704 _partial_incomplete_1_\@:
705 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
707 movdqu \AAD_HASH, AadHash(%arg2)
709 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
711 mov \PLAIN_CYPH_LEN, %r10
713 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
715 # Determine if the partial block is not being filled and
716 # shift mask accordingly
717 jge _no_extra_mask_2_\@
721 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
722 # get the appropriate mask to mask out bottom r13 bytes of xmm9
725 movdqa SHUF_MASK(%rip), %xmm1
728 pxor %xmm9, \AAD_HASH
731 jl _partial_incomplete_2_\@
733 # GHASH computation for the last <16 Byte block
734 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
737 mov %rax, PBlockLen(%arg2)
739 _partial_incomplete_2_\@:
740 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
742 movdqu \AAD_HASH, AadHash(%arg2)
744 movdqa SHUF_MASK(%rip), %xmm10
745 # shuffle xmm9 back to output as ciphertext
749 # output encrypted Bytes
754 # Set r13 to be the number of bytes to write out
758 mov \PLAIN_CYPH_LEN, %r13
763 jle _less_than_8_bytes_left_\@
765 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
770 _less_than_8_bytes_left_\@:
771 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
775 jne _less_than_8_bytes_left_\@
776 _partial_block_done_\@:
777 .endm # PARTIAL_BLOCK
780 * if a = number of total plaintext bytes
782 * num_initial_blocks = b mod 4
783 * encrypt the initial num_initial_blocks blocks and apply ghash on
785 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
787 * arg1, %arg2, %arg3 are used as pointers only, not modified
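* worked example: a = 100 plaintext bytes gives b = 6 full blocks, so
* num_initial_blocks = 6 mod 4 = 2; the 4-block parallel loop then runs
* once (4 more blocks), and the remaining 100 - 96 = 4 bytes are handled
* as the final partial block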
791 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
792 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
793 MOVADQ SHUF_MASK(%rip), %xmm14
795 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
797 # start AES for num_initial_blocks blocks
799 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
801 .if (\i == 5) || (\i == 6) || (\i == 7)
803 MOVADQ ONE(%RIP),\TMP1
804 MOVADQ 0(%arg1),\TMP2
806 paddd \TMP1, \XMM0 # INCR Y0
808 movdqa \XMM0, %xmm\index
810 MOVADQ \XMM0, %xmm\index
812 pshufb %xmm14, %xmm\index # perform a 16 byte swap
813 pxor \TMP2, %xmm\index
817 shr $2,%eax # 128->4, 192->6, 256->8
818 add $5,%eax # 128->9, 192->11, 256->13
823 aesenc \TMP1, %xmm\index
827 jnz aes_loop_initial_\@
831 aesenclast \TMP1, %xmm\index # Last Round
834 movdqu (%arg4 , %r11, 1), \TMP1
835 pxor \TMP1, %xmm\index
836 movdqu %xmm\index, (%arg3 , %r11, 1)
837 # write back plaintext/ciphertext for num_initial_blocks
841 movdqa \TMP1, %xmm\index
843 pshufb %xmm14, %xmm\index
845 # prepare plaintext/ciphertext for GHASH computation
849 # apply GHASH on num_initial_blocks blocks
853 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
855 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
857 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
865 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
868 jl _initial_blocks_done\@
869 # no need for precomputed values
872 * Precomputations for HashKey parallel with encryption of first 4 blocks.
873 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
875 MOVADQ ONE(%RIP),\TMP1
876 paddd \TMP1, \XMM0 # INCR Y0
878 pshufb %xmm14, \XMM1 # perform a 16 byte swap
880 paddd \TMP1, \XMM0 # INCR Y0
882 pshufb %xmm14, \XMM2 # perform a 16 byte swap
884 paddd \TMP1, \XMM0 # INCR Y0
886 pshufb %xmm14, \XMM3 # perform a 16 byte swap
888 paddd \TMP1, \XMM0 # INCR Y0
890 pshufb %xmm14, \XMM4 # perform a 16 byte swap
892 MOVADQ 0(%arg1),\TMP1
897 .irpc index, 1234 # do 4 rounds
898 movaps 0x10*\index(%arg1), \TMP1
904 .irpc index, 56789 # do next 5 rounds
905 movaps 0x10*\index(%arg1), \TMP1
913 shr $2,%eax # 128->4, 192->6, 256->8
914 sub $4,%eax # 128->0, 192->2, 256->4
915 jz aes_loop_pre_done\@
920 aesenc \TMP2, %xmm\index
928 aesenclast \TMP2, \XMM1
929 aesenclast \TMP2, \XMM2
930 aesenclast \TMP2, \XMM3
931 aesenclast \TMP2, \XMM4
932 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
935 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
938 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
941 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
944 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
947 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
950 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
953 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
956 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
957 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
958 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
959 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
963 pshufb %xmm14, \XMM1 # perform a 16 byte swap
965 # combine GHASHed value with the corresponding ciphertext
966 pshufb %xmm14, \XMM2 # perform a 16 byte swap
967 pshufb %xmm14, \XMM3 # perform a 16 byte swap
968 pshufb %xmm14, \XMM4 # perform a 16 byte swap
970 _initial_blocks_done\@:
975 * encrypt 4 blocks at a time
976 * ghash the 4 previously encrypted ciphertext blocks
977 * arg1, %arg3, %arg4 are used as pointers only, not modified
978 * %r11 is the data offset value
980 .macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
981 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
988 movdqa SHUF_MASK(%rip), %xmm15
989 # multiply TMP5 * HashKey using Karatsuba
992 pshufd $78, \XMM5, \TMP6
994 paddd ONE(%rip), \XMM0 # INCR CNT
995 movdqu HashKey_4(%arg2), \TMP5
996 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
998 paddd ONE(%rip), \XMM0 # INCR CNT
1000 paddd ONE(%rip), \XMM0 # INCR CNT
1002 paddd ONE(%rip), \XMM0 # INCR CNT
1004 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1005 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1006 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1007 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1008 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1014 movdqu HashKey_4_k(%arg2), \TMP5
1015 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1016 movaps 0x10(%arg1), \TMP1
1017 aesenc \TMP1, \XMM1 # Round 1
1021 movaps 0x20(%arg1), \TMP1
1022 aesenc \TMP1, \XMM1 # Round 2
1027 pshufd $78, \XMM6, \TMP2
1029 movdqu HashKey_3(%arg2), \TMP5
1030 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1031 movaps 0x30(%arg1), \TMP3
1032 aesenc \TMP3, \XMM1 # Round 3
1036 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1037 movaps 0x40(%arg1), \TMP3
1038 aesenc \TMP3, \XMM1 # Round 4
1042 movdqu HashKey_3_k(%arg2), \TMP5
1043 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1044 movaps 0x50(%arg1), \TMP3
1045 aesenc \TMP3, \XMM1 # Round 5
1050 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1054 pshufd $78, \XMM7, \TMP2
1056 movdqu HashKey_2(%arg2), \TMP5
1058 # Multiply TMP5 * HashKey using Karatsuba
1060 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1061 movaps 0x60(%arg1), \TMP3
1062 aesenc \TMP3, \XMM1 # Round 6
1066 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1067 movaps 0x70(%arg1), \TMP3
1068 aesenc \TMP3, \XMM1 # Round 7
1072 movdqu HashKey_2_k(%arg2), \TMP5
1073 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1074 movaps 0x80(%arg1), \TMP3
1075 aesenc \TMP3, \XMM1 # Round 8
1080 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1084 # Multiply XMM8 * HashKey
1085 # XMM8 and TMP5 hold the values for the two operands
1088 pshufd $78, \XMM8, \TMP2
1090 movdqu HashKey(%arg2), \TMP5
1091 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1092 movaps 0x90(%arg1), \TMP3
1093 aesenc \TMP3, \XMM1 # Round 9
1097 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1098 lea 0xa0(%arg1),%r10
1100 shr $2,%eax # 128->4, 192->6, 256->8
1101 sub $4,%eax # 128->0, 192->2, 256->4
1102 jz aes_loop_par_enc_done\@
1107 aesenc \TMP3, %xmm\index
1111 jnz aes_loop_par_enc\@
1113 aes_loop_par_enc_done\@:
1114 MOVADQ (%r10), \TMP3
1115 aesenclast \TMP3, \XMM1 # Round 10
1116 aesenclast \TMP3, \XMM2
1117 aesenclast \TMP3, \XMM3
1118 aesenclast \TMP3, \XMM4
1119 movdqu HashKey_k(%arg2), \TMP5
1120 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1121 movdqu (%arg4,%r11,1), \TMP3
1122 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1123 movdqu 16(%arg4,%r11,1), \TMP3
1124 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1125 movdqu 32(%arg4,%r11,1), \TMP3
1126 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1127 movdqu 48(%arg4,%r11,1), \TMP3
1128 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1129 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1130 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1131 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1132 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1133 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1134 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1135 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1136 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1144 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1145 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1147 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1149 # first phase of reduction
1154 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1155 pslld $31, \TMP2 # packed left shift << 31
1156 pslld $30, \TMP3 # packed left shift << 30
1157 pslld $25, \TMP4 # packed left shift << 25
1158 pxor \TMP3, \TMP2 # xor the shifted versions
1161 psrldq $4, \TMP5 # right shift T5 1 DW
1162 pslldq $12, \TMP2 # left shift T2 3 DWs
1165 # second phase of reduction
1167 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1170 psrld $1, \TMP2 # packed right shift >>1
1171 psrld $2, \TMP3 # packed right shift >>2
1172 psrld $7, \TMP4 # packed right shift >>7
1173 pxor \TMP3,\TMP2 # xor the shifted versions
1177 pxor \TMP1, \XMM5 # result is in XMM5
1183 * decrypt 4 blocks at a time
1184 * ghash the 4 previously decrypted ciphertext blocks
1185 * arg1, %arg3, %arg4 are used as pointers only, not modified
1186 * %r11 is the data offset value
1188 .macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1189 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1196 movdqa SHUF_MASK(%rip), %xmm15
1197 # multiply TMP5 * HashKey using Karatsuba
1200 pshufd $78, \XMM5, \TMP6
1202 paddd ONE(%rip), \XMM0 # INCR CNT
1203 movdqu HashKey_4(%arg2), \TMP5
1204 pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1206 paddd ONE(%rip), \XMM0 # INCR CNT
1208 paddd ONE(%rip), \XMM0 # INCR CNT
1210 paddd ONE(%rip), \XMM0 # INCR CNT
1212 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1213 pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1214 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1215 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1216 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1222 movdqu HashKey_4_k(%arg2), \TMP5
1223 pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1224 movaps 0x10(%arg1), \TMP1
1225 aesenc \TMP1, \XMM1 # Round 1
1229 movaps 0x20(%arg1), \TMP1
1230 aesenc \TMP1, \XMM1 # Round 2
1235 pshufd $78, \XMM6, \TMP2
1237 movdqu HashKey_3(%arg2), \TMP5
1238 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1239 movaps 0x30(%arg1), \TMP3
1240 aesenc \TMP3, \XMM1 # Round 3
1244 pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1245 movaps 0x40(%arg1), \TMP3
1246 aesenc \TMP3, \XMM1 # Round 4
1250 movdqu HashKey_3_k(%arg2), \TMP5
1251 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1252 movaps 0x50(%arg1), \TMP3
1253 aesenc \TMP3, \XMM1 # Round 5
1258 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1262 pshufd $78, \XMM7, \TMP2
1264 movdqu HashKey_2(%arg2), \TMP5
1266 # Multiply TMP5 * HashKey using Karatsuba
1268 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1269 movaps 0x60(%arg1), \TMP3
1270 aesenc \TMP3, \XMM1 # Round 6
1274 pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1275 movaps 0x70(%arg1), \TMP3
1276 aesenc \TMP3, \XMM1 # Round 7
1280 movdqu HashKey_2_k(%arg2), \TMP5
1281 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1282 movaps 0x80(%arg1), \TMP3
1283 aesenc \TMP3, \XMM1 # Round 8
1288 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1292 # Multiply XMM8 * HashKey
1293 # XMM8 and TMP5 hold the values for the two operands
1296 pshufd $78, \XMM8, \TMP2
1298 movdqu HashKey(%arg2), \TMP5
1299 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1300 movaps 0x90(%arg1), \TMP3
1301 aesenc \TMP3, \XMM1 # Round 9
1305 pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1306 lea 0xa0(%arg1),%r10
1308 shr $2,%eax # 128->4, 192->6, 256->8
1309 sub $4,%eax # 128->0, 192->2, 256->4
1310 jz aes_loop_par_dec_done\@
1315 aesenc \TMP3, %xmm\index
1319 jnz aes_loop_par_dec\@
1321 aes_loop_par_dec_done\@:
1322 MOVADQ (%r10), \TMP3
1323 aesenclast \TMP3, \XMM1 # last round
1324 aesenclast \TMP3, \XMM2
1325 aesenclast \TMP3, \XMM3
1326 aesenclast \TMP3, \XMM4
1327 movdqu HashKey_k(%arg2), \TMP5
1328 pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1329 movdqu (%arg4,%r11,1), \TMP3
1330 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1331 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1333 movdqu 16(%arg4,%r11,1), \TMP3
1334 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1335 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1337 movdqu 32(%arg4,%r11,1), \TMP3
1338 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1339 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1341 movdqu 48(%arg4,%r11,1), \TMP3
1342 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1343 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1345 pshufb %xmm15, \XMM1 # perform a 16 byte swap
1346 pshufb %xmm15, \XMM2 # perform a 16 byte swap
1347 pshufb %xmm15, \XMM3 # perform a 16 byte swap
1348 pshufb %xmm15, \XMM4 # perform a 16 byte swap
1356 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1357 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1359 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1361 # first phase of reduction
1366 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1367 pslld $31, \TMP2 # packed left shift << 31
1368 pslld $30, \TMP3 # packed left shift << 30
1369 pslld $25, \TMP4 # packed left shift << 25
1370 pxor \TMP3, \TMP2 # xor the shifted versions
1373 psrldq $4, \TMP5 # right shift T5 1 DW
1374 pslldq $12, \TMP2 # left shift T2 3 DWs
1377 # second phase of reduction
1379 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1382 psrld $1, \TMP2 # packed right shift >>1
1383 psrld $2, \TMP3 # packed right shift >>2
1384 psrld $7, \TMP4 # packed right shift >>7
1385 pxor \TMP3,\TMP2 # xor the shifted versions
1389 pxor \TMP1, \XMM5 # result is in XMM5
1394 /* GHASH the last 4 ciphertext blocks. */
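# With the running hash already folded into XMM1 by the callers, and C1..C4
# the four buffered (byte-swapped) ciphertext blocks, the macro evaluates
#
#	S = C1*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H	(all products mod poly)
#
# which is the Horner form ((((C1*H ^ C2)*H ^ C3)*H ^ C4)*H) expanded so that
# the four multiplications are independent of each other and can use the
# precomputed powers of the hash key.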
1395 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1396 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1398 # Multiply TMP6 * HashKey (using Karatsuba)
1401 pshufd $78, \XMM1, \TMP2
1403 movdqu HashKey_4(%arg2), \TMP5
1404 pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1405 pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1406 movdqu HashKey_4_k(%arg2), \TMP4
1407 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1408 movdqa \XMM1, \XMMDst
1409 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1411 # Multiply TMP1 * HashKey (using Karatsuba)
1414 pshufd $78, \XMM2, \TMP2
1416 movdqu HashKey_3(%arg2), \TMP5
1417 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1418 pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1419 movdqu HashKey_3_k(%arg2), \TMP4
1420 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1424 # results accumulated in TMP6, XMMDst, XMM1
1426 # Multiply TMP1 * HashKey (using Karatsuba)
1429 pshufd $78, \XMM3, \TMP2
1431 movdqu HashKey_2(%arg2), \TMP5
1432 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1433 pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1434 movdqu HashKey_2_k(%arg2), \TMP4
1435 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1438 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1440 # Multiply TMP1 * HashKey (using Karatsuba)
1442 pshufd $78, \XMM4, \TMP2
1444 movdqu HashKey(%arg2), \TMP5
1445 pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1446 pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1447 movdqu HashKey_k(%arg2), \TMP4
1448 pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1454 # middle section of the temp results combined as in the Karatsuba algorithm
1456 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1457 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1460 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1461 # first phase of the reduction
1462 movdqa \XMMDst, \TMP2
1463 movdqa \XMMDst, \TMP3
1464 movdqa \XMMDst, \TMP4
1465 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1466 pslld $31, \TMP2 # packed left shifting << 31
1467 pslld $30, \TMP3 # packed left shifting << 30
1468 pslld $25, \TMP4 # packed left shifting << 25
1469 pxor \TMP3, \TMP2 # xor the shifted versions
1472 psrldq $4, \TMP7 # right shift TMP7 1 DW
1473 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1476 # second phase of the reduction
1477 movdqa \XMMDst, \TMP2
1478 # make 3 copies of XMMDst for doing 3 shift operations
1479 movdqa \XMMDst, \TMP3
1480 movdqa \XMMDst, \TMP4
1481 psrld $1, \TMP2 # packed right shift >> 1
1482 psrld $2, \TMP3 # packed right shift >> 2
1483 psrld $7, \TMP4 # packed right shift >> 7
1484 pxor \TMP3, \TMP2 # xor the shifted versions
1488 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1492 /* Encryption of a single block
1496 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1500 shr $2,%eax # 128->4, 192->6, 256->8
1501 add $5,%eax # 128->9, 192->11, 256->13
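# i.e. the aesenc loop below runs keysize/4 + 5 times (9 for AES-128, 11 for
# AES-192, 13 for AES-256); the final round is done separately with
# aesenclast, giving the usual 10/12/14 rounds in total.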
1502 lea 16(%arg1), %r10 # get first expanded key address
1512 aesenclast \TMP1,\XMM0
1514 /*****************************************************************************
1515 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1516 * struct gcm_context_data *data
1518 * u8 *out, // Plaintext output. Decrypt in-place is allowed.
1519 * const u8 *in, // Ciphertext input
1520 * u64 plaintext_len, // Length of data in bytes for decryption.
1521 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1522 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523 * // concatenated with 0x00000001. 16-byte aligned pointer.
1524 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525 * const u8 *aad, // Additional Authentication Data (AAD)
1526 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1528 * // given authentication tag and only return the plaintext if they match.
1529 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530 * // (most likely), 12 or 8.
1535 * keys are pre-expanded and aligned to 16 bytes. we are using the first
1536 * set of 11 keys in the data structure void *aes_ctx
1540 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542 * | Salt (From the SA) |
1543 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544 * | Initialization Vector |
1545 * | (This is the sequence number from IPSec header) |
1546 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1548 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1553 * AAD padded to 128 bits with 0
1554 * for example, assume AAD is a u32 vector
1556 * if AAD is 8 bytes:
1557 * AAD[3] = {A0, A1};
1558 * padded AAD in xmm register = {A1 A0 0 0}
1561 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1564 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 * | 32-bit Sequence Number (A0) |
1566 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570 * AAD Format with 32-bit Sequence Number
1572 * if AAD is 12 bytes:
1573 * AAD[3] = {A0, A1, A2};
1574 * padded AAD in xmm register = {A2 A1 A0 0}
1577 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1578 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1580 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1582 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583 * | 64-bit Extended Sequence Number {A1,A0} |
1585 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1587 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1589 * AAD Format with 64-bit Extended Sequence Number
1591 * poly = x^128 + x^127 + x^126 + x^121 + 1
1593 *****************************************************************************/
1594 SYM_FUNC_START(aesni_gcm_dec)
1597 GCM_INIT %arg6, arg7, arg8, arg9
1599 GCM_COMPLETE arg10, arg11
1602 SYM_FUNC_END(aesni_gcm_dec)
1605 /*****************************************************************************
1606 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1607 * struct gcm_context_data *data
1609 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1610 * const u8 *in, // Plaintext input
1611 * u64 plaintext_len, // Length of data in bytes for encryption.
1612 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1613 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614 * // concatenated with 0x00000001. 16-byte aligned pointer.
1615 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616 * const u8 *aad, // Additional Authentication Data (AAD)
1617 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618 * u8 *auth_tag, // Authenticated Tag output.
1619 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1625 * keys are pre-expanded and aligned to 16 bytes. we are using the
1626 * first set of 11 keys in the data structure void *aes_ctx
1631 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633 * | Salt (From the SA) |
1634 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635 * | Initialization Vector |
1636 * | (This is the sequence number from IPSec header) |
1637 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1639 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1644 * AAD padded to 128 bits with 0
1645 * for example, assume AAD is a u32 vector
1647 * if AAD is 8 bytes:
1648 * AAD[3] = {A0, A1};
1649 * padded AAD in xmm register = {A1 A0 0 0}
1652 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1655 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 * | 32-bit Sequence Number (A0) |
1657 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661 * AAD Format with 32-bit Sequence Number
1663 * if AAD is 12 bytes:
1664 * AAD[3] = {A0, A1, A2};
1665 * padded AAD in xmm register = {A2 A1 A0 0}
1668 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1671 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672 * | 64-bit Extended Sequence Number {A1,A0} |
1674 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1676 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1678 * AAD Format with 64-bit Extended Sequence Number
1680 * poly = x^128 + x^127 + x^126 + x^121 + 1
1681 ***************************************************************************/
1682 SYM_FUNC_START(aesni_gcm_enc)
1685 GCM_INIT %arg6, arg7, arg8, arg9
1688 GCM_COMPLETE arg10, arg11
1691 SYM_FUNC_END(aesni_gcm_enc)
1693 /*****************************************************************************
1694 * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1695 * struct gcm_context_data *data,
1697 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1698 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699 * // concatenated with 0x00000001. 16-byte aligned pointer.
1700 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701 * const u8 *aad, // Additional Authentication Data (AAD)
1702 * u64 aad_len) // Length of AAD in bytes.
1704 SYM_FUNC_START(aesni_gcm_init)
1706 GCM_INIT %arg3, %arg4,%arg5, %arg6
1709 SYM_FUNC_END(aesni_gcm_init)
1711 /*****************************************************************************
1712 * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1713 * struct gcm_context_data *data,
1715 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1716 * const u8 *in, // Plaintext input
1717 * u64 plaintext_len, // Length of data in bytes for encryption.
1719 SYM_FUNC_START(aesni_gcm_enc_update)
1724 SYM_FUNC_END(aesni_gcm_enc_update)
1726 /*****************************************************************************
1727 * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1728 * struct gcm_context_data *data,
1730 * u8 *out, // Plaintext output. Decrypt in-place is allowed.
1731 * const u8 *in, // Ciphertext input
1732 * u64 plaintext_len, // Length of data in bytes for decryption.
1734 SYM_FUNC_START(aesni_gcm_dec_update)
1739 SYM_FUNC_END(aesni_gcm_dec_update)
1741 /*****************************************************************************
1742 * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1743 * struct gcm_context_data *data,
1745 * u8 *auth_tag, // Authenticated Tag output.
1746 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1749 SYM_FUNC_START(aesni_gcm_finalize)
1751 GCM_COMPLETE %arg3 %arg4
1754 SYM_FUNC_END(aesni_gcm_finalize)
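# A typical caller-side sequence for the init/update/finalize interface above
# (illustrative C only; the in-kernel user additionally brackets these calls
# with kernel_fpu_begin()/kernel_fpu_end() and walks its scatterlists):
#
#	struct gcm_context_data data;
#
#	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
#	aesni_gcm_enc_update(aes_ctx, &data, out, in, len);	/* may repeat */
#	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);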
1759 SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1760 SYM_FUNC_START_LOCAL(_key_expansion_256a)
1761 pshufd $0b11111111, %xmm1, %xmm1
1762 shufps $0b00010000, %xmm0, %xmm4
1764 shufps $0b10001100, %xmm0, %xmm4
1767 movaps %xmm0, (TKEYP)
1770 SYM_FUNC_END(_key_expansion_256a)
1771 SYM_FUNC_END_ALIAS(_key_expansion_128)
1773 SYM_FUNC_START_LOCAL(_key_expansion_192a)
1774 pshufd $0b01010101, %xmm1, %xmm1
1775 shufps $0b00010000, %xmm0, %xmm4
1777 shufps $0b10001100, %xmm0, %xmm4
1784 pshufd $0b11111111, %xmm0, %xmm3
1789 shufps $0b01000100, %xmm0, %xmm6
1790 movaps %xmm6, (TKEYP)
1791 shufps $0b01001110, %xmm2, %xmm1
1792 movaps %xmm1, 0x10(TKEYP)
1795 SYM_FUNC_END(_key_expansion_192a)
1797 SYM_FUNC_START_LOCAL(_key_expansion_192b)
1798 pshufd $0b01010101, %xmm1, %xmm1
1799 shufps $0b00010000, %xmm0, %xmm4
1801 shufps $0b10001100, %xmm0, %xmm4
1807 pshufd $0b11111111, %xmm0, %xmm3
1811 movaps %xmm0, (TKEYP)
1814 SYM_FUNC_END(_key_expansion_192b)
1816 SYM_FUNC_START_LOCAL(_key_expansion_256b)
1817 pshufd $0b10101010, %xmm1, %xmm1
1818 shufps $0b00010000, %xmm2, %xmm4
1820 shufps $0b10001100, %xmm2, %xmm4
1823 movaps %xmm2, (TKEYP)
1826 SYM_FUNC_END(_key_expansion_256b)
1829 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1830 * unsigned int key_len)
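*
* The fixed offsets used below (240 and 480 = "keysize") assume the usual
* crypto_aes_ctx layout of two 240-byte round-key schedules followed by the
* key length, roughly:
*
*	struct crypto_aes_ctx {
*		u32 key_enc[60];	// encryption schedule, offset 0
*		u32 key_dec[60];	// decryption schedule, offset 240
*		u32 key_length;		// 16, 24 or 32, offset 480
*	};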
1832 SYM_FUNC_START(aesni_set_key)
1836 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1837 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1838 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1840 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1841 movaps %xmm0, (KEYP)
1842 lea 0x10(KEYP), TKEYP # key addr
1843 movl %edx, 480(KEYP)
1844 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1848 movups 0x10(UKEYP), %xmm2 # other user key
1849 movaps %xmm2, (TKEYP)
1851 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1852 call _key_expansion_256a
1853 aeskeygenassist $0x1, %xmm0, %xmm1
1854 call _key_expansion_256b
1855 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1856 call _key_expansion_256a
1857 aeskeygenassist $0x2, %xmm0, %xmm1
1858 call _key_expansion_256b
1859 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1860 call _key_expansion_256a
1861 aeskeygenassist $0x4, %xmm0, %xmm1
1862 call _key_expansion_256b
1863 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1864 call _key_expansion_256a
1865 aeskeygenassist $0x8, %xmm0, %xmm1
1866 call _key_expansion_256b
1867 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1868 call _key_expansion_256a
1869 aeskeygenassist $0x10, %xmm0, %xmm1
1870 call _key_expansion_256b
1871 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1872 call _key_expansion_256a
1873 aeskeygenassist $0x20, %xmm0, %xmm1
1874 call _key_expansion_256b
1875 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1876 call _key_expansion_256a
1879 movq 0x10(UKEYP), %xmm2 # other user key
1880 aeskeygenassist $0x1, %xmm2, %xmm1 # round 1
1881 call _key_expansion_192a
1882 aeskeygenassist $0x2, %xmm2, %xmm1 # round 2
1883 call _key_expansion_192b
1884 aeskeygenassist $0x4, %xmm2, %xmm1 # round 3
1885 call _key_expansion_192a
1886 aeskeygenassist $0x8, %xmm2, %xmm1 # round 4
1887 call _key_expansion_192b
1888 aeskeygenassist $0x10, %xmm2, %xmm1 # round 5
1889 call _key_expansion_192a
1890 aeskeygenassist $0x20, %xmm2, %xmm1 # round 6
1891 call _key_expansion_192b
1892 aeskeygenassist $0x40, %xmm2, %xmm1 # round 7
1893 call _key_expansion_192a
1894 aeskeygenassist $0x80, %xmm2, %xmm1 # round 8
1895 call _key_expansion_192b
1898 aeskeygenassist $0x1, %xmm0, %xmm1 # round 1
1899 call _key_expansion_128
1900 aeskeygenassist $0x2, %xmm0, %xmm1 # round 2
1901 call _key_expansion_128
1902 aeskeygenassist $0x4, %xmm0, %xmm1 # round 3
1903 call _key_expansion_128
1904 aeskeygenassist $0x8, %xmm0, %xmm1 # round 4
1905 call _key_expansion_128
1906 aeskeygenassist $0x10, %xmm0, %xmm1 # round 5
1907 call _key_expansion_128
1908 aeskeygenassist $0x20, %xmm0, %xmm1 # round 6
1909 call _key_expansion_128
1910 aeskeygenassist $0x40, %xmm0, %xmm1 # round 7
1911 call _key_expansion_128
1912 aeskeygenassist $0x80, %xmm0, %xmm1 # round 8
1913 call _key_expansion_128
1914 aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9
1915 call _key_expansion_128
1916 aeskeygenassist $0x36, %xmm0, %xmm1 # round 10
1917 call _key_expansion_128
1920 movaps (KEYP), %xmm0
1921 movaps (TKEYP), %xmm1
1922 movaps %xmm0, 240(TKEYP)
1923 movaps %xmm1, 240(KEYP)
1925 lea 240-16(TKEYP), UKEYP
1928 movaps (KEYP), %xmm0
1930 movaps %xmm1, (UKEYP)
1941 SYM_FUNC_END(aesni_set_key)
1944 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1946 SYM_FUNC_START(aesni_enc)
1951 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1952 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1953 movl (FRAME_OFFSET+20)(%esp), INP # src
1955 movl 480(KEYP), KLEN # key length
1956 movups (INP), STATE # input
1958 movups STATE, (OUTP) # output
1965 SYM_FUNC_END(aesni_enc)
1968 * _aesni_enc1: internal ABI
1970 * KEYP: key struct pointer
1972 * STATE: initial state (input)
1974 * STATE: final state (output)
1979 SYM_FUNC_START_LOCAL(_aesni_enc1)
1980 movaps (KEYP), KEY # key
1982 pxor KEY, STATE # round 0
1986 lea 0x20(TKEYP), TKEYP
1989 movaps -0x60(TKEYP), KEY
1991 movaps -0x50(TKEYP), KEY
1995 movaps -0x40(TKEYP), KEY
1997 movaps -0x30(TKEYP), KEY
2001 movaps -0x20(TKEYP), KEY
2003 movaps -0x10(TKEYP), KEY
2007 movaps 0x10(TKEYP), KEY
2009 movaps 0x20(TKEYP), KEY
2011 movaps 0x30(TKEYP), KEY
2013 movaps 0x40(TKEYP), KEY
2015 movaps 0x50(TKEYP), KEY
2017 movaps 0x60(TKEYP), KEY
2019 movaps 0x70(TKEYP), KEY
2020 aesenclast KEY, STATE
2022 SYM_FUNC_END(_aesni_enc1)
2025 * _aesni_enc4: internal ABI
2027 * KEYP: key struct pointer
2029 * STATE1: initial state (input)
2034 * STATE1: final state (output)
2042 SYM_FUNC_START_LOCAL(_aesni_enc4)
2043 movaps (KEYP), KEY # key
2045 pxor KEY, STATE1 # round 0
2052 lea 0x20(TKEYP), TKEYP
2055 movaps -0x60(TKEYP), KEY
2060 movaps -0x50(TKEYP), KEY
2067 movaps -0x40(TKEYP), KEY
2072 movaps -0x30(TKEYP), KEY
2079 movaps -0x20(TKEYP), KEY
2084 movaps -0x10(TKEYP), KEY
2094 movaps 0x10(TKEYP), KEY
2099 movaps 0x20(TKEYP), KEY
2104 movaps 0x30(TKEYP), KEY
2109 movaps 0x40(TKEYP), KEY
2114 movaps 0x50(TKEYP), KEY
2119 movaps 0x60(TKEYP), KEY
2124 movaps 0x70(TKEYP), KEY
2125 aesenclast KEY, STATE1 # last round
2126 aesenclast KEY, STATE2
2127 aesenclast KEY, STATE3
2128 aesenclast KEY, STATE4
2130 SYM_FUNC_END(_aesni_enc4)
2133 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2135 SYM_FUNC_START(aesni_dec)
2140 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2141 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2142 movl (FRAME_OFFSET+20)(%esp), INP # src
2144 mov 480(KEYP), KLEN # key length
2146 movups (INP), STATE # input
2148 movups STATE, (OUTP) #output
2155 SYM_FUNC_END(aesni_dec)
2158 * _aesni_dec1: internal ABI
2160 * KEYP: key struct pointer
2162 * STATE: initial state (input)
2164 * STATE: final state (output)
2169 SYM_FUNC_START_LOCAL(_aesni_dec1)
2170 movaps (KEYP), KEY # key
2172 pxor KEY, STATE # round 0
2176 lea 0x20(TKEYP), TKEYP
2179 movaps -0x60(TKEYP), KEY
2181 movaps -0x50(TKEYP), KEY
2185 movaps -0x40(TKEYP), KEY
2187 movaps -0x30(TKEYP), KEY
2191 movaps -0x20(TKEYP), KEY
2193 movaps -0x10(TKEYP), KEY
2197 movaps 0x10(TKEYP), KEY
2199 movaps 0x20(TKEYP), KEY
2201 movaps 0x30(TKEYP), KEY
2203 movaps 0x40(TKEYP), KEY
2205 movaps 0x50(TKEYP), KEY
2207 movaps 0x60(TKEYP), KEY
2209 movaps 0x70(TKEYP), KEY
2210 aesdeclast KEY, STATE
2212 SYM_FUNC_END(_aesni_dec1)
2215 * _aesni_dec4: internal ABI
2217 * KEYP: key struct pointer
2219 * STATE1: initial state (input)
2224 * STATE1: final state (output)
2232 SYM_FUNC_START_LOCAL(_aesni_dec4)
2233 movaps (KEYP), KEY # key
2235 pxor KEY, STATE1 # round 0
2242 lea 0x20(TKEYP), TKEYP
2245 movaps -0x60(TKEYP), KEY
2250 movaps -0x50(TKEYP), KEY
2257 movaps -0x40(TKEYP), KEY
2262 movaps -0x30(TKEYP), KEY
2269 movaps -0x20(TKEYP), KEY
2274 movaps -0x10(TKEYP), KEY
2284 movaps 0x10(TKEYP), KEY
2289 movaps 0x20(TKEYP), KEY
2294 movaps 0x30(TKEYP), KEY
2299 movaps 0x40(TKEYP), KEY
2304 movaps 0x50(TKEYP), KEY
2309 movaps 0x60(TKEYP), KEY
2314 movaps 0x70(TKEYP), KEY
2315 aesdeclast KEY, STATE1 # last round
2316 aesdeclast KEY, STATE2
2317 aesdeclast KEY, STATE3
2318 aesdeclast KEY, STATE4
2320 SYM_FUNC_END(_aesni_dec4)
2323 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2326 SYM_FUNC_START(aesni_ecb_enc)
2332 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2333 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2334 movl (FRAME_OFFSET+24)(%esp), INP # src
2335 movl (FRAME_OFFSET+28)(%esp), LEN # len
2337 test LEN, LEN # check length
2346 movups (INP), STATE1
2347 movups 0x10(INP), STATE2
2348 movups 0x20(INP), STATE3
2349 movups 0x30(INP), STATE4
2351 movups STATE1, (OUTP)
2352 movups STATE2, 0x10(OUTP)
2353 movups STATE3, 0x20(OUTP)
2354 movups STATE4, 0x30(OUTP)
2364 movups (INP), STATE1
2366 movups STATE1, (OUTP)
2380 SYM_FUNC_END(aesni_ecb_enc)
2383 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2386 SYM_FUNC_START(aesni_ecb_dec)
2392 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2393 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2394 movl (FRAME_OFFSET+24)(%esp), INP # src
2395 movl (FRAME_OFFSET+28)(%esp), LEN # len
2407 movups (INP), STATE1
2408 movups 0x10(INP), STATE2
2409 movups 0x20(INP), STATE3
2410 movups 0x30(INP), STATE4
2412 movups STATE1, (OUTP)
2413 movups STATE2, 0x10(OUTP)
2414 movups STATE3, 0x20(OUTP)
2415 movups STATE4, 0x30(OUTP)
2425 movups (INP), STATE1
2427 movups STATE1, (OUTP)
2441 SYM_FUNC_END(aesni_ecb_dec)
2444 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2445 * size_t len, u8 *iv)
2447 SYM_FUNC_START(aesni_cbc_enc)
2454 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2455 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2456 movl (FRAME_OFFSET+28)(%esp), INP # src
2457 movl (FRAME_OFFSET+32)(%esp), LEN # len
2458 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2463 movups (IVP), STATE # load iv as initial state
2466 movups (INP), IN # load input
2469 movups STATE, (OUTP) # store output
2485 SYM_FUNC_END(aesni_cbc_enc)
2488 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2489 * size_t len, u8 *iv)
2491 SYM_FUNC_START(aesni_cbc_dec)
2498 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2499 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2500 movl (FRAME_OFFSET+28)(%esp), INP # src
2501 movl (FRAME_OFFSET+32)(%esp), LEN # len
2502 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2505 jb .Lcbc_dec_just_ret
2515 movups 0x10(INP), IN2
2518 movups 0x20(INP), IN3
2520 movups 0x30(INP), IN4
2523 movups 0x20(INP), IN1
2525 movups 0x30(INP), IN2
2540 movups 0x10(INP), IN2
2543 movups STATE1, (OUTP)
2544 movups STATE2, 0x10(OUTP)
2545 movups STATE3, 0x20(OUTP)
2546 movups STATE4, 0x30(OUTP)
2560 movups STATE, (OUTP)
2578 SYM_FUNC_END(aesni_cbc_dec)
2581 .pushsection .rodata
2584 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2588 * _aesni_inc_init: internal ABI
2589 * setup registers used by _aesni_inc
2593 * CTR: == IV, in little endian
2594 * TCTR_LOW: == lower qword of CTR
2595 * INC: == 1, in little endian
2596 * BSWAP_MASK == endian swapping mask
2598 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2599 movaps .Lbswap_mask, BSWAP_MASK
2601 pshufb BSWAP_MASK, CTR
2606 SYM_FUNC_END(_aesni_inc_init)
2609 * _aesni_inc: internal ABI
2610 * Increase IV by 1, IV is in big endian
2613 * CTR: == IV, in little endian
2614 * TCTR_LOW: == lower qword of CTR
2615 * INC: == 1, in little endian
2616 * BSWAP_MASK == endian swapping mask
2620 * CTR: == output IV, in little endian
2621 * TCTR_LOW: == lower qword of CTR
2623 SYM_FUNC_START_LOCAL(_aesni_inc)
2632 pshufb BSWAP_MASK, IV
2634 SYM_FUNC_END(_aesni_inc)
2637 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2638 * size_t len, u8 *iv)
2640 SYM_FUNC_START(aesni_ctr_enc)
2643 jb .Lctr_enc_just_ret
2646 call _aesni_inc_init
2656 movups 0x10(INP), IN2
2659 movups 0x20(INP), IN3
2662 movups 0x30(INP), IN4
2665 movups STATE1, (OUTP)
2667 movups STATE2, 0x10(OUTP)
2669 movups STATE3, 0x20(OUTP)
2671 movups STATE4, 0x30(OUTP)
2686 movups STATE, (OUTP)
2697 SYM_FUNC_END(aesni_ctr_enc)
2700 * _aesni_gf128mul_x_ble: internal ABI
2701 * Multiply in GF(2^128) for XTS IVs
2704 * GF128MUL_MASK == mask with 0x87 and 0x01
2708 * CTR: == temporary value
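*
* A C model of the operation (illustrative only): multiply the 128-bit XTS
* tweak, stored little endian as two 64-bit words, by x in GF(2^128):
*
*	static void gf128mul_x_ble(u64 t[2])
*	{
*		u64 carry_hi = t[1] >> 63;	// bit shifted out of the top
*		u64 carry_lo = t[0] >> 63;	// bit carried into the high half
*
*		t[1] = (t[1] << 1) | carry_lo;
*		t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);
*	}
*
* the mask constant above packs both fixups (0x87 into the low word and 1
* into the high word) so the update needs no general-purpose registers.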
2710 #define _aesni_gf128mul_x_ble() \
2711 pshufd $0x13, IV, CTR; \
2714 pand GF128MUL_MASK, CTR; \
2718 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
2719 * const u8 *src, bool enc, le128 *iv)
2721 SYM_FUNC_START(aesni_xts_crypt8)
2726 leaq _aesni_enc4, %r11
2727 leaq _aesni_dec4, %rax
2731 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2738 movdqu 0x00(INP), INC
2740 movdqu IV, 0x00(OUTP)
2742 _aesni_gf128mul_x_ble()
2744 movdqu 0x10(INP), INC
2746 movdqu IV, 0x10(OUTP)
2748 _aesni_gf128mul_x_ble()
2750 movdqu 0x20(INP), INC
2752 movdqu IV, 0x20(OUTP)
2754 _aesni_gf128mul_x_ble()
2756 movdqu 0x30(INP), INC
2758 movdqu IV, 0x30(OUTP)
2762 movdqu 0x00(OUTP), INC
2764 movdqu STATE1, 0x00(OUTP)
2766 _aesni_gf128mul_x_ble()
2768 movdqu 0x40(INP), INC
2770 movdqu IV, 0x40(OUTP)
2772 movdqu 0x10(OUTP), INC
2774 movdqu STATE2, 0x10(OUTP)
2776 _aesni_gf128mul_x_ble()
2778 movdqu 0x50(INP), INC
2780 movdqu IV, 0x50(OUTP)
2782 movdqu 0x20(OUTP), INC
2784 movdqu STATE3, 0x20(OUTP)
2786 _aesni_gf128mul_x_ble()
2788 movdqu 0x60(INP), INC
2790 movdqu IV, 0x60(OUTP)
2792 movdqu 0x30(OUTP), INC
2794 movdqu STATE4, 0x30(OUTP)
2796 _aesni_gf128mul_x_ble()
2798 movdqu 0x70(INP), INC
2800 movdqu IV, 0x70(OUTP)
2802 _aesni_gf128mul_x_ble()
2807 movdqu 0x40(OUTP), INC
2809 movdqu STATE1, 0x40(OUTP)
2811 movdqu 0x50(OUTP), INC
2813 movdqu STATE2, 0x50(OUTP)
2815 movdqu 0x60(OUTP), INC
2817 movdqu STATE3, 0x60(OUTP)
2819 movdqu 0x70(OUTP), INC
2821 movdqu STATE4, 0x70(OUTP)
2825 SYM_FUNC_END(aesni_xts_crypt8)