2 * Implement AES algorithm in Intel AES-NI instructions.
4 * The white paper of AES-NI instructions can be downloaded from:
5 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7 * Copyright (C) 2008, Intel Corp.
8 * Author: Huang Ying <ying.huang@intel.com>
9 * Vinodh Gopal <vinodh.gopal@intel.com>
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
26 * This program is free software; you can redistribute it and/or modify
27 * it under the terms of the GNU General Public License as published by
28 * the Free Software Foundation; either version 2 of the License, or
29 * (at your option) any later version.
32 #include <linux/linkage.h>
34 #include <asm/frame.h>
35 #include <asm/nospec-branch.h>
38 * The following macros are used to move an (un)aligned 16 byte value to/from
39 * an XMM register. This can be done for either FP or integer values; for FP use
40 * movaps (move aligned packed single) or for integer use movdqa (move double quad
41 * aligned). It doesn't make a performance difference which instruction is used
42 * since Nehalem (original Core i7) was released. However, the movaps is a byte
43 * shorter, so that is the one we'll use for now. (same for unaligned).
50 # constants in mergeable sections, linker can reorder and merge
51 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
53 .Lgf128mul_x_ble_mask:
54 .octa 0x00000000000000010000000000000087
55 .section .rodata.cst16.POLY, "aM", @progbits, 16
57 POLY: .octa 0xC2000000000000000000000000000001
58 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
60 TWOONE: .octa 0x00000001000000000000000000000001
62 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
64 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
65 .section .rodata.cst16.MASK1, "aM", @progbits, 16
67 MASK1: .octa 0x0000000000000000ffffffffffffffff
68 .section .rodata.cst16.MASK2, "aM", @progbits, 16
70 MASK2: .octa 0xffffffffffffffff0000000000000000
71 .section .rodata.cst16.ONE, "aM", @progbits, 16
73 ONE: .octa 0x00000000000000000000000000000001
74 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
76 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
77 .section .rodata.cst16.dec, "aM", @progbits, 16
80 .section .rodata.cst16.enc, "aM", @progbits, 16
84 # The order of these constants must not change.
85 # More specifically, ALL_F should follow SHIFT_MASK,
86 # and zero should follow ALL_F
87 .section .rodata, "a", @progbits
89 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
90 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
91 .octa 0x00000000000000000000000000000000
96 #define STACK_OFFSET 8*3
100 #define InLen (16*1)+8
101 #define PBlockEncKey 16*2
103 #define CurCount 16*4
104 #define PBlockLen 16*5
105 #define HashKey 16*6 // store HashKey <<1 mod poly here
106 #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
107 #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
108 #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
109 #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
110 // bits of HashKey <<1 mod poly here
111 // (for Karatsuba purposes)
112 #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
113 // bits of HashKey^2 <<1 mod poly here
114 // (for Karatsuba purposes)
115 #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
116 // bits of HashKey^3 <<1 mod poly here
117 // (for Karatsuba purposes)
118 #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
119 // bits of HashKey^4 <<1 mod poly here
120 // (for Karatsuba purposes)
128 #define arg7 STACK_OFFSET+8(%rsp)
129 #define arg8 STACK_OFFSET+16(%rsp)
130 #define arg9 STACK_OFFSET+24(%rsp)
131 #define arg10 STACK_OFFSET+32(%rsp)
132 #define arg11 STACK_OFFSET+40(%rsp)
133 #define keysize 2*15*16(%arg1)
150 #define BSWAP_MASK %xmm10
154 #define GF128MUL_MASK %xmm10
187 # states of %xmm registers %xmm6:%xmm15 not saved
188 # all %xmm registers are clobbered
199 # Precompute hashkeys.
200 # Input: Hash subkey.
201 # Output: HashKeys stored in gcm_context_data. Only needs to be called
203 # clobbers r12, and tmp xmm registers.
204 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
207 movdqa SHUF_MASK(%rip), \TMP2
208 PSHUFB_XMM \TMP2, \TMP3
210 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
222 pshufd $0x24, \TMP1, \TMP2
223 pcmpeqd TWOONE(%rip), \TMP2
224 pand POLY(%rip), \TMP2
226 movdqu \TMP3, HashKey(%arg2)
229 pshufd $78, \TMP3, \TMP1
231 movdqu \TMP1, HashKey_k(%arg2)
233 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
234 # TMP5 = HashKey^2<<1 (mod poly)
235 movdqu \TMP5, HashKey_2(%arg2)
236 # HashKey_2 = HashKey^2<<1 (mod poly)
237 pshufd $78, \TMP5, \TMP1
239 movdqu \TMP1, HashKey_2_k(%arg2)
241 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
242 # TMP5 = HashKey^3<<1 (mod poly)
243 movdqu \TMP5, HashKey_3(%arg2)
244 pshufd $78, \TMP5, \TMP1
246 movdqu \TMP1, HashKey_3_k(%arg2)
248 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
249 # TMP5 = HashKey^4<<1 (mod poly)
250 movdqu \TMP5, HashKey_4(%arg2)
251 pshufd $78, \TMP5, \TMP1
253 movdqu \TMP1, HashKey_4_k(%arg2)
256 # GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
257 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
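#
# Roughly, GCM_INIT performs the following steps (an illustrative C-level
# sketch only, using the field names from the ctx_data comments below;
# precompute_hash_keys()/ghash() stand in for the PRECOMPUTE and
# CALC_AAD_HASH macros, and byte-reflection is folded into byte_swap()):
#
#	ctx->aad_length = aad_len;
#	ctx->in_length = 0;
#	ctx->partial_block_length = 0;
#	ctx->partial_block_enc_key = 0;
#	ctx->orig_IV = iv;                       /* J0, reused for the tag  */
#	ctx->current_counter = byte_swap(iv);    /* running counter block   */
#	precompute_hash_keys(ctx, subkey);       /* H^1..H^4 <<1 mod poly   */
#	ctx->aad_hash = ghash(H, aad, aad_len);  /* stored in AadHash       */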
258 .macro GCM_INIT Iv SUBKEY AAD AADLEN
260 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
262 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
263 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
264 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
267 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
269 movdqa SHUF_MASK(%rip), %xmm2
270 PSHUFB_XMM %xmm2, %xmm0
271 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
273 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
274 movdqu HashKey(%arg2), %xmm13
276 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
280 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
281 # struct has been initialized by GCM_INIT.
282 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
283 # Clobbers rax, r10-r13, and xmm0-xmm15
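#
# High-level flow (an illustrative sketch of the labels below, not a
# literal translation; \operation selects the ENC or DEC variants):
#
#	finish a partial block left over from a previous update (PARTIAL_BLOCK)
#	n = number_of_full_blocks mod 4
#	encrypt/decrypt and hash those n blocks       (_initial_num_blocks_is_*)
#	while (>= 4 full blocks remain)
#		do 4 blocks, hashing the previous 4   (GHASH_4_ENCRYPT_4_PARALLEL)
#	hash the final group of 4                     (GHASH_LAST_4)
#	handle any trailing <16 byte block            (_zero_cipher_left)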
284 .macro GCM_ENC_DEC operation
285 movdqu AadHash(%arg2), %xmm8
286 movdqu HashKey(%arg2), %xmm13
287 add %arg5, InLen(%arg2)
289 xor %r11d, %r11d # initialise the data pointer offset as zero
290 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
292 sub %r11, %arg5 # sub partial block data used
293 mov %arg5, %r13 # save the number of bytes
295 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
297 # Encrypt/Decrypt first few blocks
300 jz _initial_num_blocks_is_0_\@
302 jb _initial_num_blocks_is_1_\@
303 je _initial_num_blocks_is_2_\@
304 _initial_num_blocks_is_3_\@:
305 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
306 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
308 jmp _initial_blocks_\@
309 _initial_num_blocks_is_2_\@:
310 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
311 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
313 jmp _initial_blocks_\@
314 _initial_num_blocks_is_1_\@:
315 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
316 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
318 jmp _initial_blocks_\@
319 _initial_num_blocks_is_0_\@:
320 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
321 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
324 # Main loop - Encrypt/Decrypt remaining blocks
327 je _zero_cipher_left_\@
329 je _four_cipher_left_\@
331 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
332 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
337 _four_cipher_left_\@:
338 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
339 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
340 _zero_cipher_left_\@:
341 movdqu %xmm8, AadHash(%arg2)
342 movdqu %xmm0, CurCount(%arg2)
345 and $15, %r13 # %r13 = arg5 (mod 16)
346 je _multiple_of_16_bytes_\@
348 mov %r13, PBlockLen(%arg2)
350 # Handle the last <16 Byte block separately
351 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
352 movdqu %xmm0, CurCount(%arg2)
353 movdqa SHUF_MASK(%rip), %xmm10
354 PSHUFB_XMM %xmm10, %xmm0
356 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
357 movdqu %xmm0, PBlockEncKey(%arg2)
360 jge _large_enough_update_\@
362 lea (%arg4,%r11,1), %r10
364 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
367 _large_enough_update_\@:
371 # receive the last <16 Byte block
372 movdqu (%arg4, %r11, 1), %xmm1
377 lea SHIFT_MASK+16(%rip), %r12
378 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
379 # (r13 is the number of bytes in plaintext mod 16)
381 # get the appropriate shuffle mask
383 # shift right 16-r13 bytes
384 PSHUFB_XMM %xmm2, %xmm1
387 lea ALL_F+16(%rip), %r12
393 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
395 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
396 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
399 movdqa SHUF_MASK(%rip), %xmm10
400 PSHUFB_XMM %xmm10 ,%xmm2
404 movdqa SHUF_MASK(%rip), %xmm10
405 PSHUFB_XMM %xmm10,%xmm0
410 movdqu %xmm8, AadHash(%arg2)
412 # GHASH computation for the last <16 byte block
413 movdqa SHUF_MASK(%rip), %xmm10
414 # shuffle xmm0 back to output as ciphertext
415 PSHUFB_XMM %xmm10, %xmm0
419 MOVQ_R64_XMM %xmm0, %rax
421 jle _less_than_8_bytes_left_\@
422 mov %rax, (%arg3 , %r11, 1)
425 MOVQ_R64_XMM %xmm0, %rax
427 _less_than_8_bytes_left_\@:
428 mov %al, (%arg3, %r11, 1)
432 jne _less_than_8_bytes_left_\@
433 _multiple_of_16_bytes_\@:
436 # GCM_COMPLETE Finishes the tag computation, folding in any final partial block
437 # Output: Authentication Tag (AUTH_TAG)
438 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
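#
# The tag follows the usual GCM construction (sketch; S is the running GHASH
# value, Y0 the original counter block saved in OrigIV, t = auth_tag_len):
#
#	S = GHASH_H( AAD || C || len(AAD)_64 || len(C)_64 )
#	T = MSB_t( E(K, Y0) XOR S )
#
# i.e. the code below folds the bit lengths of the AAD and of the text into
# one final GHASH block and XORs the result with the encrypted J0.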
439 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
440 movdqu AadHash(%arg2), %xmm8
441 movdqu HashKey(%arg2), %xmm13
443 mov PBlockLen(%arg2), %r12
448 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
451 mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
452 shl $3, %r12 # convert into number of bits
453 movd %r12d, %xmm15 # len(A) in %xmm15
454 mov InLen(%arg2), %r12
455 shl $3, %r12 # len(C) in bits (*8)
456 MOVQ_R64_XMM %r12, %xmm1
458 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
459 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
461 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
462 # final GHASH computation
463 movdqa SHUF_MASK(%rip), %xmm10
464 PSHUFB_XMM %xmm10, %xmm8
466 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
467 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
470 mov \AUTHTAG, %r10 # %r10 = authTag
471 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
477 MOVQ_R64_XMM %xmm0, %rax
503 jmp _return_T_done_\@
510 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
513 * Input: A and B (128-bits each, bit-reflected)
514 * Output: C = A*B*x mod poly, (i.e. >>1 )
515 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
516 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
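#
# The multiply below is the usual Karatsuba split of one 128x128 carry-less
# product into three 64x64 PCLMULQDQs (sketch; "*" is carry-less multiply,
# "^" is XOR, and GH = a1:a0, HK = b1:b0 in 64-bit halves):
#
#	GH * HK = a1*b1 * x^128
#	        ^ ( (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0 ) * x^64
#	        ^ a0*b0
#
# after which the 256-bit result is folded back to 128 bits by the two-phase
# shift/XOR reduction modulo the polynomial given above.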
519 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
521 pshufd $78, \GH, \TMP2
522 pshufd $78, \HK, \TMP3
523 pxor \GH, \TMP2 # TMP2 = a1+a0
524 pxor \HK, \TMP3 # TMP3 = b1+b0
525 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
526 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
527 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
529 pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
531 pslldq $8, \TMP3 # left shift TMP3 2 DWs
532 psrldq $8, \TMP2 # right shift TMP2 2 DWs
534 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
536 # first phase of the reduction
540 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
541 # in order to perform
543 pslld $31, \TMP2 # packed left shift <<31
544 pslld $30, \TMP3 # packed left shift <<30
545 pslld $25, \TMP4 # packed left shift <<25
546 pxor \TMP3, \TMP2 # xor the shifted versions
549 psrldq $4, \TMP5 # right shift TMP5 1 DW
550 pslldq $12, \TMP2 # left shift TMP2 3 DWs
553 # second phase of the reduction
555 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
556 # in order to perform
560 psrld $1,\TMP2 # packed right shift >>1
561 psrld $2,\TMP3 # packed right shift >>2
562 psrld $7,\TMP4 # packed right shift >>7
563 pxor \TMP3,\TMP2 # xor the shifted versions
567 pxor \TMP1, \GH # result is in GH
570 # Reads DLEN bytes starting at DPTR and stores in XMMDst
571 # where 0 < DLEN < 16
572 # Clobbers %rax, DLEN and XMM1
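#
# Functionally this is the following C sketch (illustrative only): copy DLEN
# bytes into a zeroed 16-byte buffer without ever loading past DPTR + DLEN,
# then place that buffer in XMMDst:
#
#	unsigned char buf[16] = { 0 };
#	memcpy(buf, dptr, dlen);        /* 0 < dlen < 16, no over-read */
#	xmm_dst = load_128(buf);        /* little-endian 128-bit value */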
573 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
577 MOVQ_R64_XMM %rax, \XMMDst
579 jz _done_read_partial_block_\@
583 mov 7(\DPTR, \DLEN, 1), %al
585 jnz _read_next_byte_\@
586 MOVQ_R64_XMM %rax, \XMM1
589 jmp _done_read_partial_block_\@
592 _read_next_byte_lt8_\@:
594 mov -1(\DPTR, \DLEN, 1), %al
596 jnz _read_next_byte_lt8_\@
597 MOVQ_R64_XMM %rax, \XMMDst
598 _done_read_partial_block_\@:
601 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
602 # clobbers r10-11, xmm14
603 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
605 MOVADQ SHUF_MASK(%rip), %xmm14
606 mov \AAD, %r10 # %r10 = AAD
607 mov \AADLEN, %r11 # %r11 = aadLen
615 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
617 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
621 jge _get_AAD_blocks\@
625 /* read the last <16B of AAD */
630 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
631 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
633 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
637 movdqu \TMP6, AadHash(%arg2)
640 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
641 # between update calls.
642 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
643 # Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
644 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
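#
# Illustrative sketch of what is stitched together here (field names as in
# the context offsets above; not a literal translation of the code):
#
#	if (ctx->partial_block_length) {
#		n = min(plain_cyph_len, 16 - ctx->partial_block_length);
#		/* reuse the unused tail of E(K, Yn) kept in PBlockEncKey */
#		out[0..n) = in[0..n) ^ keystream[ctx->partial_block_length ..];
#		aad_hash ^= shifted_block;          /* folded into AadHash */
#		if (the block is now full)
#			aad_hash = GHASH_MUL(aad_hash, H), ctx->partial_block_length = 0;
#		else
#			ctx->partial_block_length += plain_cyph_len;
#		data_offset += n;
#	}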
645 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
647 mov PBlockLen(%arg2), %r13
649 je _partial_block_done_\@ # Leave Macro if no partial blocks
650 # Read in input data without over reading
651 cmp $16, \PLAIN_CYPH_LEN
652 jl _fewer_than_16_bytes_\@
653 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
656 _fewer_than_16_bytes_\@:
657 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
658 mov \PLAIN_CYPH_LEN, %r12
659 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
661 mov PBlockLen(%arg2), %r13
663 _data_read_\@: # Finished reading in data
665 movdqu PBlockEncKey(%arg2), %xmm9
666 movdqu HashKey(%arg2), %xmm13
668 lea SHIFT_MASK(%rip), %r12
670 # adjust the shuffle mask pointer to be able to shift r13 bytes
671 # (16-r13 is the number of bytes in plaintext mod 16)
673 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
674 PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
678 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
680 mov \PLAIN_CYPH_LEN, %r10
682 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
684 # Determine if the partial block is not being filled and
685 # shift mask accordingly
686 jge _no_extra_mask_1_\@
690 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
691 # get the appropriate mask to mask out bottom r13 bytes of xmm9
692 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
695 movdqa SHUF_MASK(%rip), %xmm10
696 PSHUFB_XMM %xmm10, %xmm3
697 PSHUFB_XMM %xmm2, %xmm3
698 pxor %xmm3, \AAD_HASH
701 jl _partial_incomplete_1_\@
703 # GHASH computation for the last <16 Byte block
704 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
707 mov %rax, PBlockLen(%arg2)
709 _partial_incomplete_1_\@:
710 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
712 movdqu \AAD_HASH, AadHash(%arg2)
714 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
716 mov \PLAIN_CYPH_LEN, %r10
718 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
720 # Determine if the partial block is not being filled and
721 # shift mask accordingly
722 jge _no_extra_mask_2_\@
726 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
727 # get the appropriate mask to mask out bottom r13 bytes of xmm9
730 movdqa SHUF_MASK(%rip), %xmm1
731 PSHUFB_XMM %xmm1, %xmm9
732 PSHUFB_XMM %xmm2, %xmm9
733 pxor %xmm9, \AAD_HASH
736 jl _partial_incomplete_2_\@
738 # GHASH computation for the last <16 Byte block
739 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
742 mov %rax, PBlockLen(%arg2)
744 _partial_incomplete_2_\@:
745 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
747 movdqu \AAD_HASH, AadHash(%arg2)
749 movdqa SHUF_MASK(%rip), %xmm10
750 # shuffle xmm9 back to output as ciphertext
751 PSHUFB_XMM %xmm10, %xmm9
752 PSHUFB_XMM %xmm2, %xmm9
754 # output encrypted Bytes
759 # Set r13 to be the number of bytes to write out
763 mov \PLAIN_CYPH_LEN, %r13
766 MOVQ_R64_XMM %xmm0, %rax
768 jle _less_than_8_bytes_left_\@
770 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
773 MOVQ_R64_XMM %xmm0, %rax
775 _less_than_8_bytes_left_\@:
776 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
780 jne _less_than_8_bytes_left_\@
781 _partial_block_done_\@:
782 .endm # PARTIAL_BLOCK
785 * if a = number of total plaintext bytes
787 * num_initial_blocks = b mod 4
788 * encrypt the initial num_initial_blocks blocks and apply ghash on
790 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
792 * arg1, %arg2, %arg3 are used as pointers only, not modified
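#
# Worked example (illustrative): for a 100-byte update there are 6 full
# blocks, so num_initial_blocks = 6 mod 4 = 2; those 2 blocks are encrypted
# and hashed here, the next 4 go through the 4-wide main loop, and the final
# 4 bytes are handled later as a partial block.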
796 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
797 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
798 MOVADQ SHUF_MASK(%rip), %xmm14
800 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
802 # start AES for num_initial_blocks blocks
804 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
806 .if (\i == 5) || (\i == 6) || (\i == 7)
808 MOVADQ ONE(%RIP),\TMP1
809 MOVADQ 0(%arg1),\TMP2
811 paddd \TMP1, \XMM0 # INCR Y0
813 movdqa \XMM0, %xmm\index
815 MOVADQ \XMM0, %xmm\index
817 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
818 pxor \TMP2, %xmm\index
822 shr $2,%eax # 128->4, 192->6, 256->8
823 add $5,%eax # 128->9, 192->11, 256->13
828 AESENC \TMP1, %xmm\index
832 jnz aes_loop_initial_\@
836 AESENCLAST \TMP1, %xmm\index # Last Round
839 movdqu (%arg4 , %r11, 1), \TMP1
840 pxor \TMP1, %xmm\index
841 movdqu %xmm\index, (%arg3 , %r11, 1)
842 # write back plaintext/ciphertext for num_initial_blocks
846 movdqa \TMP1, %xmm\index
848 PSHUFB_XMM %xmm14, %xmm\index
850 # prepare plaintext/ciphertext for GHASH computation
854 # apply GHASH on num_initial_blocks blocks
858 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
860 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
862 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
865 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
867 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
870 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
873 jl _initial_blocks_done\@
874 # no need for precomputed values
877 * Precomputations for HashKey parallel with encryption of first 4 blocks.
878 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
880 MOVADQ ONE(%RIP),\TMP1
881 paddd \TMP1, \XMM0 # INCR Y0
883 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
885 paddd \TMP1, \XMM0 # INCR Y0
887 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
889 paddd \TMP1, \XMM0 # INCR Y0
891 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
893 paddd \TMP1, \XMM0 # INCR Y0
895 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
897 MOVADQ 0(%arg1),\TMP1
902 .irpc index, 1234 # do 4 rounds
903 movaps 0x10*\index(%arg1), \TMP1
909 .irpc index, 56789 # do next 5 rounds
910 movaps 0x10*\index(%arg1), \TMP1
918 shr $2,%eax # 128->4, 192->6, 256->8
919 sub $4,%eax # 128->0, 192->2, 256->4
920 jz aes_loop_pre_done\@
925 AESENC \TMP2, %xmm\index
933 AESENCLAST \TMP2, \XMM1
934 AESENCLAST \TMP2, \XMM2
935 AESENCLAST \TMP2, \XMM3
936 AESENCLAST \TMP2, \XMM4
937 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
940 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
943 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
946 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
949 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
952 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
955 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
958 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
961 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
962 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
963 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
964 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
968 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
970 # combine GHASHed value with the corresponding ciphertext
971 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
972 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
973 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
975 _initial_blocks_done\@:
980 * encrypt 4 blocks at a time
981 * ghash the 4 previously encrypted ciphertext blocks
982 * arg1, %arg3, %arg4 are used as pointers only, not modified
983 * %r11 is the data offset value
985 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
986 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
993 movdqa SHUF_MASK(%rip), %xmm15
994 # multiply TMP5 * HashKey using Karatsuba
997 pshufd $78, \XMM5, \TMP6
999 paddd ONE(%rip), \XMM0 # INCR CNT
1000 movdqu HashKey_4(%arg2), \TMP5
1001 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1003 paddd ONE(%rip), \XMM0 # INCR CNT
1005 paddd ONE(%rip), \XMM0 # INCR CNT
1007 paddd ONE(%rip), \XMM0 # INCR CNT
1009 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1010 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1011 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1012 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1013 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1019 movdqu HashKey_4_k(%arg2), \TMP5
1020 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1021 movaps 0x10(%arg1), \TMP1
1022 AESENC \TMP1, \XMM1 # Round 1
1026 movaps 0x20(%arg1), \TMP1
1027 AESENC \TMP1, \XMM1 # Round 2
1032 pshufd $78, \XMM6, \TMP2
1034 movdqu HashKey_3(%arg2), \TMP5
1035 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1036 movaps 0x30(%arg1), \TMP3
1037 AESENC \TMP3, \XMM1 # Round 3
1041 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1042 movaps 0x40(%arg1), \TMP3
1043 AESENC \TMP3, \XMM1 # Round 4
1047 movdqu HashKey_3_k(%arg2), \TMP5
1048 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1049 movaps 0x50(%arg1), \TMP3
1050 AESENC \TMP3, \XMM1 # Round 5
1055 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1059 pshufd $78, \XMM7, \TMP2
1061 movdqu HashKey_2(%arg2), \TMP5
1063 # Multiply TMP5 * HashKey using Karatsuba
1065 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1066 movaps 0x60(%arg1), \TMP3
1067 AESENC \TMP3, \XMM1 # Round 6
1071 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1072 movaps 0x70(%arg1), \TMP3
1073 AESENC \TMP3, \XMM1 # Round 7
1077 movdqu HashKey_2_k(%arg2), \TMP5
1078 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1079 movaps 0x80(%arg1), \TMP3
1080 AESENC \TMP3, \XMM1 # Round 8
1085 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1089 # Multiply XMM8 * HashKey
1090 # XMM8 and TMP5 hold the values for the two operands
1093 pshufd $78, \XMM8, \TMP2
1095 movdqu HashKey(%arg2), \TMP5
1096 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1097 movaps 0x90(%arg1), \TMP3
1098 AESENC \TMP3, \XMM1 # Round 9
1102 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1103 lea 0xa0(%arg1),%r10
1105 shr $2,%eax # 128->4, 192->6, 256->8
1106 sub $4,%eax # 128->0, 192->2, 256->4
1107 jz aes_loop_par_enc_done\@
1112 AESENC \TMP3, %xmm\index
1116 jnz aes_loop_par_enc\@
1118 aes_loop_par_enc_done\@:
1119 MOVADQ (%r10), \TMP3
1120 AESENCLAST \TMP3, \XMM1 # Round 10
1121 AESENCLAST \TMP3, \XMM2
1122 AESENCLAST \TMP3, \XMM3
1123 AESENCLAST \TMP3, \XMM4
1124 movdqu HashKey_k(%arg2), \TMP5
1125 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1126 movdqu (%arg4,%r11,1), \TMP3
1127 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1128 movdqu 16(%arg4,%r11,1), \TMP3
1129 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1130 movdqu 32(%arg4,%r11,1), \TMP3
1131 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1132 movdqu 48(%arg4,%r11,1), \TMP3
1133 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1134 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1135 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1136 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1137 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1138 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1139 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1140 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1141 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1149 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1150 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1152 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1154 # first phase of reduction
1159 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1160 pslld $31, \TMP2 # packed left shift << 31
1161 pslld $30, \TMP3 # packed left shift << 30
1162 pslld $25, \TMP4 # packed left shift << 25
1163 pxor \TMP3, \TMP2 # xor the shifted versions
1166 psrldq $4, \TMP5 # right shift T5 1 DW
1167 pslldq $12, \TMP2 # left shift T2 3 DWs
1170 # second phase of reduction
1172 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1175 psrld $1, \TMP2 # packed right shift >>1
1176 psrld $2, \TMP3 # packed right shift >>2
1177 psrld $7, \TMP4 # packed right shift >>7
1178 pxor \TMP3,\TMP2 # xor the shifted versions
1182 pxor \TMP1, \XMM5 # result is in XMM5
1188 * decrypt 4 blocks at a time
1189 * ghash the 4 previously decrypted ciphertext blocks
1190 * arg1, %arg3, %arg4 are used as pointers only, not modified
1191 * %r11 is the data offset value
1193 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1194 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1201 movdqa SHUF_MASK(%rip), %xmm15
1202 # multiply TMP5 * HashKey using Karatsuba
1205 pshufd $78, \XMM5, \TMP6
1207 paddd ONE(%rip), \XMM0 # INCR CNT
1208 movdqu HashKey_4(%arg2), \TMP5
1209 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1211 paddd ONE(%rip), \XMM0 # INCR CNT
1213 paddd ONE(%rip), \XMM0 # INCR CNT
1215 paddd ONE(%rip), \XMM0 # INCR CNT
1217 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1218 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1219 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1220 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1221 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1227 movdqu HashKey_4_k(%arg2), \TMP5
1228 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1229 movaps 0x10(%arg1), \TMP1
1230 AESENC \TMP1, \XMM1 # Round 1
1234 movaps 0x20(%arg1), \TMP1
1235 AESENC \TMP1, \XMM1 # Round 2
1240 pshufd $78, \XMM6, \TMP2
1242 movdqu HashKey_3(%arg2), \TMP5
1243 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1244 movaps 0x30(%arg1), \TMP3
1245 AESENC \TMP3, \XMM1 # Round 3
1249 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1250 movaps 0x40(%arg1), \TMP3
1251 AESENC \TMP3, \XMM1 # Round 4
1255 movdqu HashKey_3_k(%arg2), \TMP5
1256 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1257 movaps 0x50(%arg1), \TMP3
1258 AESENC \TMP3, \XMM1 # Round 5
1263 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1267 pshufd $78, \XMM7, \TMP2
1269 movdqu HashKey_2(%arg2), \TMP5
1271 # Multiply TMP5 * HashKey using Karatsuba
1273 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1274 movaps 0x60(%arg1), \TMP3
1275 AESENC \TMP3, \XMM1 # Round 6
1279 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1280 movaps 0x70(%arg1), \TMP3
1281 AESENC \TMP3, \XMM1 # Round 7
1285 movdqu HashKey_2_k(%arg2), \TMP5
1286 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1287 movaps 0x80(%arg1), \TMP3
1288 AESENC \TMP3, \XMM1 # Round 8
1293 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1297 # Multiply XMM8 * HashKey
1298 # XMM8 and TMP5 hold the values for the two operands
1301 pshufd $78, \XMM8, \TMP2
1303 movdqu HashKey(%arg2), \TMP5
1304 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1305 movaps 0x90(%arg1), \TMP3
1306 AESENC \TMP3, \XMM1 # Round 9
1310 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1311 lea 0xa0(%arg1),%r10
1313 shr $2,%eax # 128->4, 192->6, 256->8
1314 sub $4,%eax # 128->0, 192->2, 256->4
1315 jz aes_loop_par_dec_done\@
1320 AESENC \TMP3, %xmm\index
1324 jnz aes_loop_par_dec\@
1326 aes_loop_par_dec_done\@:
1327 MOVADQ (%r10), \TMP3
1328 AESENCLAST \TMP3, \XMM1 # last round
1329 AESENCLAST \TMP3, \XMM2
1330 AESENCLAST \TMP3, \XMM3
1331 AESENCLAST \TMP3, \XMM4
1332 movdqu HashKey_k(%arg2), \TMP5
1333 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1334 movdqu (%arg4,%r11,1), \TMP3
1335 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1336 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1338 movdqu 16(%arg4,%r11,1), \TMP3
1339 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1340 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1342 movdqu 32(%arg4,%r11,1), \TMP3
1343 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1344 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1346 movdqu 48(%arg4,%r11,1), \TMP3
1347 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1348 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1350 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1351 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1352 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1353 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1361 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1362 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1364 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1366 # first phase of reduction
1371 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1372 pslld $31, \TMP2 # packed left shift << 31
1373 pslld $30, \TMP3 # packed left shift << 30
1374 pslld $25, \TMP4 # packed left shift << 25
1375 pxor \TMP3, \TMP2 # xor the shifted versions
1378 psrldq $4, \TMP5 # right shift T5 1 DW
1379 pslldq $12, \TMP2 # left shift T2 3 DWs
1382 # second phase of reduction
1384 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1387 psrld $1, \TMP2 # packed right shift >>1
1388 psrld $2, \TMP3 # packed right shift >>2
1389 psrld $7, \TMP4 # packed right shift >>7
1390 pxor \TMP3,\TMP2 # xor the shifted versions
1394 pxor \TMP1, \XMM5 # result is in XMM5
1399 /* GHASH the last 4 ciphertext blocks. */
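#
# With the cached powers of H this folds the last group in one pass (sketch;
# X is the running hash, C1..C4 the four ciphertext blocks, all products
# carry-less mod the GHASH polynomial):
#
#	X' = (X ^ C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
#
# which equals the sequential ((((X ^ C1)*H ^ C2)*H ^ C3)*H ^ C4)*H.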
1400 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1401 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1403 # Multiply TMP6 * HashKey (using Karatsuba)
1406 pshufd $78, \XMM1, \TMP2
1408 movdqu HashKey_4(%arg2), \TMP5
1409 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1410 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1411 movdqu HashKey_4_k(%arg2), \TMP4
1412 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1413 movdqa \XMM1, \XMMDst
1414 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1416 # Multiply TMP1 * HashKey (using Karatsuba)
1419 pshufd $78, \XMM2, \TMP2
1421 movdqu HashKey_3(%arg2), \TMP5
1422 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1423 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1424 movdqu HashKey_3_k(%arg2), \TMP4
1425 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1429 # results accumulated in TMP6, XMMDst, XMM1
1431 # Multiply TMP1 * HashKey (using Karatsuba)
1434 pshufd $78, \XMM3, \TMP2
1436 movdqu HashKey_2(%arg2), \TMP5
1437 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1438 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1439 movdqu HashKey_2_k(%arg2), \TMP4
1440 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1443 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1445 # Multiply TMP1 * HashKey (using Karatsuba)
1447 pshufd $78, \XMM4, \TMP2
1449 movdqu HashKey(%arg2), \TMP5
1450 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1451 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1452 movdqu HashKey_k(%arg2), \TMP4
1453 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1459 # middle section of the temp results combined as in the Karatsuba algorithm
1461 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1462 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1465 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1466 # first phase of the reduction
1467 movdqa \XMMDst, \TMP2
1468 movdqa \XMMDst, \TMP3
1469 movdqa \XMMDst, \TMP4
1470 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1471 pslld $31, \TMP2 # packed left shifting << 31
1472 pslld $30, \TMP3 # packed left shifting << 30
1473 pslld $25, \TMP4 # packed left shifting << 25
1474 pxor \TMP3, \TMP2 # xor the shifted versions
1477 psrldq $4, \TMP7 # right shift TMP7 1 DW
1478 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1481 # second phase of the reduction
1482 movdqa \XMMDst, \TMP2
1483 # make 3 copies of XMMDst for doing 3 shift operations
1484 movdqa \XMMDst, \TMP3
1485 movdqa \XMMDst, \TMP4
1486 psrld $1, \TMP2 # packed right shift >> 1
1487 psrld $2, \TMP3 # packed right shift >> 2
1488 psrld $7, \TMP4 # packed right shift >> 7
1489 pxor \TMP3, \TMP2 # xor the shifted versions
1493 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1497 /* Encryption of a single block
1501 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1505 shr $2,%eax # 128->4, 192->6, 256->8
1506 add $5,%eax # 128->9, 192->11, 256->13
1507 lea 16(%arg1), %r10 # get first expanded key address
1517 AESENCLAST \TMP1,\XMM0
1519 /*****************************************************************************
1520 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1521 * struct gcm_context_data *data
1523 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1524 * const u8 *in, // Ciphertext input
1525 * u64 plaintext_len, // Length of data in bytes for decryption.
1526 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1527 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1528 * // concatenated with 0x00000001. 16-byte aligned pointer.
1529 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1530 * const u8 *aad, // Additional Authentication Data (AAD)
1531 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1532 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1533 * // given authentication tag and only return the plaintext if they match.
1534 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1535 * // (most likely), 12 or 8.
1540 * Keys are pre-expanded and aligned to 16 bytes. We are using the first
1541 * set of 11 keys in the data structure void *aes_ctx
1545 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1546 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547 * | Salt (From the SA) |
1548 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 * | Initialization Vector |
1550 * | (This is the sequence number from IPSec header) |
1551 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1553 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1558 * AAD padded to 128 bits with 0
1559 * for example, assume AAD is a u32 vector
1561 * if AAD is 8 bytes:
1562 * AAD[2] = {A0, A1};
1563 * padded AAD in xmm register = {A1 A0 0 0}
1566 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1567 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570 * | 32-bit Sequence Number (A0) |
1571 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1573 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1575 * AAD Format with 32-bit Sequence Number
1577 * if AAD is 12 bytes:
1578 * AAD[3] = {A0, A1, A2};
1579 * padded AAD in xmm register = {A2 A1 A0 0}
1582 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1583 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1585 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1587 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588 * | 64-bit Extended Sequence Number {A1,A0} |
1590 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1592 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1594 * AAD Format with 64-bit Extended Sequence Number
1596 * poly = x^128 + x^127 + x^126 + x^121 + 1
1598 *****************************************************************************/
1599 ENTRY(aesni_gcm_dec)
1602 GCM_INIT %arg6, arg7, arg8, arg9
1604 GCM_COMPLETE arg10, arg11
1607 ENDPROC(aesni_gcm_dec)
1610 /*****************************************************************************
1611 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1612 * struct gcm_context_data *data
1614 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1615 * const u8 *in, // Plaintext input
1616 * u64 plaintext_len, // Length of data in bytes for encryption.
1617 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1618 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1619 * // concatenated with 0x00000001. 16-byte aligned pointer.
1620 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1621 * const u8 *aad, // Additional Authentication Data (AAD)
1622 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1623 * u8 *auth_tag, // Authenticated Tag output.
1624 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1630 * Keys are pre-expanded and aligned to 16 bytes. We are using the
1631 * first set of 11 keys in the data structure void *aes_ctx
1636 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1637 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638 * | Salt (From the SA) |
1639 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 * | Initialization Vector |
1641 * | (This is the sequence number from IPSec header) |
1642 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1644 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1649 * AAD padded to 128 bits with 0
1650 * for example, assume AAD is a u32 vector
1652 * if AAD is 8 bytes:
1653 * AAD[2] = {A0, A1};
1654 * padded AAD in xmm register = {A1 A0 0 0}
1657 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1658 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661 * | 32-bit Sequence Number (A0) |
1662 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1664 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1666 * AAD Format with 32-bit Sequence Number
1668 * if AAD is 12 bytes:
1669 * AAD[3] = {A0, A1, A2};
1670 * padded AAD in xmm register = {A2 A1 A0 0}
1673 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1674 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1676 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677 * | 64-bit Extended Sequence Number {A1,A0} |
1679 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1681 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1683 * AAD Format with 64-bit Extended Sequence Number
1685 * poly = x^128 + x^127 + x^126 + x^121 + 1
1686 ***************************************************************************/
1687 ENTRY(aesni_gcm_enc)
1690 GCM_INIT %arg6, arg7, arg8, arg9
1693 GCM_COMPLETE arg10, arg11
1696 ENDPROC(aesni_gcm_enc)
1698 /*****************************************************************************
1699 * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1700 * struct gcm_context_data *data,
1702 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1703 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1704 * // concatenated with 0x00000001. 16-byte aligned pointer.
1705 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1706 * const u8 *aad, // Additional Authentication Data (AAD)
1707 * u64 aad_len) // Length of AAD in bytes.
1709 ENTRY(aesni_gcm_init)
1711 GCM_INIT %arg3, %arg4,%arg5, %arg6
1714 ENDPROC(aesni_gcm_init)
1716 /*****************************************************************************
1717 * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1718 * struct gcm_context_data *data,
1720 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1721 * const u8 *in, // Plaintext input
1722 * u64 plaintext_len, // Length of data in bytes for encryption.
1724 ENTRY(aesni_gcm_enc_update)
1729 ENDPROC(aesni_gcm_enc_update)
1731 /*****************************************************************************
1732 * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1733 * struct gcm_context_data *data,
1735 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1736 * const u8 *in, // Plaintext input
1737 * u64 plaintext_len, // Length of data in bytes for encryption.
1739 ENTRY(aesni_gcm_dec_update)
1744 ENDPROC(aesni_gcm_dec_update)
1746 /*****************************************************************************
1747 * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1748 * struct gcm_context_data *data,
1750 * u8 *auth_tag, // Authenticated Tag output.
1751 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1754 ENTRY(aesni_gcm_finalize)
1756 GCM_COMPLETE %arg3 %arg4
1759 ENDPROC(aesni_gcm_finalize)
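#
# Taken together, the three streaming entry points above are used roughly as
# in the following C sketch (illustrative only; in the kernel they are
# reached through the glue code in aesni-intel_glue.c with the FPU context
# held, and the variable names here are made up for the example):
#
#	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
#	aesni_gcm_enc_update(aes_ctx, &data, out, in, plaintext_len);
#	/* ... possibly more _update calls on further chunks ... */
#	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);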
1766 _key_expansion_256a:
1767 pshufd $0b11111111, %xmm1, %xmm1
1768 shufps $0b00010000, %xmm0, %xmm4
1770 shufps $0b10001100, %xmm0, %xmm4
1773 movaps %xmm0, (TKEYP)
1776 ENDPROC(_key_expansion_128)
1777 ENDPROC(_key_expansion_256a)
1780 _key_expansion_192a:
1781 pshufd $0b01010101, %xmm1, %xmm1
1782 shufps $0b00010000, %xmm0, %xmm4
1784 shufps $0b10001100, %xmm0, %xmm4
1791 pshufd $0b11111111, %xmm0, %xmm3
1796 shufps $0b01000100, %xmm0, %xmm6
1797 movaps %xmm6, (TKEYP)
1798 shufps $0b01001110, %xmm2, %xmm1
1799 movaps %xmm1, 0x10(TKEYP)
1802 ENDPROC(_key_expansion_192a)
1805 _key_expansion_192b:
1806 pshufd $0b01010101, %xmm1, %xmm1
1807 shufps $0b00010000, %xmm0, %xmm4
1809 shufps $0b10001100, %xmm0, %xmm4
1815 pshufd $0b11111111, %xmm0, %xmm3
1819 movaps %xmm0, (TKEYP)
1822 ENDPROC(_key_expansion_192b)
1825 _key_expansion_256b:
1826 pshufd $0b10101010, %xmm1, %xmm1
1827 shufps $0b00010000, %xmm2, %xmm4
1829 shufps $0b10001100, %xmm2, %xmm4
1832 movaps %xmm2, (TKEYP)
1835 ENDPROC(_key_expansion_256b)
1838 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1839 * unsigned int key_len)
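#
# Layout assumed by the code below and by _aesni_enc*/_aesni_dec* (see the
# 480(KEYP) and 240-byte offsets used later; a rough sketch of how this file
# uses the crypto_aes_ctx buffer):
#
#	offset   0: expanded encryption round keys
#	offset 240: expanded decryption round keys
#	offset 480: key length in bytes (16, 24 or 32)
#
# aesni_set_key expands the user key in place and then derives the
# decryption schedule from the encryption one.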
1841 ENTRY(aesni_set_key)
1845 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1846 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1847 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1849 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1850 movaps %xmm0, (KEYP)
1851 lea 0x10(KEYP), TKEYP # key addr
1852 movl %edx, 480(KEYP)
1853 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1857 movups 0x10(UKEYP), %xmm2 # other user key
1858 movaps %xmm2, (TKEYP)
1860 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1861 call _key_expansion_256a
1862 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1863 call _key_expansion_256b
1864 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1865 call _key_expansion_256a
1866 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1867 call _key_expansion_256b
1868 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1869 call _key_expansion_256a
1870 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1871 call _key_expansion_256b
1872 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1873 call _key_expansion_256a
1874 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1875 call _key_expansion_256b
1876 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1877 call _key_expansion_256a
1878 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1879 call _key_expansion_256b
1880 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1881 call _key_expansion_256a
1882 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1883 call _key_expansion_256b
1884 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1885 call _key_expansion_256a
1888 movq 0x10(UKEYP), %xmm2 # other user key
1889 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1890 call _key_expansion_192a
1891 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1892 call _key_expansion_192b
1893 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1894 call _key_expansion_192a
1895 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1896 call _key_expansion_192b
1897 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1898 call _key_expansion_192a
1899 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1900 call _key_expansion_192b
1901 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1902 call _key_expansion_192a
1903 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1904 call _key_expansion_192b
1907 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1908 call _key_expansion_128
1909 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1910 call _key_expansion_128
1911 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1912 call _key_expansion_128
1913 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1914 call _key_expansion_128
1915 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1916 call _key_expansion_128
1917 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1918 call _key_expansion_128
1919 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1920 call _key_expansion_128
1921 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1922 call _key_expansion_128
1923 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1924 call _key_expansion_128
1925 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1926 call _key_expansion_128
1929 movaps (KEYP), %xmm0
1930 movaps (TKEYP), %xmm1
1931 movaps %xmm0, 240(TKEYP)
1932 movaps %xmm1, 240(KEYP)
1934 lea 240-16(TKEYP), UKEYP
1937 movaps (KEYP), %xmm0
1939 movaps %xmm1, (UKEYP)
1950 ENDPROC(aesni_set_key)
1953 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1960 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1961 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1962 movl (FRAME_OFFSET+20)(%esp), INP # src
1964 movl 480(KEYP), KLEN # key length
1965 movups (INP), STATE # input
1967 movups STATE, (OUTP) # output
1977 * _aesni_enc1: internal ABI
1979 * KEYP: key struct pointer
1981 * STATE: initial state (input)
1983 * STATE: final state (output)
1990 movaps (KEYP), KEY # key
1992 pxor KEY, STATE # round 0
1996 lea 0x20(TKEYP), TKEYP
1999 movaps -0x60(TKEYP), KEY
2001 movaps -0x50(TKEYP), KEY
2005 movaps -0x40(TKEYP), KEY
2007 movaps -0x30(TKEYP), KEY
2011 movaps -0x20(TKEYP), KEY
2013 movaps -0x10(TKEYP), KEY
2017 movaps 0x10(TKEYP), KEY
2019 movaps 0x20(TKEYP), KEY
2021 movaps 0x30(TKEYP), KEY
2023 movaps 0x40(TKEYP), KEY
2025 movaps 0x50(TKEYP), KEY
2027 movaps 0x60(TKEYP), KEY
2029 movaps 0x70(TKEYP), KEY
2030 AESENCLAST KEY STATE
2032 ENDPROC(_aesni_enc1)
2035 * _aesni_enc4: internal ABI
2037 * KEYP: key struct pointer
2039 * STATE1: initial state (input)
2044 * STATE1: final state (output)
2054 movaps (KEYP), KEY # key
2056 pxor KEY, STATE1 # round 0
2063 lea 0x20(TKEYP), TKEYP
2066 movaps -0x60(TKEYP), KEY
2071 movaps -0x50(TKEYP), KEY
2078 movaps -0x40(TKEYP), KEY
2083 movaps -0x30(TKEYP), KEY
2090 movaps -0x20(TKEYP), KEY
2095 movaps -0x10(TKEYP), KEY
2105 movaps 0x10(TKEYP), KEY
2110 movaps 0x20(TKEYP), KEY
2115 movaps 0x30(TKEYP), KEY
2120 movaps 0x40(TKEYP), KEY
2125 movaps 0x50(TKEYP), KEY
2130 movaps 0x60(TKEYP), KEY
2135 movaps 0x70(TKEYP), KEY
2136 AESENCLAST KEY STATE1 # last round
2137 AESENCLAST KEY STATE2
2138 AESENCLAST KEY STATE3
2139 AESENCLAST KEY STATE4
2141 ENDPROC(_aesni_enc4)
2144 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2151 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2152 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2153 movl (FRAME_OFFSET+20)(%esp), INP # src
2155 mov 480(KEYP), KLEN # key length
2157 movups (INP), STATE # input
2159 movups STATE, (OUTP) #output
2169 * _aesni_dec1: internal ABI
2171 * KEYP: key struct pointer
2173 * STATE: initial state (input)
2175 * STATE: final state (output)
2182 movaps (KEYP), KEY # key
2184 pxor KEY, STATE # round 0
2188 lea 0x20(TKEYP), TKEYP
2191 movaps -0x60(TKEYP), KEY
2193 movaps -0x50(TKEYP), KEY
2197 movaps -0x40(TKEYP), KEY
2199 movaps -0x30(TKEYP), KEY
2203 movaps -0x20(TKEYP), KEY
2205 movaps -0x10(TKEYP), KEY
2209 movaps 0x10(TKEYP), KEY
2211 movaps 0x20(TKEYP), KEY
2213 movaps 0x30(TKEYP), KEY
2215 movaps 0x40(TKEYP), KEY
2217 movaps 0x50(TKEYP), KEY
2219 movaps 0x60(TKEYP), KEY
2221 movaps 0x70(TKEYP), KEY
2222 AESDECLAST KEY STATE
2224 ENDPROC(_aesni_dec1)
2227 * _aesni_dec4: internal ABI
2229 * KEYP: key struct pointer
2231 * STATE1: initial state (input)
2236 * STATE1: final state (output)
2246 movaps (KEYP), KEY # key
2248 pxor KEY, STATE1 # round 0
2255 lea 0x20(TKEYP), TKEYP
2258 movaps -0x60(TKEYP), KEY
2263 movaps -0x50(TKEYP), KEY
2270 movaps -0x40(TKEYP), KEY
2275 movaps -0x30(TKEYP), KEY
2282 movaps -0x20(TKEYP), KEY
2287 movaps -0x10(TKEYP), KEY
2297 movaps 0x10(TKEYP), KEY
2302 movaps 0x20(TKEYP), KEY
2307 movaps 0x30(TKEYP), KEY
2312 movaps 0x40(TKEYP), KEY
2317 movaps 0x50(TKEYP), KEY
2322 movaps 0x60(TKEYP), KEY
2327 movaps 0x70(TKEYP), KEY
2328 AESDECLAST KEY STATE1 # last round
2329 AESDECLAST KEY STATE2
2330 AESDECLAST KEY STATE3
2331 AESDECLAST KEY STATE4
2333 ENDPROC(_aesni_dec4)
2336 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2339 ENTRY(aesni_ecb_enc)
2345 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2346 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2347 movl (FRAME_OFFSET+24)(%esp), INP # src
2348 movl (FRAME_OFFSET+28)(%esp), LEN # len
2350 test LEN, LEN # check length
2359 movups (INP), STATE1
2360 movups 0x10(INP), STATE2
2361 movups 0x20(INP), STATE3
2362 movups 0x30(INP), STATE4
2364 movups STATE1, (OUTP)
2365 movups STATE2, 0x10(OUTP)
2366 movups STATE3, 0x20(OUTP)
2367 movups STATE4, 0x30(OUTP)
2377 movups (INP), STATE1
2379 movups STATE1, (OUTP)
2393 ENDPROC(aesni_ecb_enc)
2396 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2399 ENTRY(aesni_ecb_dec)
2405 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2406 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2407 movl (FRAME_OFFSET+24)(%esp), INP # src
2408 movl (FRAME_OFFSET+28)(%esp), LEN # len
2420 movups (INP), STATE1
2421 movups 0x10(INP), STATE2
2422 movups 0x20(INP), STATE3
2423 movups 0x30(INP), STATE4
2425 movups STATE1, (OUTP)
2426 movups STATE2, 0x10(OUTP)
2427 movups STATE3, 0x20(OUTP)
2428 movups STATE4, 0x30(OUTP)
2438 movups (INP), STATE1
2440 movups STATE1, (OUTP)
2454 ENDPROC(aesni_ecb_dec)
2457 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2458 * size_t len, u8 *iv)
2460 ENTRY(aesni_cbc_enc)
2467 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2468 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2469 movl (FRAME_OFFSET+28)(%esp), INP # src
2470 movl (FRAME_OFFSET+32)(%esp), LEN # len
2471 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2476 movups (IVP), STATE # load iv as initial state
2479 movups (INP), IN # load input
2482 movups STATE, (OUTP) # store output
2498 ENDPROC(aesni_cbc_enc)
2501 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2502 * size_t len, u8 *iv)
2504 ENTRY(aesni_cbc_dec)
2511 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2512 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2513 movl (FRAME_OFFSET+28)(%esp), INP # src
2514 movl (FRAME_OFFSET+32)(%esp), LEN # len
2515 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2518 jb .Lcbc_dec_just_ret
2528 movups 0x10(INP), IN2
2531 movups 0x20(INP), IN3
2533 movups 0x30(INP), IN4
2536 movups 0x20(INP), IN1
2538 movups 0x30(INP), IN2
2553 movups 0x10(INP), IN2
2556 movups STATE1, (OUTP)
2557 movups STATE2, 0x10(OUTP)
2558 movups STATE3, 0x20(OUTP)
2559 movups STATE4, 0x30(OUTP)
2573 movups STATE, (OUTP)
2591 ENDPROC(aesni_cbc_dec)
2594 .pushsection .rodata
2597 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2601 * _aesni_inc_init: internal ABI
2602 * setup registers used by _aesni_inc
2606 * CTR: == IV, in little endian
2607 * TCTR_LOW: == lower qword of CTR
2608 * INC: == 1, in little endian
2609 * BSWAP_MASK == endian swapping mask
2613 movaps .Lbswap_mask, BSWAP_MASK
2615 PSHUFB_XMM BSWAP_MASK CTR
2617 MOVQ_R64_XMM TCTR_LOW INC
2618 MOVQ_R64_XMM CTR TCTR_LOW
2620 ENDPROC(_aesni_inc_init)
2623 * _aesni_inc: internal ABI
2624 * Increase IV by 1, IV is in big endian
2627 * CTR: == IV, in little endian
2628 * TCTR_LOW: == lower qword of CTR
2629 * INC: == 1, in little endian
2630 * BSWAP_MASK == endian swapping mask
2634 * CTR: == output IV, in little endian
2635 * TCTR_LOW: == lower qword of CTR
2647 PSHUFB_XMM BSWAP_MASK IV
2652 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2653 * size_t len, u8 *iv)
2655 ENTRY(aesni_ctr_enc)
2658 jb .Lctr_enc_just_ret
2661 call _aesni_inc_init
2671 movups 0x10(INP), IN2
2674 movups 0x20(INP), IN3
2677 movups 0x30(INP), IN4
2680 movups STATE1, (OUTP)
2682 movups STATE2, 0x10(OUTP)
2684 movups STATE3, 0x20(OUTP)
2686 movups STATE4, 0x30(OUTP)
2701 movups STATE, (OUTP)
2712 ENDPROC(aesni_ctr_enc)
2715 * _aesni_gf128mul_x_ble: internal ABI
2716 * Multiply in GF(2^128) for XTS IVs
2719 * GF128MUL_MASK == mask with 0x87 and 0x01
2723 * CTR: == temporary value
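#
# The macro below advances the XTS tweak by multiplying it by x in GF(2^128)
# (little-endian "ble" convention); an equivalent byte-wise C sketch,
# illustrative only:
#
#	int carry = t[15] >> 7;                  /* top bit of the tweak */
#	for (i = 15; i > 0; i--)
#		t[i] = (t[i] << 1) | (t[i - 1] >> 7);
#	t[0] <<= 1;
#	if (carry)
#		t[0] ^= 0x87;    /* reduce by x^128 + x^7 + x^2 + x + 1 */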
2725 #define _aesni_gf128mul_x_ble() \
2726 pshufd $0x13, IV, CTR; \
2729 pand GF128MUL_MASK, CTR; \
2733 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2736 ENTRY(aesni_xts_crypt8)
2741 leaq _aesni_enc4, %r11
2742 leaq _aesni_dec4, %rax
2746 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2753 movdqu 0x00(INP), INC
2755 movdqu IV, 0x00(OUTP)
2757 _aesni_gf128mul_x_ble()
2759 movdqu 0x10(INP), INC
2761 movdqu IV, 0x10(OUTP)
2763 _aesni_gf128mul_x_ble()
2765 movdqu 0x20(INP), INC
2767 movdqu IV, 0x20(OUTP)
2769 _aesni_gf128mul_x_ble()
2771 movdqu 0x30(INP), INC
2773 movdqu IV, 0x30(OUTP)
2777 movdqu 0x00(OUTP), INC
2779 movdqu STATE1, 0x00(OUTP)
2781 _aesni_gf128mul_x_ble()
2783 movdqu 0x40(INP), INC
2785 movdqu IV, 0x40(OUTP)
2787 movdqu 0x10(OUTP), INC
2789 movdqu STATE2, 0x10(OUTP)
2791 _aesni_gf128mul_x_ble()
2793 movdqu 0x50(INP), INC
2795 movdqu IV, 0x50(OUTP)
2797 movdqu 0x20(OUTP), INC
2799 movdqu STATE3, 0x20(OUTP)
2801 _aesni_gf128mul_x_ble()
2803 movdqu 0x60(INP), INC
2805 movdqu IV, 0x60(OUTP)
2807 movdqu 0x30(OUTP), INC
2809 movdqu STATE4, 0x30(OUTP)
2811 _aesni_gf128mul_x_ble()
2813 movdqu 0x70(INP), INC
2815 movdqu IV, 0x70(OUTP)
2817 _aesni_gf128mul_x_ble()
2822 movdqu 0x40(OUTP), INC
2824 movdqu STATE1, 0x40(OUTP)
2826 movdqu 0x50(OUTP), INC
2828 movdqu STATE2, 0x50(OUTP)
2830 movdqu 0x60(OUTP), INC
2832 movdqu STATE3, 0x60(OUTP)
2834 movdqu 0x70(OUTP), INC
2836 movdqu STATE4, 0x70(OUTP)
2840 ENDPROC(aesni_xts_crypt8)