1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * Implement the AES algorithm using the Intel AES-NI instructions.
5 * The white paper on the AES-NI instructions can be downloaded from:
6 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
8 * Copyright (C) 2008, Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal <vinodh.gopal@intel.com>
13 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14 * interface for 64-bit kernels.
15 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16 * Aidan O'Mahony (aidan.o.mahony@intel.com)
17 * Adrian Hoban <adrian.hoban@intel.com>
18 * James Guilford (james.guilford@intel.com)
19 * Gabriele Paoloni <gabriele.paoloni@intel.com>
20 * Tadeusz Struk (tadeusz.struk@intel.com)
21 * Wajdi Feghali (wajdi.k.feghali@intel.com)
22 * Copyright (c) 2010, Intel Corporation.
24 * Ported x86_64 version to x86:
25 * Author: Mathias Krause <minipli@googlemail.com>
28 #include <linux/linkage.h>
30 #include <asm/frame.h>
31 #include <asm/nospec-branch.h>
34 * The following macros are used to move an (un)aligned 16 byte value to/from
35 * an XMM register. This can be done for either FP or integer values: for FP values use
36 * movaps (move aligned packed single) and for integer values movdqa (move double quad
37 * aligned). It doesn't make a performance difference which instruction is used
38 * since Nehalem (original Core i7) was released. However, the movaps is a byte
39 * shorter, so that is the one we'll use for now. (same for unaligned).
46 # constants in mergeable sections, linker can reorder and merge
47 .section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
49 .Lgf128mul_x_ble_mask:
50 .octa 0x00000000000000010000000000000087
51 .section .rodata.cst16.POLY, "aM", @progbits, 16
53 POLY: .octa 0xC2000000000000000000000000000001
54 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
56 TWOONE: .octa 0x00000001000000000000000000000001
58 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
60 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
61 .section .rodata.cst16.MASK1, "aM", @progbits, 16
63 MASK1: .octa 0x0000000000000000ffffffffffffffff
64 .section .rodata.cst16.MASK2, "aM", @progbits, 16
66 MASK2: .octa 0xffffffffffffffff0000000000000000
67 .section .rodata.cst16.ONE, "aM", @progbits, 16
69 ONE: .octa 0x00000000000000000000000000000001
70 .section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
72 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
73 .section .rodata.cst16.dec, "aM", @progbits, 16
76 .section .rodata.cst16.enc, "aM", @progbits, 16
80 # order of these constants should not change.
81 # more specifically, ALL_F should follow SHIFT_MASK,
82 # and zero should follow ALL_F
83 .section .rodata, "a", @progbits
85 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
86 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
87 .octa 0x00000000000000000000000000000000
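#
# A rough C model of why that ordering matters (illustrative only): the
# partial-block paths below index backwards from these labels to build a
# byte-granular AND mask without branching, e.g.
#
#	/* 16 bytes of 0xff (ALL_F) immediately followed by 16 bytes of 0x00 */
#	static const unsigned char all_f_and_zero[32] = { /* ... */ };
#	/* mask that keeps the low r13 bytes of a block and clears the rest  */
#	const unsigned char *keep_low_r13 = &all_f_and_zero[16 - r13];
#
# which is what "lea ALL_F+16(%rip), %r12; sub %r13, %r12" computes further
# down.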
92 #define STACK_OFFSET 8*3
96 #define InLen (16*1)+8
97 #define PBlockEncKey 16*2
100 #define PBlockLen 16*5
101 #define HashKey 16*6 // store HashKey <<1 mod poly here
102 #define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here
103 #define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here
104 #define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here
105 #define HashKey_k 16*10 // store XOR of High 64 bits and Low 64
106 // bits of HashKey <<1 mod poly here
107 //(for Karatsuba purposes)
108 #define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64
109 // bits of HashKey^2 <<1 mod poly here
110 // (for Karatsuba purposes)
111 #define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64
112 // bits of HashKey^3 <<1 mod poly here
113 // (for Karatsuba purposes)
114 #define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64
115 // bits of HashKey^4 <<1 mod poly here
116 // (for Karatsuba purposes)
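#
# Taken together, the offsets above correspond roughly to this C layout of
# the per-request context (a sketch for orientation only; the authoritative
# definition is struct gcm_context_data in the C glue code):
#
#	struct gcm_context_data {
#		u8  aad_hash[16];		/* AadHash,      16*0     */
#		u64 aad_length;			/* AadLen,       16*1     */
#		u64 in_length;			/* InLen,        16*1 + 8 */
#		u8  partial_block_enc_key[16];	/* PBlockEncKey, 16*2     */
#		u8  orig_IV[16];		/* OrigIV,       16*3     */
#		u8  current_counter[16];	/* CurCount,     16*4     */
#		u64 partial_block_len;		/* PBlockLen,    16*5     */
#		u64 unused;
#		u8  hash_keys[];		/* HashKey ... HashKey_4_k */
#	};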
124 #define arg7 STACK_OFFSET+8(%rsp)
125 #define arg8 STACK_OFFSET+16(%rsp)
126 #define arg9 STACK_OFFSET+24(%rsp)
127 #define arg10 STACK_OFFSET+32(%rsp)
128 #define arg11 STACK_OFFSET+40(%rsp)
129 #define keysize 2*15*16(%arg1)
146 #define BSWAP_MASK %xmm10
150 #define GF128MUL_MASK %xmm10
183 # states of %xmm registers %xmm6:%xmm15 not saved
184 # all %xmm registers are clobbered
195 # Precompute hashkeys.
196 # Input: Hash subkey.
197 # Output: HashKeys stored in gcm_context_data. Only needs to be called
199 # clobbers r12, and tmp xmm registers.
200 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
203 movdqa SHUF_MASK(%rip), \TMP2
204 PSHUFB_XMM \TMP2, \TMP3
206 # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
218 pshufd $0x24, \TMP1, \TMP2
219 pcmpeqd TWOONE(%rip), \TMP2
220 pand POLY(%rip), \TMP2
222 movdqu \TMP3, HashKey(%arg2)
225 pshufd $78, \TMP3, \TMP1
227 movdqu \TMP1, HashKey_k(%arg2)
229 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
230 # TMP5 = HashKey^2<<1 (mod poly)
231 movdqu \TMP5, HashKey_2(%arg2)
232 # HashKey_2 = HashKey^2<<1 (mod poly)
233 pshufd $78, \TMP5, \TMP1
235 movdqu \TMP1, HashKey_2_k(%arg2)
237 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
238 # TMP5 = HashKey^3<<1 (mod poly)
239 movdqu \TMP5, HashKey_3(%arg2)
240 pshufd $78, \TMP5, \TMP1
242 movdqu \TMP1, HashKey_3_k(%arg2)
244 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
245 # TMP5 = HashKey^4<<1 (mod poly)
246 movdqu \TMP5, HashKey_4(%arg2)
247 pshufd $78, \TMP5, \TMP1
249 movdqu \TMP1, HashKey_4_k(%arg2)
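#
# In C terms PRECOMPUTE fills the hash-key area roughly like this (a sketch;
# ghash_mul() stands in for the GHASH_MUL macro below, hash_key[]/hash_key_k[]
# for the HashKey_* slots defined above):
#
#	pow = hashkey;			/* HashKey<<1 mod poly, from above   */
#	for (i = 0; i < 4; i++) {
#		if (i)
#			pow = ghash_mul(pow, hashkey);	/* HashKey^(i+1)     */
#		hash_key[i]   = pow;
#		hash_key_k[i] = hi64(pow) ^ lo64(pow);	/* Karatsuba term,   */
#	}					/* replicated in both halves */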
252 # GCM_INIT initializes a gcm_context_data struct to prepare for encryption/decryption.
253 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
254 .macro GCM_INIT Iv SUBKEY AAD AADLEN
256 mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
258 mov %r11, InLen(%arg2) # ctx_data.in_length = 0
259 mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
260 mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
263 movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
265 movdqa SHUF_MASK(%rip), %xmm2
266 PSHUFB_XMM %xmm2, %xmm0
267 movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv
269 PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
270 movdqu HashKey(%arg2), %xmm13
272 CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
276 # GCM_ENC_DEC Encrypts/decrypts the given data. Assumes that the passed gcm_context_data
277 # struct has been initialized by GCM_INIT.
278 # Requires the input data to be at least 1 byte long because of READ_PARTIAL_BLOCK
279 # Clobbers rax, r10-r13, and xmm0-xmm15
280 .macro GCM_ENC_DEC operation
281 movdqu AadHash(%arg2), %xmm8
282 movdqu HashKey(%arg2), %xmm13
283 add %arg5, InLen(%arg2)
285 xor %r11d, %r11d # initialise the data pointer offset as zero
286 PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
288 sub %r11, %arg5 # sub partial block data used
289 mov %arg5, %r13 # save the number of bytes
291 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
293 # Encrypt/Decrypt first few blocks
296 jz _initial_num_blocks_is_0_\@
298 jb _initial_num_blocks_is_1_\@
299 je _initial_num_blocks_is_2_\@
300 _initial_num_blocks_is_3_\@:
301 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
302 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
304 jmp _initial_blocks_\@
305 _initial_num_blocks_is_2_\@:
306 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
307 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
309 jmp _initial_blocks_\@
310 _initial_num_blocks_is_1_\@:
311 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
312 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
314 jmp _initial_blocks_\@
315 _initial_num_blocks_is_0_\@:
316 INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
317 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
320 # Main loop - Encrypt/Decrypt remaining blocks
323 je _zero_cipher_left_\@
325 je _four_cipher_left_\@
327 GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \
328 %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
333 _four_cipher_left_\@:
334 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
335 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
336 _zero_cipher_left_\@:
337 movdqu %xmm8, AadHash(%arg2)
338 movdqu %xmm0, CurCount(%arg2)
341 and $15, %r13 # %r13 = arg5 (mod 16)
342 je _multiple_of_16_bytes_\@
344 mov %r13, PBlockLen(%arg2)
346 # Handle the last <16 Byte block separately
347 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
348 movdqu %xmm0, CurCount(%arg2)
349 movdqa SHUF_MASK(%rip), %xmm10
350 PSHUFB_XMM %xmm10, %xmm0
352 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
353 movdqu %xmm0, PBlockEncKey(%arg2)
356 jge _large_enough_update_\@
358 lea (%arg4,%r11,1), %r10
360 READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
363 _large_enough_update_\@:
367 # receive the last <16 Byte block
368 movdqu (%arg4, %r11, 1), %xmm1
373 lea SHIFT_MASK+16(%rip), %r12
374 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
375 # (r13 is the number of bytes in plaintext mod 16)
377 # get the appropriate shuffle mask
379 # shift right 16-r13 bytes
380 PSHUFB_XMM %xmm2, %xmm1
383 lea ALL_F+16(%rip), %r12
389 pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn)
391 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
392 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
395 movdqa SHUF_MASK(%rip), %xmm10
396 PSHUFB_XMM %xmm10 ,%xmm2
400 movdqa SHUF_MASK(%rip), %xmm10
401 PSHUFB_XMM %xmm10,%xmm0
406 movdqu %xmm8, AadHash(%arg2)
408 # GHASH computation for the last <16 byte block
409 movdqa SHUF_MASK(%rip), %xmm10
410 # shuffle xmm0 back to output as ciphertext
411 PSHUFB_XMM %xmm10, %xmm0
415 MOVQ_R64_XMM %xmm0, %rax
417 jle _less_than_8_bytes_left_\@
418 mov %rax, (%arg3 , %r11, 1)
421 MOVQ_R64_XMM %xmm0, %rax
423 _less_than_8_bytes_left_\@:
424 mov %al, (%arg3, %r11, 1)
428 jne _less_than_8_bytes_left_\@
429 _multiple_of_16_bytes_\@:
432 # GCM_COMPLETE finishes the tag computation over the last partial block
433 # Output: Authentication Tag (AUTH_TAG)
434 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
435 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
436 movdqu AadHash(%arg2), %xmm8
437 movdqu HashKey(%arg2), %xmm13
439 mov PBlockLen(%arg2), %r12
444 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
447 mov AadLen(%arg2), %r12 # %r12 = aadLen (number of bytes)
448 shl $3, %r12 # convert into number of bits
449 movd %r12d, %xmm15 # len(A) in %xmm15
450 mov InLen(%arg2), %r12
451 shl $3, %r12 # len(C) in bits (*8)
452 MOVQ_R64_XMM %r12, %xmm1
454 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
455 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
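	# For reference, this is the tail of the GCM definition being assembled
	# here (a recap, not new math):
	#	S = GHASH_H(A || pad || C || pad || [len(A)]_64 || [len(C)]_64)
	#	T = MSB_auth_tag_len(E(K, Y0) XOR S)
	# %xmm15 above is that final length block, and E(K, Y0) is applied to
	# the hash a few instructions below.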
457 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
458 # final GHASH computation
459 movdqa SHUF_MASK(%rip), %xmm10
460 PSHUFB_XMM %xmm10, %xmm8
462 movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0
463 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
466 mov \AUTHTAG, %r10 # %r10 = authTag
467 mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len
473 MOVQ_R64_XMM %xmm0, %rax
499 jmp _return_T_done_\@
506 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
509 * Input: A and B (128-bits each, bit-reflected)
510 * Output: C = A*B*x mod poly, (i.e. >>1 )
511 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
512 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
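*
* A sketch of the multiplication strategy used below, with clmul64() standing
* in for a single 64x64 carry-less multiply (one PCLMULQDQ):
*
*	hi  = clmul64(a1, b1);				// a1*b1
*	lo  = clmul64(a0, b0);				// a0*b0
*	mid = clmul64(a0 ^ a1, b0 ^ b1) ^ hi ^ lo;	// a0*b1 + a1*b0
*	// 256-bit product = (hi << 128) ^ (mid << 64) ^ lo, then reduced
*	// mod x^128 + x^127 + x^126 + x^121 + 1 in the two phases below.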
515 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
517 pshufd $78, \GH, \TMP2
518 pshufd $78, \HK, \TMP3
519 pxor \GH, \TMP2 # TMP2 = a1+a0
520 pxor \HK, \TMP3 # TMP3 = b1+b0
521 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
522 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
523 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
525 pxor \TMP1, \TMP2 # TMP2 = a0*b1 + a1*b0 (middle Karatsuba term)
527 pslldq $8, \TMP3 # left shift TMP3 2 DWs
528 psrldq $8, \TMP2 # right shift TMP2 2 DWs
530 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
532 # first phase of the reduction
536 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
537 # in order to perform
539 pslld $31, \TMP2 # packed left shift <<31
540 pslld $30, \TMP3 # packed left shift <<30
541 pslld $25, \TMP4 # packed left shift <<25
542 pxor \TMP3, \TMP2 # xor the shifted versions
545 psrldq $4, \TMP5 # right shift TMP5 1 DW
546 pslldq $12, \TMP2 # left shift TMP2 3 DWs
549 # second phase of the reduction
551 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
552 # in order to perform
556 psrld $1,\TMP2 # packed right shift >>1
557 psrld $2,\TMP3 # packed right shift >>2
558 psrld $7,\TMP4 # packed right shift >>7
559 pxor \TMP3,\TMP2 # xor the shifted versions
563 pxor \TMP1, \GH # result is in GH
566 # Reads DLEN bytes starting at DPTR and stores in XMMDst
567 # where 0 < DLEN < 16
568 # Clobbers %rax, DLEN and XMM1
569 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
573 MOVQ_R64_XMM %rax, \XMMDst
575 jz _done_read_partial_block_\@
579 mov 7(\DPTR, \DLEN, 1), %al
581 jnz _read_next_byte_\@
582 MOVQ_R64_XMM %rax, \XMM1
585 jmp _done_read_partial_block_\@
588 _read_next_byte_lt8_\@:
590 mov -1(\DPTR, \DLEN, 1), %al
592 jnz _read_next_byte_lt8_\@
593 MOVQ_R64_XMM %rax, \XMMDst
594 _done_read_partial_block_\@:
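#
# Byte-level model of the macro above (a sketch): it reads DLEN bytes
# (0 < DLEN < 16) into a zero-padded 16-byte block without ever touching
# memory past DPTR[DLEN-1]. The asm builds the value a byte at a time in
# %rax rather than calling anything, but the effect is simply:
#
#	memset(block, 0, 16);
#	memcpy(block, dptr, dlen);	/* dlen < 16, so no over-read */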
597 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
598 # clobbers r10-11, xmm14
599 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
601 MOVADQ SHUF_MASK(%rip), %xmm14
602 mov \AAD, %r10 # %r10 = AAD
603 mov \AADLEN, %r11 # %r11 = aadLen
611 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
613 GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
617 jge _get_AAD_blocks\@
621 /* read the last <16B of AAD */
626 READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
627 PSHUFB_XMM %xmm14, \TMP7 # byte-reflect the AAD data
629 GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
633 movdqu \TMP6, AadHash(%arg2)
636 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
637 # between update calls.
638 # Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
639 # Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
640 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
641 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
643 mov PBlockLen(%arg2), %r13
645 je _partial_block_done_\@ # Leave Macro if no partial blocks
646 # Read in input data without over reading
647 cmp $16, \PLAIN_CYPH_LEN
648 jl _fewer_than_16_bytes_\@
649 movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
652 _fewer_than_16_bytes_\@:
653 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
654 mov \PLAIN_CYPH_LEN, %r12
655 READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
657 mov PBlockLen(%arg2), %r13
659 _data_read_\@: # Finished reading in data
661 movdqu PBlockEncKey(%arg2), %xmm9
662 movdqu HashKey(%arg2), %xmm13
664 lea SHIFT_MASK(%rip), %r12
666 # adjust the shuffle mask pointer to be able to shift r13 bytes
667 # (r13 is the number of bytes in plaintext mod 16)
669 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
670 PSHUFB_XMM %xmm2, %xmm9 # shift right r13 bytes
674 pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
676 mov \PLAIN_CYPH_LEN, %r10
678 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
680 # Determine if the partial block is not being filled and
681 # shift mask accordingly
682 jge _no_extra_mask_1_\@
686 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
687 # get the appropriate mask to mask out bottom r13 bytes of xmm9
688 pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9
691 movdqa SHUF_MASK(%rip), %xmm10
692 PSHUFB_XMM %xmm10, %xmm3
693 PSHUFB_XMM %xmm2, %xmm3
694 pxor %xmm3, \AAD_HASH
697 jl _partial_incomplete_1_\@
699 # GHASH computation for the last <16 Byte block
700 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
703 mov %rax, PBlockLen(%arg2)
705 _partial_incomplete_1_\@:
706 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
708 movdqu \AAD_HASH, AadHash(%arg2)
710 pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn)
712 mov \PLAIN_CYPH_LEN, %r10
714 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
716 # Determine if the partial block is not being filled and
717 # shift mask accordingly
718 jge _no_extra_mask_2_\@
722 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
723 # get the appropriate mask to mask out bottom r13 bytes of xmm9
726 movdqa SHUF_MASK(%rip), %xmm1
727 PSHUFB_XMM %xmm1, %xmm9
728 PSHUFB_XMM %xmm2, %xmm9
729 pxor %xmm9, \AAD_HASH
732 jl _partial_incomplete_2_\@
734 # GHASH computation for the last <16 Byte block
735 GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
738 mov %rax, PBlockLen(%arg2)
740 _partial_incomplete_2_\@:
741 add \PLAIN_CYPH_LEN, PBlockLen(%arg2)
743 movdqu \AAD_HASH, AadHash(%arg2)
745 movdqa SHUF_MASK(%rip), %xmm10
746 # shuffle xmm9 back to output as ciphertext
747 PSHUFB_XMM %xmm10, %xmm9
748 PSHUFB_XMM %xmm2, %xmm9
750 # output encrypted Bytes
755 # Set r13 to be the number of bytes to write out
759 mov \PLAIN_CYPH_LEN, %r13
762 MOVQ_R64_XMM %xmm0, %rax
764 jle _less_than_8_bytes_left_\@
766 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
769 MOVQ_R64_XMM %xmm0, %rax
771 _less_than_8_bytes_left_\@:
772 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
776 jne _less_than_8_bytes_left_\@
777 _partial_block_done_\@:
778 .endm # PARTIAL_BLOCK
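#
# Control-flow sketch of PARTIAL_BLOCK (not the exact asm): when a previous
# update call left PBlockLen bytes of an unfinished block, the keystream
# block E(K, Yn) saved in PBlockEncKey is reused so the counter is not
# advanced twice:
#
#	if (ctx->partial_block_len) {
#		n = min(16 - ctx->partial_block_len, plain_cyph_len);
#		/* XOR the next n bytes against the saved E(K, Yn) and fold
#		 * the completed bytes into AadHash ...                     */
#		if (ctx->partial_block_len + n == 16)
#			ctx->partial_block_len = 0;	/* block finished   */
#		else
#			ctx->partial_block_len += n;	/* still partial    */
#		data_offset += n;	/* consumed, skipped by the caller */
#	}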
781 * if a = number of total plaintext bytes
783 * num_initial_blocks = b mod 4
784 * encrypt the initial num_initial_blocks blocks and apply ghash on
786 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
788 * arg1, %arg2, %arg3 are used as pointers only, not modified
792 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
793 XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
794 MOVADQ SHUF_MASK(%rip), %xmm14
796 movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0
798 # start AES for num_initial_blocks blocks
800 movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0
802 .if (\i == 5) || (\i == 6) || (\i == 7)
804 MOVADQ ONE(%RIP),\TMP1
805 MOVADQ 0(%arg1),\TMP2
807 paddd \TMP1, \XMM0 # INCR Y0
809 movdqa \XMM0, %xmm\index
811 MOVADQ \XMM0, %xmm\index
813 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
814 pxor \TMP2, %xmm\index
818 shr $2,%eax # 128->4, 192->6, 256->8
819 add $5,%eax # 128->9, 192->11, 256->13
824 AESENC \TMP1, %xmm\index
828 jnz aes_loop_initial_\@
832 AESENCLAST \TMP1, %xmm\index # Last Round
835 movdqu (%arg4 , %r11, 1), \TMP1
836 pxor \TMP1, %xmm\index
837 movdqu %xmm\index, (%arg3 , %r11, 1)
838 # write back plaintext/ciphertext for num_initial_blocks
842 movdqa \TMP1, %xmm\index
844 PSHUFB_XMM %xmm14, %xmm\index
846 # prepare plaintext/ciphertext for GHASH computation
850 # apply GHASH on num_initial_blocks blocks
854 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
856 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
858 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
869 jl _initial_blocks_done\@
870 # no need for precomputed values
873 * Precomputations for HashKey parallel with encryption of first 4 blocks.
874 * HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
876 MOVADQ ONE(%RIP),\TMP1
877 paddd \TMP1, \XMM0 # INCR Y0
879 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
881 paddd \TMP1, \XMM0 # INCR Y0
883 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
885 paddd \TMP1, \XMM0 # INCR Y0
887 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
889 paddd \TMP1, \XMM0 # INCR Y0
891 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
893 MOVADQ 0(%arg1),\TMP1
898 .irpc index, 1234 # do 4 rounds
899 movaps 0x10*\index(%arg1), \TMP1
905 .irpc index, 56789 # do next 5 rounds
906 movaps 0x10*\index(%arg1), \TMP1
914 shr $2,%eax # 128->4, 192->6, 256->8
915 sub $4,%eax # 128->0, 192->2, 256->4
916 jz aes_loop_pre_done\@
921 AESENC \TMP2, %xmm\index
929 AESENCLAST \TMP2, \XMM1
930 AESENCLAST \TMP2, \XMM2
931 AESENCLAST \TMP2, \XMM3
932 AESENCLAST \TMP2, \XMM4
933 movdqu 16*0(%arg4 , %r11 , 1), \TMP1
936 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
939 movdqu 16*1(%arg4 , %r11 , 1), \TMP1
942 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
945 movdqu 16*2(%arg4 , %r11 , 1), \TMP1
948 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
951 movdqu 16*3(%arg4 , %r11 , 1), \TMP1
954 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
957 movdqu \XMM1, 16*0(%arg3 , %r11 , 1)
958 movdqu \XMM2, 16*1(%arg3 , %r11 , 1)
959 movdqu \XMM3, 16*2(%arg3 , %r11 , 1)
960 movdqu \XMM4, 16*3(%arg3 , %r11 , 1)
964 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
966 # combine GHASHed value with the corresponding ciphertext
967 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
968 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
969 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
971 _initial_blocks_done\@:
976 * encrypt 4 blocks at a time
977 * ghash the 4 previously encrypted ciphertext blocks
978 * arg1, %arg3, %arg4 are used as pointers only, not modified
979 * %r11 is the data offset value
981 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
982 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
989 movdqa SHUF_MASK(%rip), %xmm15
990 # multiply TMP5 * HashKey using Karatsuba
993 pshufd $78, \XMM5, \TMP6
995 paddd ONE(%rip), \XMM0 # INCR CNT
996 movdqu HashKey_4(%arg2), \TMP5
997 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
999 paddd ONE(%rip), \XMM0 # INCR CNT
1001 paddd ONE(%rip), \XMM0 # INCR CNT
1003 paddd ONE(%rip), \XMM0 # INCR CNT
1005 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1006 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1015 movdqu HashKey_4_k(%arg2), \TMP5
1016 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1017 movaps 0x10(%arg1), \TMP1
1018 AESENC \TMP1, \XMM1 # Round 1
1022 movaps 0x20(%arg1), \TMP1
1023 AESENC \TMP1, \XMM1 # Round 2
1028 pshufd $78, \XMM6, \TMP2
1030 movdqu HashKey_3(%arg2), \TMP5
1031 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1032 movaps 0x30(%arg1), \TMP3
1033 AESENC \TMP3, \XMM1 # Round 3
1037 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1038 movaps 0x40(%arg1), \TMP3
1039 AESENC \TMP3, \XMM1 # Round 4
1043 movdqu HashKey_3_k(%arg2), \TMP5
1044 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1045 movaps 0x50(%arg1), \TMP3
1046 AESENC \TMP3, \XMM1 # Round 5
1051 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1055 pshufd $78, \XMM7, \TMP2
1057 movdqu HashKey_2(%arg2), \TMP5
1059 # Multiply TMP5 * HashKey using Karatsuba
1061 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1062 movaps 0x60(%arg1), \TMP3
1063 AESENC \TMP3, \XMM1 # Round 6
1067 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1068 movaps 0x70(%arg1), \TMP3
1069 AESENC \TMP3, \XMM1 # Round 7
1073 movdqu HashKey_2_k(%arg2), \TMP5
1074 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1075 movaps 0x80(%arg1), \TMP3
1076 AESENC \TMP3, \XMM1 # Round 8
1081 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1085 # Multiply XMM8 * HashKey
1086 # XMM8 and TMP5 hold the values for the two operands
1089 pshufd $78, \XMM8, \TMP2
1091 movdqu HashKey(%arg2), \TMP5
1092 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1093 movaps 0x90(%arg1), \TMP3
1094 AESENC \TMP3, \XMM1 # Round 9
1098 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1099 lea 0xa0(%arg1),%r10
1101 shr $2,%eax # 128->4, 192->6, 256->8
1102 sub $4,%eax # 128->0, 192->2, 256->4
1103 jz aes_loop_par_enc_done\@
1108 AESENC \TMP3, %xmm\index
1112 jnz aes_loop_par_enc\@
1114 aes_loop_par_enc_done\@:
1115 MOVADQ (%r10), \TMP3
1116 AESENCLAST \TMP3, \XMM1 # Round 10
1117 AESENCLAST \TMP3, \XMM2
1118 AESENCLAST \TMP3, \XMM3
1119 AESENCLAST \TMP3, \XMM4
1120 movdqu HashKey_k(%arg2), \TMP5
1121 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1122 movdqu (%arg4,%r11,1), \TMP3
1123 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1124 movdqu 16(%arg4,%r11,1), \TMP3
1125 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1126 movdqu 32(%arg4,%r11,1), \TMP3
1127 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1128 movdqu 48(%arg4,%r11,1), \TMP3
1129 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1130 movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer
1131 movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer
1132 movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer
1133 movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer
1134 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1135 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1136 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1137 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1145 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1146 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1148 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1150 # first phase of reduction
1155 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1156 pslld $31, \TMP2 # packed left shift << 31
1157 pslld $30, \TMP3 # packed left shift << 30
1158 pslld $25, \TMP4 # packed left shift << 25
1159 pxor \TMP3, \TMP2 # xor the shifted versions
1162 psrldq $4, \TMP5 # right shift T5 1 DW
1163 pslldq $12, \TMP2 # left shift T2 3 DWs
1166 # second phase of reduction
1168 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1171 psrld $1, \TMP2 # packed right shift >>1
1172 psrld $2, \TMP3 # packed right shift >>2
1173 psrld $7, \TMP4 # packed right shift >>7
1174 pxor \TMP3,\TMP2 # xor the shifted versions
1178 pxor \TMP1, \XMM5 # result is in XMM5
1184 * decrypt 4 blocks at a time
1185 * ghash the 4 previously decrypted ciphertext blocks
1186 * arg1, %arg3, %arg4 are used as pointers only, not modified
1187 * %r11 is the data offset value
1189 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1190 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1197 movdqa SHUF_MASK(%rip), %xmm15
1198 # multiply TMP5 * HashKey using Karatsuba
1201 pshufd $78, \XMM5, \TMP6
1203 paddd ONE(%rip), \XMM0 # INCR CNT
1204 movdqu HashKey_4(%arg2), \TMP5
1205 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
1207 paddd ONE(%rip), \XMM0 # INCR CNT
1209 paddd ONE(%rip), \XMM0 # INCR CNT
1211 paddd ONE(%rip), \XMM0 # INCR CNT
1213 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1214 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
1215 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1216 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1217 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1223 movdqu HashKey_4_k(%arg2), \TMP5
1224 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
1225 movaps 0x10(%arg1), \TMP1
1226 AESENC \TMP1, \XMM1 # Round 1
1230 movaps 0x20(%arg1), \TMP1
1231 AESENC \TMP1, \XMM1 # Round 2
1236 pshufd $78, \XMM6, \TMP2
1238 movdqu HashKey_3(%arg2), \TMP5
1239 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
1240 movaps 0x30(%arg1), \TMP3
1241 AESENC \TMP3, \XMM1 # Round 3
1245 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
1246 movaps 0x40(%arg1), \TMP3
1247 AESENC \TMP3, \XMM1 # Round 4
1251 movdqu HashKey_3_k(%arg2), \TMP5
1252 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1253 movaps 0x50(%arg1), \TMP3
1254 AESENC \TMP3, \XMM1 # Round 5
1259 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1263 pshufd $78, \XMM7, \TMP2
1265 movdqu HashKey_2(%arg2), \TMP5
1267 # Multiply TMP5 * HashKey using Karatsuba
1269 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1270 movaps 0x60(%arg1), \TMP3
1271 AESENC \TMP3, \XMM1 # Round 6
1275 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
1276 movaps 0x70(%arg1), \TMP3
1277 AESENC \TMP3, \XMM1 # Round 7
1281 movdqu HashKey_2_k(%arg2), \TMP5
1282 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1283 movaps 0x80(%arg1), \TMP3
1284 AESENC \TMP3, \XMM1 # Round 8
1289 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1293 # Multiply XMM8 * HashKey
1294 # XMM8 and TMP5 hold the values for the two operands
1297 pshufd $78, \XMM8, \TMP2
1299 movdqu HashKey(%arg2), \TMP5
1300 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1301 movaps 0x90(%arg1), \TMP3
1302 AESENC \TMP3, \XMM1 # Round 9
1306 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
1307 lea 0xa0(%arg1),%r10
1309 shr $2,%eax # 128->4, 192->6, 256->8
1310 sub $4,%eax # 128->0, 192->2, 256->4
1311 jz aes_loop_par_dec_done\@
1316 AESENC \TMP3, %xmm\index
1320 jnz aes_loop_par_dec\@
1322 aes_loop_par_dec_done\@:
1323 MOVADQ (%r10), \TMP3
1324 AESENCLAST \TMP3, \XMM1 # last round
1325 AESENCLAST \TMP3, \XMM2
1326 AESENCLAST \TMP3, \XMM3
1327 AESENCLAST \TMP3, \XMM4
1328 movdqu HashKey_k(%arg2), \TMP5
1329 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1330 movdqu (%arg4,%r11,1), \TMP3
1331 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
1332 movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer
1334 movdqu 16(%arg4,%r11,1), \TMP3
1335 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
1336 movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer
1338 movdqu 32(%arg4,%r11,1), \TMP3
1339 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1340 movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer
1342 movdqu 48(%arg4,%r11,1), \TMP3
1343 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1344 movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer
1346 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1347 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1348 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1349 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1357 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1358 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1360 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1362 # first phase of reduction
1367 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1368 pslld $31, \TMP2 # packed left shift << 31
1369 pslld $30, \TMP3 # packed left shift << 30
1370 pslld $25, \TMP4 # packed left shift << 25
1371 pxor \TMP3, \TMP2 # xor the shifted versions
1374 psrldq $4, \TMP5 # right shift T5 1 DW
1375 pslldq $12, \TMP2 # left shift T2 3 DWs
1378 # second phase of reduction
1380 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1383 psrld $1, \TMP2 # packed right shift >>1
1384 psrld $2, \TMP3 # packed right shift >>2
1385 psrld $7, \TMP4 # packed right shift >>7
1386 pxor \TMP3,\TMP2 # xor the shifted versions
1390 pxor \TMP1, \XMM5 # result is in XMM5
1395 /* GHASH the last 4 ciphertext blocks. */
1396 .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1397 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1399 # Multiply TMP6 * HashKey (using Karatsuba)
1402 pshufd $78, \XMM1, \TMP2
1404 movdqu HashKey_4(%arg2), \TMP5
1405 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1406 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1407 movdqu HashKey_4_k(%arg2), \TMP4
1408 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1409 movdqa \XMM1, \XMMDst
1410 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1412 # Multiply TMP1 * HashKey (using Karatsuba)
1415 pshufd $78, \XMM2, \TMP2
1417 movdqu HashKey_3(%arg2), \TMP5
1418 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1419 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1420 movdqu HashKey_3_k(%arg2), \TMP4
1421 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1425 # results accumulated in TMP6, XMMDst, XMM1
1427 # Multiply TMP1 * HashKey (using Karatsuba)
1430 pshufd $78, \XMM3, \TMP2
1432 movdqu HashKey_2(%arg2), \TMP5
1433 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1434 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1435 movdqu HashKey_2_k(%arg2), \TMP4
1436 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1439 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1441 # Multiply TMP1 * HashKey (using Karatsuba)
1443 pshufd $78, \XMM4, \TMP2
1445 movdqu HashKey(%arg2), \TMP5
1446 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1447 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1448 movdqu HashKey_k(%arg2), \TMP4
1449 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1455 # middle section of the temp results combined as in the Karatsuba algorithm
1457 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1458 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1461 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1462 # first phase of the reduction
1463 movdqa \XMMDst, \TMP2
1464 movdqa \XMMDst, \TMP3
1465 movdqa \XMMDst, \TMP4
1466 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1467 pslld $31, \TMP2 # packed left shifting << 31
1468 pslld $30, \TMP3 # packed left shifting << 30
1469 pslld $25, \TMP4 # packed left shifting << 25
1470 pxor \TMP3, \TMP2 # xor the shifted versions
1473 psrldq $4, \TMP7 # right shift TMP7 1 DW
1474 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1477 # second phase of the reduction
1478 movdqa \XMMDst, \TMP2
1479 # make 3 copies of XMMDst for doing 3 shift operations
1480 movdqa \XMMDst, \TMP3
1481 movdqa \XMMDst, \TMP4
1482 psrld $1, \TMP2 # packed right shift >> 1
1483 psrld $2, \TMP3 # packed right shift >> 2
1484 psrld $7, \TMP4 # packed right shift >> 7
1485 pxor \TMP3, \TMP2 # xor the shifted versions
1489 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1493 /* Encryption of a single block
1497 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1501 shr $2,%eax # 128->4, 192->6, 256->8
1502 add $5,%eax # 128->9, 192->11, 256->13
1503 lea 16(%arg1), %r10 # get first expanded key address
1513 AESENCLAST \TMP1,\XMM0
1515 /*****************************************************************************
1516 * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1517 * struct gcm_context_data *data
1519 * u8 *out, // Plaintext output. Encrypt in-place is allowed.
1520 * const u8 *in, // Ciphertext input
1521 * u64 plaintext_len, // Length of data in bytes for decryption.
1522 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1523 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1524 * // concatenated with 0x00000001. 16-byte aligned pointer.
1525 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1526 * const u8 *aad, // Additional Authentication Data (AAD)
1527 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1528 * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1529 * // given authentication tag and only return the plaintext if they match.
1530 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1531 * // (most likely), 12 or 8.
1536 * Keys are pre-expanded and aligned to 16 bytes. We are using the first
1537 * set of 11 keys in the data structure void *aes_ctx
1541 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1542 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1543 * | Salt (From the SA) |
1544 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1545 * | Initialization Vector |
1546 * | (This is the sequence number from IPSec header) |
1547 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554 * AAD padded to 128 bits with 0
1555 * for example, assume AAD is a u32 vector
1557 * if AAD is 8 bytes:
1558 * AAD[3] = {A0, A1};
1559 * padded AAD in xmm register = {A1 A0 0 0}
1562 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1563 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1566 * | 32-bit Sequence Number (A0) |
1567 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1571 * AAD Format with 32-bit Sequence Number
1573 * if AAD is 12 bytes:
1574 * AAD[3] = {A0, A1, A2};
1575 * padded AAD in xmm register = {A2 A1 A0 0}
1578 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1579 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1580 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1581 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 * | 64-bit Extended Sequence Number {A1,A0} |
1586 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1590 * AAD Format with 64-bit Extended Sequence Number
1592 * poly = x^128 + x^127 + x^126 + x^121 + 1
1594 *****************************************************************************/
1595 SYM_FUNC_START(aesni_gcm_dec)
1598 GCM_INIT %arg6, arg7, arg8, arg9
1600 GCM_COMPLETE arg10, arg11
1603 SYM_FUNC_END(aesni_gcm_dec)
1606 /*****************************************************************************
1607 * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1608 * struct gcm_context_data *data
1610 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1611 * const u8 *in, // Plaintext input
1612 * u64 plaintext_len, // Length of data in bytes for encryption.
1613 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1614 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1615 * // concatenated with 0x00000001. 16-byte aligned pointer.
1616 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1617 * const u8 *aad, // Additional Authentication Data (AAD)
1618 * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1619 * u8 *auth_tag, // Authenticated Tag output.
1620 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1626 * Keys are pre-expanded and aligned to 16 bytes. We are using the
1627 * first set of 11 keys in the data structure void *aes_ctx
1632 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1633 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1634 * | Salt (From the SA) |
1635 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1636 * | Initialization Vector |
1637 * | (This is the sequence number from IPSec header) |
1638 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645 * AAD padded to 128 bits with 0
1646 * for example, assume AAD is a u32 vector
1648 * if AAD is 8 bytes:
1649 * AAD[3] = {A0, A1};
1650 * padded AAD in xmm register = {A1 A0 0 0}
1653 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1654 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1657 * | 32-bit Sequence Number (A0) |
1658 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1662 * AAD Format with 32-bit Sequence Number
1664 * if AAD is 12 bytes:
1665 * AAD[3] = {A0, A1, A2};
1666 * padded AAD in xmm register = {A2 A1 A0 0}
1669 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1670 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1673 * | 64-bit Extended Sequence Number {A1,A0} |
1675 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1679 * AAD Format with 64-bit Extended Sequence Number
1681 * poly = x^128 + x^127 + x^126 + x^121 + 1
1682 ***************************************************************************/
1683 SYM_FUNC_START(aesni_gcm_enc)
1686 GCM_INIT %arg6, arg7, arg8, arg9
1689 GCM_COMPLETE arg10, arg11
1692 SYM_FUNC_END(aesni_gcm_enc)
1694 /*****************************************************************************
1695 * void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1696 * struct gcm_context_data *data,
1698 * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1699 * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1700 * // concatenated with 0x00000001. 16-byte aligned pointer.
1701 * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1702 * const u8 *aad, // Additional Authentication Data (AAD)
1703 * u64 aad_len) // Length of AAD in bytes.
1705 SYM_FUNC_START(aesni_gcm_init)
1707 GCM_INIT %arg3, %arg4, %arg5, %arg6
1710 SYM_FUNC_END(aesni_gcm_init)
1712 /*****************************************************************************
1713 * void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1714 * struct gcm_context_data *data,
1716 * u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1717 * const u8 *in, // Plaintext input
1718 * u64 plaintext_len, // Length of data in bytes for encryption.
1720 SYM_FUNC_START(aesni_gcm_enc_update)
1725 SYM_FUNC_END(aesni_gcm_enc_update)
1727 /*****************************************************************************
1728 * void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1729 * struct gcm_context_data *data,
1731 * u8 *out, // Plaintext output. Decrypt in-place is allowed.
1732 * const u8 *in, // Ciphertext input
1733 * u64 plaintext_len, // Length of data in bytes for decryption.
1735 SYM_FUNC_START(aesni_gcm_dec_update)
1740 SYM_FUNC_END(aesni_gcm_dec_update)
1742 /*****************************************************************************
1743 * void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1744 * struct gcm_context_data *data,
1746 * u8 *auth_tag, // Authenticated Tag output.
1747 * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1750 SYM_FUNC_START(aesni_gcm_finalize)
1752 GCM_COMPLETE %arg3 %arg4
1755 SYM_FUNC_END(aesni_gcm_finalize)
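#
# The functions above form a streaming interface; a typical call sequence
# from the C glue code looks roughly like this (a sketch, the argument lists
# are the ones documented in the headers above):
#
#	struct gcm_context_data data;
#
#	aesni_gcm_init(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
#	aesni_gcm_enc_update(aes_ctx, &data, out, in, len);	/* 0..n times */
#	aesni_gcm_finalize(aes_ctx, &data, auth_tag, auth_tag_len);
#
# aesni_gcm_enc/aesni_gcm_dec bundle the same three steps into a single call.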
1760 SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1761 SYM_FUNC_START_LOCAL(_key_expansion_256a)
1762 pshufd $0b11111111, %xmm1, %xmm1
1763 shufps $0b00010000, %xmm0, %xmm4
1765 shufps $0b10001100, %xmm0, %xmm4
1768 movaps %xmm0, (TKEYP)
1771 SYM_FUNC_END(_key_expansion_256a)
1772 SYM_FUNC_END_ALIAS(_key_expansion_128)
1774 SYM_FUNC_START_LOCAL(_key_expansion_192a)
1775 pshufd $0b01010101, %xmm1, %xmm1
1776 shufps $0b00010000, %xmm0, %xmm4
1778 shufps $0b10001100, %xmm0, %xmm4
1785 pshufd $0b11111111, %xmm0, %xmm3
1790 shufps $0b01000100, %xmm0, %xmm6
1791 movaps %xmm6, (TKEYP)
1792 shufps $0b01001110, %xmm2, %xmm1
1793 movaps %xmm1, 0x10(TKEYP)
1796 SYM_FUNC_END(_key_expansion_192a)
1798 SYM_FUNC_START_LOCAL(_key_expansion_192b)
1799 pshufd $0b01010101, %xmm1, %xmm1
1800 shufps $0b00010000, %xmm0, %xmm4
1802 shufps $0b10001100, %xmm0, %xmm4
1808 pshufd $0b11111111, %xmm0, %xmm3
1812 movaps %xmm0, (TKEYP)
1815 SYM_FUNC_END(_key_expansion_192b)
1817 SYM_FUNC_START_LOCAL(_key_expansion_256b)
1818 pshufd $0b10101010, %xmm1, %xmm1
1819 shufps $0b00010000, %xmm2, %xmm4
1821 shufps $0b10001100, %xmm2, %xmm4
1824 movaps %xmm2, (TKEYP)
1827 SYM_FUNC_END(_key_expansion_256b)
1830 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1831 * unsigned int key_len)
1833 SYM_FUNC_START(aesni_set_key)
1837 movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
1838 movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
1839 movl (FRAME_OFFSET+16)(%esp), %edx # key_len
1841 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1842 movaps %xmm0, (KEYP)
1843 lea 0x10(KEYP), TKEYP # key addr
1844 movl %edx, 480(KEYP)
1845 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
1849 movups 0x10(UKEYP), %xmm2 # other user key
1850 movaps %xmm2, (TKEYP)
1852 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1853 call _key_expansion_256a
1854 AESKEYGENASSIST 0x1 %xmm0 %xmm1
1855 call _key_expansion_256b
1856 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1857 call _key_expansion_256a
1858 AESKEYGENASSIST 0x2 %xmm0 %xmm1
1859 call _key_expansion_256b
1860 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1861 call _key_expansion_256a
1862 AESKEYGENASSIST 0x4 %xmm0 %xmm1
1863 call _key_expansion_256b
1864 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1865 call _key_expansion_256a
1866 AESKEYGENASSIST 0x8 %xmm0 %xmm1
1867 call _key_expansion_256b
1868 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1869 call _key_expansion_256a
1870 AESKEYGENASSIST 0x10 %xmm0 %xmm1
1871 call _key_expansion_256b
1872 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1873 call _key_expansion_256a
1874 AESKEYGENASSIST 0x20 %xmm0 %xmm1
1875 call _key_expansion_256b
1876 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1877 call _key_expansion_256a
1880 movq 0x10(UKEYP), %xmm2 # other user key
1881 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
1882 call _key_expansion_192a
1883 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
1884 call _key_expansion_192b
1885 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
1886 call _key_expansion_192a
1887 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
1888 call _key_expansion_192b
1889 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
1890 call _key_expansion_192a
1891 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
1892 call _key_expansion_192b
1893 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
1894 call _key_expansion_192a
1895 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
1896 call _key_expansion_192b
1899 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
1900 call _key_expansion_128
1901 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
1902 call _key_expansion_128
1903 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
1904 call _key_expansion_128
1905 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
1906 call _key_expansion_128
1907 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
1908 call _key_expansion_128
1909 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
1910 call _key_expansion_128
1911 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
1912 call _key_expansion_128
1913 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
1914 call _key_expansion_128
1915 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
1916 call _key_expansion_128
1917 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
1918 call _key_expansion_128
1921 movaps (KEYP), %xmm0
1922 movaps (TKEYP), %xmm1
1923 movaps %xmm0, 240(TKEYP)
1924 movaps %xmm1, 240(KEYP)
1926 lea 240-16(TKEYP), UKEYP
1929 movaps (KEYP), %xmm0
1931 movaps %xmm1, (UKEYP)
1942 SYM_FUNC_END(aesni_set_key)
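#
# For reference, each _key_expansion_128 call computes one textbook AES-128
# key-schedule round; in C (a sketch, where sub_word()/rot_word() are the
# usual AES helpers and t matches what AESKEYGENASSIST leaves in %xmm1):
#
#	t    = sub_word(rot_word(w[3])) ^ rcon;
#	n[0] = w[0] ^ t;
#	n[1] = w[1] ^ n[0];
#	n[2] = w[2] ^ n[1];
#	n[3] = w[3] ^ n[2];
#
# The shufps/pxor pairs in the expansion helpers build exactly this running
# XOR of the previous round-key words without leaving the SSE registers.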
1945 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1947 SYM_FUNC_START(aesni_enc)
1952 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
1953 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
1954 movl (FRAME_OFFSET+20)(%esp), INP # src
1956 movl 480(KEYP), KLEN # key length
1957 movups (INP), STATE # input
1959 movups STATE, (OUTP) # output
1966 SYM_FUNC_END(aesni_enc)
1969 * _aesni_enc1: internal ABI
1971 * KEYP: key struct pointer
1973 * STATE: initial state (input)
1975 * STATE: final state (output)
1980 SYM_FUNC_START_LOCAL(_aesni_enc1)
1981 movaps (KEYP), KEY # key
1983 pxor KEY, STATE # round 0
1987 lea 0x20(TKEYP), TKEYP
1990 movaps -0x60(TKEYP), KEY
1992 movaps -0x50(TKEYP), KEY
1996 movaps -0x40(TKEYP), KEY
1998 movaps -0x30(TKEYP), KEY
2002 movaps -0x20(TKEYP), KEY
2004 movaps -0x10(TKEYP), KEY
2008 movaps 0x10(TKEYP), KEY
2010 movaps 0x20(TKEYP), KEY
2012 movaps 0x30(TKEYP), KEY
2014 movaps 0x40(TKEYP), KEY
2016 movaps 0x50(TKEYP), KEY
2018 movaps 0x60(TKEYP), KEY
2020 movaps 0x70(TKEYP), KEY
2021 AESENCLAST KEY STATE
2023 SYM_FUNC_END(_aesni_enc1)
2026 * _aesni_enc4: internal ABI
2028 * KEYP: key struct pointer
2030 * STATE1: initial state (input)
2035 * STATE1: final state (output)
2043 SYM_FUNC_START_LOCAL(_aesni_enc4)
2044 movaps (KEYP), KEY # key
2046 pxor KEY, STATE1 # round 0
2053 lea 0x20(TKEYP), TKEYP
2056 movaps -0x60(TKEYP), KEY
2061 movaps -0x50(TKEYP), KEY
2068 movaps -0x40(TKEYP), KEY
2073 movaps -0x30(TKEYP), KEY
2080 movaps -0x20(TKEYP), KEY
2085 movaps -0x10(TKEYP), KEY
2095 movaps 0x10(TKEYP), KEY
2100 movaps 0x20(TKEYP), KEY
2105 movaps 0x30(TKEYP), KEY
2110 movaps 0x40(TKEYP), KEY
2115 movaps 0x50(TKEYP), KEY
2120 movaps 0x60(TKEYP), KEY
2125 movaps 0x70(TKEYP), KEY
2126 AESENCLAST KEY STATE1 # last round
2127 AESENCLAST KEY STATE2
2128 AESENCLAST KEY STATE3
2129 AESENCLAST KEY STATE4
2131 SYM_FUNC_END(_aesni_enc4)
2134 * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2136 SYM_FUNC_START(aesni_dec)
2141 movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
2142 movl (FRAME_OFFSET+16)(%esp), OUTP # dst
2143 movl (FRAME_OFFSET+20)(%esp), INP # src
2145 mov 480(KEYP), KLEN # key length
2147 movups (INP), STATE # input
2149 movups STATE, (OUTP) #output
2156 SYM_FUNC_END(aesni_dec)
2159 * _aesni_dec1: internal ABI
2161 * KEYP: key struct pointer
2163 * STATE: initial state (input)
2165 * STATE: final state (output)
2170 SYM_FUNC_START_LOCAL(_aesni_dec1)
2171 movaps (KEYP), KEY # key
2173 pxor KEY, STATE # round 0
2177 lea 0x20(TKEYP), TKEYP
2180 movaps -0x60(TKEYP), KEY
2182 movaps -0x50(TKEYP), KEY
2186 movaps -0x40(TKEYP), KEY
2188 movaps -0x30(TKEYP), KEY
2192 movaps -0x20(TKEYP), KEY
2194 movaps -0x10(TKEYP), KEY
2198 movaps 0x10(TKEYP), KEY
2200 movaps 0x20(TKEYP), KEY
2202 movaps 0x30(TKEYP), KEY
2204 movaps 0x40(TKEYP), KEY
2206 movaps 0x50(TKEYP), KEY
2208 movaps 0x60(TKEYP), KEY
2210 movaps 0x70(TKEYP), KEY
2211 AESDECLAST KEY STATE
2213 SYM_FUNC_END(_aesni_dec1)
2216 * _aesni_dec4: internal ABI
2218 * KEYP: key struct pointer
2220 * STATE1: initial state (input)
2225 * STATE1: final state (output)
2233 SYM_FUNC_START_LOCAL(_aesni_dec4)
2234 movaps (KEYP), KEY # key
2236 pxor KEY, STATE1 # round 0
2243 lea 0x20(TKEYP), TKEYP
2246 movaps -0x60(TKEYP), KEY
2251 movaps -0x50(TKEYP), KEY
2258 movaps -0x40(TKEYP), KEY
2263 movaps -0x30(TKEYP), KEY
2270 movaps -0x20(TKEYP), KEY
2275 movaps -0x10(TKEYP), KEY
2285 movaps 0x10(TKEYP), KEY
2290 movaps 0x20(TKEYP), KEY
2295 movaps 0x30(TKEYP), KEY
2300 movaps 0x40(TKEYP), KEY
2305 movaps 0x50(TKEYP), KEY
2310 movaps 0x60(TKEYP), KEY
2315 movaps 0x70(TKEYP), KEY
2316 AESDECLAST KEY STATE1 # last round
2317 AESDECLAST KEY STATE2
2318 AESDECLAST KEY STATE3
2319 AESDECLAST KEY STATE4
2321 SYM_FUNC_END(_aesni_dec4)
2324 * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2327 SYM_FUNC_START(aesni_ecb_enc)
2333 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2334 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2335 movl (FRAME_OFFSET+24)(%esp), INP # src
2336 movl (FRAME_OFFSET+28)(%esp), LEN # len
2338 test LEN, LEN # check length
2347 movups (INP), STATE1
2348 movups 0x10(INP), STATE2
2349 movups 0x20(INP), STATE3
2350 movups 0x30(INP), STATE4
2352 movups STATE1, (OUTP)
2353 movups STATE2, 0x10(OUTP)
2354 movups STATE3, 0x20(OUTP)
2355 movups STATE4, 0x30(OUTP)
2365 movups (INP), STATE1
2367 movups STATE1, (OUTP)
2381 SYM_FUNC_END(aesni_ecb_enc)
2384 * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2387 SYM_FUNC_START(aesni_ecb_dec)
2393 movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
2394 movl (FRAME_OFFSET+20)(%esp), OUTP # dst
2395 movl (FRAME_OFFSET+24)(%esp), INP # src
2396 movl (FRAME_OFFSET+28)(%esp), LEN # len
2408 movups (INP), STATE1
2409 movups 0x10(INP), STATE2
2410 movups 0x20(INP), STATE3
2411 movups 0x30(INP), STATE4
2413 movups STATE1, (OUTP)
2414 movups STATE2, 0x10(OUTP)
2415 movups STATE3, 0x20(OUTP)
2416 movups STATE4, 0x30(OUTP)
2426 movups (INP), STATE1
2428 movups STATE1, (OUTP)
2442 SYM_FUNC_END(aesni_ecb_dec)
2445 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2446 * size_t len, u8 *iv)
2448 SYM_FUNC_START(aesni_cbc_enc)
2455 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2456 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2457 movl (FRAME_OFFSET+28)(%esp), INP # src
2458 movl (FRAME_OFFSET+32)(%esp), LEN # len
2459 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2464 movups (IVP), STATE # load iv as initial state
2467 movups (INP), IN # load input
2470 movups STATE, (OUTP) # store output
2486 SYM_FUNC_END(aesni_cbc_enc)
2489 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2490 * size_t len, u8 *iv)
2492 SYM_FUNC_START(aesni_cbc_dec)
2499 movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
2500 movl (FRAME_OFFSET+24)(%esp), OUTP # dst
2501 movl (FRAME_OFFSET+28)(%esp), INP # src
2502 movl (FRAME_OFFSET+32)(%esp), LEN # len
2503 movl (FRAME_OFFSET+36)(%esp), IVP # iv
2506 jb .Lcbc_dec_just_ret
2516 movups 0x10(INP), IN2
2519 movups 0x20(INP), IN3
2521 movups 0x30(INP), IN4
2524 movups 0x20(INP), IN1
2526 movups 0x30(INP), IN2
2541 movups 0x10(INP), IN2
2544 movups STATE1, (OUTP)
2545 movups STATE2, 0x10(OUTP)
2546 movups STATE3, 0x20(OUTP)
2547 movups STATE4, 0x30(OUTP)
2561 movups STATE, (OUTP)
2579 SYM_FUNC_END(aesni_cbc_dec)
2582 .pushsection .rodata
2585 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2589 * _aesni_inc_init: internal ABI
2590 * setup registers used by _aesni_inc
2594 * CTR: == IV, in little endian
2595 * TCTR_LOW: == lower qword of CTR
2596 * INC: == 1, in little endian
2597 * BSWAP_MASK == endian swapping mask
2599 SYM_FUNC_START_LOCAL(_aesni_inc_init)
2600 movaps .Lbswap_mask, BSWAP_MASK
2602 PSHUFB_XMM BSWAP_MASK CTR
2604 MOVQ_R64_XMM TCTR_LOW INC
2605 MOVQ_R64_XMM CTR TCTR_LOW
2607 SYM_FUNC_END(_aesni_inc_init)
2610 * _aesni_inc: internal ABI
2611 * Increase IV by 1, IV is in big endian
2614 * CTR: == IV, in little endian
2615 * TCTR_LOW: == lower qword of CTR
2616 * INC: == 1, in little endian
2617 * BSWAP_MASK == endian swapping mask
2621 * CTR: == output IV, in little endian
2622 * TCTR_LOW: == lower qword of CTR
2624 SYM_FUNC_START_LOCAL(_aesni_inc)
2633 PSHUFB_XMM BSWAP_MASK IV
2635 SYM_FUNC_END(_aesni_inc)
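#
# Behavioural model of _aesni_inc_init/_aesni_inc (a sketch): the IV is
# byte-swapped once so the counter can be bumped with integer adds, then
# swapped back before it is fed to the cipher. Per block, in C:
#
#	lo = get_unaligned_be64(iv + 8);
#	hi = get_unaligned_be64(iv);
#	if (++lo == 0)			/* carry into the high qword */
#		hi++;
#	put_unaligned_be64(lo, iv + 8);
#	put_unaligned_be64(hi, iv);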
2638 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2639 * size_t len, u8 *iv)
2641 SYM_FUNC_START(aesni_ctr_enc)
2644 jb .Lctr_enc_just_ret
2647 call _aesni_inc_init
2657 movups 0x10(INP), IN2
2660 movups 0x20(INP), IN3
2663 movups 0x30(INP), IN4
2666 movups STATE1, (OUTP)
2668 movups STATE2, 0x10(OUTP)
2670 movups STATE3, 0x20(OUTP)
2672 movups STATE4, 0x30(OUTP)
2687 movups STATE, (OUTP)
2698 SYM_FUNC_END(aesni_ctr_enc)
2701 * _aesni_gf128mul_x_ble: internal ABI
2702 * Multiply in GF(2^128) for XTS IVs
2705 * GF128MUL_MASK == mask with 0x87 and 0x01
2709 * CTR: == temporary value
2711 #define _aesni_gf128mul_x_ble() \
2712 pshufd $0x13, IV, CTR; \
2715 pand GF128MUL_MASK, CTR; \
2719 * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
2720 * const u8 *src, bool enc, le128 *iv)
2722 SYM_FUNC_START(aesni_xts_crypt8)
2727 leaq _aesni_enc4, %r11
2728 leaq _aesni_dec4, %rax
2732 movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2739 movdqu 0x00(INP), INC
2741 movdqu IV, 0x00(OUTP)
2743 _aesni_gf128mul_x_ble()
2745 movdqu 0x10(INP), INC
2747 movdqu IV, 0x10(OUTP)
2749 _aesni_gf128mul_x_ble()
2751 movdqu 0x20(INP), INC
2753 movdqu IV, 0x20(OUTP)
2755 _aesni_gf128mul_x_ble()
2757 movdqu 0x30(INP), INC
2759 movdqu IV, 0x30(OUTP)
2763 movdqu 0x00(OUTP), INC
2765 movdqu STATE1, 0x00(OUTP)
2767 _aesni_gf128mul_x_ble()
2769 movdqu 0x40(INP), INC
2771 movdqu IV, 0x40(OUTP)
2773 movdqu 0x10(OUTP), INC
2775 movdqu STATE2, 0x10(OUTP)
2777 _aesni_gf128mul_x_ble()
2779 movdqu 0x50(INP), INC
2781 movdqu IV, 0x50(OUTP)
2783 movdqu 0x20(OUTP), INC
2785 movdqu STATE3, 0x20(OUTP)
2787 _aesni_gf128mul_x_ble()
2789 movdqu 0x60(INP), INC
2791 movdqu IV, 0x60(OUTP)
2793 movdqu 0x30(OUTP), INC
2795 movdqu STATE4, 0x30(OUTP)
2797 _aesni_gf128mul_x_ble()
2799 movdqu 0x70(INP), INC
2801 movdqu IV, 0x70(OUTP)
2803 _aesni_gf128mul_x_ble()
2808 movdqu 0x40(OUTP), INC
2810 movdqu STATE1, 0x40(OUTP)
2812 movdqu 0x50(OUTP), INC
2814 movdqu STATE2, 0x50(OUTP)
2816 movdqu 0x60(OUTP), INC
2818 movdqu STATE3, 0x60(OUTP)
2820 movdqu 0x70(OUTP), INC
2822 movdqu STATE4, 0x70(OUTP)
2826 SYM_FUNC_END(aesni_xts_crypt8)
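#
# What _aesni_gf128mul_x_ble computes, in C (a sketch; it mirrors the
# kernel's generic gf128mul_x_ble() helper): multiply the XTS tweak by x in
# GF(2^128), little-endian block convention:
#
#	carry = t[1] >> 63;			/* top bit of the tweak      */
#	t[1]  = (t[1] << 1) | (t[0] >> 63);
#	t[0]  = (t[0] << 1) ^ (carry ? 0x87 : 0);
#
# t[0] is the low 64 bits of the tweak, t[1] the high 64 bits; 0x87 is the
# low byte of the reduction polynomial x^128 + x^7 + x^2 + x + 1, which is
# exactly what the .Lgf128mul_x_ble_mask constant above encodes.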