1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
28 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
47 ## This code was derived and highly optimized from the code described in the paper:
48 ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation are explained in:
51 ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector; if AAD is 8 bytes, AAD = {A0, A1}
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports an aadLen of 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
118 ## throughout the code, one-tab and two-tab indentation is used: one tab is
119 ## for the GHASH part, two tabs are for the AES part.
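##
## The pre-counter block described in the layout above is simply the 4-byte
## salt from the SA, followed by the 8-byte IV/sequence number from the
## packet, followed by the constant 0x00000001. A hedged C sketch of how a
## caller could assemble it (illustrative only; not part of this file, and
## the helper name is hypothetical):
##
##      #include <stdint.h>
##      #include <string.h>
##
##      /* out becomes the 16-byte pre-counter block handed to this code */
##      static void rfc4106_build_j0(uint8_t out[16],
##                                   const uint8_t salt[4], const uint8_t iv[8])
##      {
##              memcpy(out, salt, 4);           /* Salt (from the SA)        */
##              memcpy(out + 4, iv, 8);         /* IV from the ESP payload   */
##              out[12] = 0; out[13] = 0;       /* 32-bit block counter,     */
##              out[14] = 0; out[15] = 1;       /* big-endian, starting at 1 */
##      }
##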
122 #include <linux/linkage.h>
124 # constants in mergeable sections, linker can reorder and merge
125 .section .rodata.cst16.POLY, "aM", @progbits, 16
127 POLY: .octa 0xC2000000000000000000000000000001
129 .section .rodata.cst16.POLY2, "aM", @progbits, 16
131 POLY2: .octa 0xC20000000000000000000001C2000000
133 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
135 TWOONE: .octa 0x00000001000000000000000000000001
137 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141 .section .rodata.cst16.ONE, "aM", @progbits, 16
143 ONE: .octa 0x00000000000000000000000000000001
145 .section .rodata.cst16.ONEf, "aM", @progbits, 16
147 ONEf: .octa 0x01000000000000000000000000000000
149 # order of these constants should not change.
150 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
151 .section .rodata, "a", @progbits
153 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
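# SHIFT_MASK, ALL_F and the all-zero block above are used as one contiguous
# table: a 16-byte load taken at an offset into SHIFT_MASK yields a vpshufb
# control that shifts a register right by whole bytes (indices that land in
# ALL_F have the high bit set and therefore select zero), and a load at the
# matching offset into ALL_F yields the corresponding 0xff/0x00 byte mask.
# A hedged C sketch of the byte-shift effect (illustrative only, not used by
# this file):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       /* shift a 16-byte block right by n bytes (byte 0 = least significant,
#        * as laid out in an xmm register): bytes n..15 move down to offset 0
#        * and the vacated top n bytes become zero */
#       static void shift_right_bytes(uint8_t blk[16], unsigned n /* 0..15 */)
#       {
#               uint8_t tmp[16] = { 0 };
#
#               memcpy(tmp, blk + n, 16 - n);
#               memcpy(blk, tmp, 16);
#       }
#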
159 .type aad_shift_arr, @object
160 .size aad_shift_arr, 272
162 .octa 0xffffffffffffffffffffffffffffffff
163 .octa 0xffffffffffffffffffffffffffffff0C
164 .octa 0xffffffffffffffffffffffffffff0D0C
165 .octa 0xffffffffffffffffffffffffff0E0D0C
166 .octa 0xffffffffffffffffffffffff0F0E0D0C
167 .octa 0xffffffffffffffffffffff0C0B0A0908
168 .octa 0xffffffffffffffffffff0D0C0B0A0908
169 .octa 0xffffffffffffffffff0E0D0C0B0A0908
170 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
171 .octa 0xffffffffffffff0C0B0A090807060504
172 .octa 0xffffffffffff0D0C0B0A090807060504
173 .octa 0xffffffffff0E0D0C0B0A090807060504
174 .octa 0xffffffff0F0E0D0C0B0A090807060504
175 .octa 0xffffff0C0B0A09080706050403020100
176 .octa 0xffff0D0C0B0A09080706050403020100
177 .octa 0xff0E0D0C0B0A09080706050403020100
178 .octa 0x0F0E0D0C0B0A09080706050403020100
186 #define InLen (16*1)+8
187 #define PBlockEncKey 16*2
189 #define CurCount 16*4
190 #define PBlockLen 16*5
192 HashKey = 16*6 # store HashKey <<1 mod poly here
193 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
194 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
195 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
196 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
197 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
198 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
199 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
200 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
201 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
202 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
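# Karatsuba note: writing a = a1:a0 and b = b1:b0 (64-bit halves), the
# 128x128 carry-less product is
#
#       a*b = a1*b1*x^128 + ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 + a0*b0
#
# so caching HashKey_i_k = high64(HashKey_i) ^ low64(HashKey_i) lets the
# middle term of each block's multiply be formed with a single vpclmulqdq.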
215 #define arg7 STACK_OFFSET+8*1(%r14)
216 #define arg8 STACK_OFFSET+8*2(%r14)
217 #define arg9 STACK_OFFSET+8*3(%r14)
218 #define arg10 STACK_OFFSET+8*4(%r14)
219 #define keysize 2*15*16(arg1)
229 .macro define_reg r n
240 # need to push 4 registers onto the stack to maintain
243 TMP1 = 16*0 # Temporary storage for AAD
244 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
245 TMP3 = 16*2 # Temporary storage for AES State 3
246 TMP4 = 16*3 # Temporary storage for AES State 4
247 TMP5 = 16*4 # Temporary storage for AES State 5
248 TMP6 = 16*5 # Temporary storage for AES State 6
249 TMP7 = 16*6 # Temporary storage for AES State 7
250 TMP8 = 16*7 # Temporary storage for AES State 8
252 VARIABLE_OFFSET = 16*8
254 ################################
256 ################################
259 #the number of pushes must equal STACK_OFFSET
269 sub $VARIABLE_OFFSET, %rsp
270 and $~63, %rsp # align rsp to 64 bytes
282 # Encryption of a single block
283 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
284 vpxor (arg1), \XMM0, \XMM0
288 vaesenc 16*i(arg1), \XMM0, \XMM0
292 vaesenclast 16*i(arg1), \XMM0, \XMM0
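# For reference, a hedged C-intrinsics sketch of the same single-block
# encryption (illustrative only; REP is the number of middle rounds,
# i.e. 9, 11 or 13 for AES-128/192/256):
#
#       #include <immintrin.h>
#
#       static __m128i encrypt_single_block(__m128i blk,
#                                           const __m128i *round_keys, int rep)
#       {
#               blk = _mm_xor_si128(blk, round_keys[0]);        /* whitening */
#               for (int i = 1; i <= rep; i++)
#                       blk = _mm_aesenc_si128(blk, round_keys[i]);
#               return _mm_aesenclast_si128(blk, round_keys[rep + 1]);
#       }
#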
295 # combined routine for the GCM encrypt and decrypt functions
296 # clobbers all xmm registers
297 # clobbers r10, r11, r12, r13, r14, r15
298 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
299 vmovdqu AadHash(arg2), %xmm8
300 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
301 add arg5, InLen(arg2)
303 # initialize the data pointer offset as zero
306 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
309 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
310 and $-16, %r13 # r13 = r13 - (r13 mod 16)
315 jz _initial_num_blocks_is_0\@
318 je _initial_num_blocks_is_7\@
320 je _initial_num_blocks_is_6\@
322 je _initial_num_blocks_is_5\@
324 je _initial_num_blocks_is_4\@
326 je _initial_num_blocks_is_3\@
328 je _initial_num_blocks_is_2\@
330 jmp _initial_num_blocks_is_1\@
332 _initial_num_blocks_is_7\@:
333 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335 jmp _initial_blocks_encrypted\@
337 _initial_num_blocks_is_6\@:
338 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
340 jmp _initial_blocks_encrypted\@
342 _initial_num_blocks_is_5\@:
343 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
345 jmp _initial_blocks_encrypted\@
347 _initial_num_blocks_is_4\@:
348 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
350 jmp _initial_blocks_encrypted\@
352 _initial_num_blocks_is_3\@:
353 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355 jmp _initial_blocks_encrypted\@
357 _initial_num_blocks_is_2\@:
358 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
360 jmp _initial_blocks_encrypted\@
362 _initial_num_blocks_is_1\@:
363 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
365 jmp _initial_blocks_encrypted\@
367 _initial_num_blocks_is_0\@:
368 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
371 _initial_blocks_encrypted\@:
373 je _zero_cipher_left\@
376 je _eight_cipher_left\@
383 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
393 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
396 jne _encrypt_by_8_new\@
398 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
399 jmp _eight_cipher_left\@
402 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
404 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
405 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
408 jne _encrypt_by_8_new\@
410 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
415 _eight_cipher_left\@:
416 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
420 vmovdqu %xmm14, AadHash(arg2)
421 vmovdqu %xmm9, CurCount(arg2)
425 and $15, %r13 # r13 = (arg5 mod 16)
427 je _multiple_of_16_bytes\@
429 # handle the last <16 Byte block separately
431 mov %r13, PBlockLen(arg2)
433 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
434 vmovdqu %xmm9, CurCount(arg2)
435 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
437 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
438 vmovdqu %xmm9, PBlockEncKey(arg2)
441 jge _large_enough_update\@
443 lea (arg4,%r11,1), %r10
446 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
448 lea SHIFT_MASK+16(%rip), %r12
449 sub %r13, %r12 # adjust the shuffle mask pointer to be
450 # able to shift 16-r13 bytes (r13 is the
451 # number of bytes in plaintext mod 16)
453 jmp _final_ghash_mul\@
455 _large_enough_update\@:
459 # receive the last <16 Byte block
460 vmovdqu (arg4, %r11, 1), %xmm1
465 lea SHIFT_MASK+16(%rip), %r12
466 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
467 # (r13 is the number of bytes in plaintext mod 16)
469 # get the appropriate shuffle mask
470 vmovdqu (%r12), %xmm2
471 # shift right 16-r13 bytes
472 vpshufb %xmm2, %xmm1, %xmm1
477 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
478 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
479 # mask out top 16-r13 bytes of xmm9
480 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
481 vpand %xmm1, %xmm2, %xmm2
482 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
483 vpxor %xmm2, %xmm14, %xmm14
485 vmovdqu %xmm14, AadHash(arg2)
487 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
488 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
489 # mask out top 16-r13 bytes of xmm9
490 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
491 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
492 vpxor %xmm9, %xmm14, %xmm14
494 vmovdqu %xmm14, AadHash(arg2)
495 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
499 #############################
503 jle _less_than_8_bytes_left\@
505 mov %rax, (arg3 , %r11)
507 vpsrldq $8, %xmm9, %xmm9
511 _less_than_8_bytes_left\@:
512 movb %al, (arg3 , %r11)
516 jne _less_than_8_bytes_left\@
517 #############################
519 _multiple_of_16_bytes\@:
523 # GCM_COMPLETE finishes the tag computation, including the last partial block
524 # Output: Authentication Tag (AUTH_TAG)
525 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
526 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
527 vmovdqu AadHash(arg2), %xmm14
528 vmovdqu HashKey(arg2), %xmm13
530 mov PBlockLen(arg2), %r12
534 #GHASH computation for the last <16 Byte block
535 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
538 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
539 shl $3, %r12 # convert into number of bits
540 vmovd %r12d, %xmm15 # len(A) in xmm15
542 mov InLen(arg2), %r12
543 shl $3, %r12 # len(C) in bits (len(C) * 8)
545 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
546 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
548 vpxor %xmm15, %xmm14, %xmm14
549 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
550 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
552 vmovdqu OrigIV(arg2), %xmm9
554 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
556 vpxor %xmm14, %xmm9, %xmm9
561 mov \AUTH_TAG, %r10 # r10 = authTag
562 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
575 vpsrldq $8, %xmm9, %xmm9
583 vpsrldq $4, %xmm9, %xmm9
600 vmovdqu %xmm9, (%r10)
605 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
607 mov \AAD, %r10 # r10 = AAD
608 mov \AADLEN, %r12 # r12 = aadLen
619 vpshufb SHUF_MASK(%rip), \T7, \T7
621 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
626 jge _get_AAD_blocks\@
633 /* read the last <16B of AAD. since we have at least 4B of
634 data right after the AAD (the ICV, and maybe some CT), we can
635 read 4B/8B blocks safely, and then get rid of the extra stuff */
653 vpslldq $12, \T1, \T1
657 /* finalize: shift out the extra bytes we read, and align
658 left. since pslldq can only shift by an immediate, we use
659 vpshufb and an array of shuffle masks */
662 vmovdqu aad_shift_arr(%r11), \T1
663 vpshufb \T1, \T7, \T7
664 _get_AAD_rest_final\@:
665 vpshufb SHUF_MASK(%rip), \T7, \T7
667 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
670 vmovdqu \T7, AadHash(arg2)
673 .macro INIT GHASH_MUL PRECOMPUTE
675 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
677 mov %r11, InLen(arg2) # ctx_data.in_length = 0
679 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
680 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
683 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
685 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
686 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
688 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
690 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
691 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
693 vpsllq $1, %xmm6, %xmm6
694 vpsrlq $63, %xmm2, %xmm2
696 vpslldq $8, %xmm2, %xmm2
697 vpsrldq $8, %xmm1, %xmm1
698 vpor %xmm2, %xmm6, %xmm6
700 vpshufd $0b00100100, %xmm1, %xmm2
701 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
702 vpand POLY(%rip), %xmm2, %xmm2
703 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
704 #######################################################################
705 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
707 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
709 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
713 # Reads DLEN bytes starting at DPTR and stores in XMMDst
714 # where 0 < DLEN < 16
715 # Clobbers %rax, DLEN
716 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
717 vpxor \XMMDst, \XMMDst, \XMMDst
722 vpinsrq $0, %rax, \XMMDst, \XMMDst
724 jz _done_read_partial_block_\@
728 mov 7(\DPTR, \DLEN, 1), %al
730 jnz _read_next_byte_\@
731 vpinsrq $1, %rax, \XMMDst, \XMMDst
732 jmp _done_read_partial_block_\@
735 _read_next_byte_lt8_\@:
737 mov -1(\DPTR, \DLEN, 1), %al
739 jnz _read_next_byte_lt8_\@
740 vpinsrq $0, %rax, \XMMDst, \XMMDst
741 _done_read_partial_block_\@:
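# Functionally, READ_PARTIAL_BLOCK is a bounded copy into an xmm register.
# A hedged C sketch of the equivalent behaviour (illustrative only; the
# assembly is careful never to touch bytes past DPTR+DLEN):
#
#       #include <stdint.h>
#       #include <string.h>
#
#       static void read_partial_block(const uint8_t *src,
#                                      unsigned len /* 1..15 */, uint8_t dst[16])
#       {
#               memset(dst, 0, 16);
#               memcpy(dst, src, len);  /* the asm uses one 8-byte load plus a
#                                        * byte-by-byte loop for the remainder */
#       }
#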
744 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
745 # between update calls.
746 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
747 # Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
748 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
749 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
751 mov PBlockLen(arg2), %r13
753 je _partial_block_done_\@ # Leave Macro if no partial blocks
754 # Read in input data without over reading
755 cmp $16, \PLAIN_CYPH_LEN
756 jl _fewer_than_16_bytes_\@
757 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
760 _fewer_than_16_bytes_\@:
761 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
762 mov \PLAIN_CYPH_LEN, %r12
763 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
765 mov PBlockLen(arg2), %r13
767 _data_read_\@: # Finished reading in data
769 vmovdqu PBlockEncKey(arg2), %xmm9
770 vmovdqu HashKey(arg2), %xmm13
772 lea SHIFT_MASK(%rip), %r12
774 # adjust the shuffle mask pointer to be able to shift r13 bytes
775 # (r13 is the number of bytes already in the partial block)
777 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
778 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
782 pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn)
784 mov \PLAIN_CYPH_LEN, %r10
786 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
788 # Determine if the partial block is not being filled and
789 # shift the mask accordingly
790 jge _no_extra_mask_1_\@
794 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
795 # get the appropriate mask to mask out bottom r13 bytes of xmm9
796 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
798 vpand %xmm1, %xmm3, %xmm3
799 vmovdqa SHUF_MASK(%rip), %xmm10
800 vpshufb %xmm10, %xmm3, %xmm3
801 vpshufb %xmm2, %xmm3, %xmm3
802 vpxor %xmm3, \AAD_HASH, \AAD_HASH
805 jl _partial_incomplete_1_\@
807 # GHASH computation for the last <16 Byte block
808 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
811 mov %rax, PBlockLen(arg2)
813 _partial_incomplete_1_\@:
814 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
816 vmovdqu \AAD_HASH, AadHash(arg2)
818 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
820 mov \PLAIN_CYPH_LEN, %r10
822 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
824 # Determine if the partial block is not being filled and
825 # shift the mask accordingly
826 jge _no_extra_mask_2_\@
830 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
831 # get the appropriate mask to mask out bottom r13 bytes of xmm9
832 vpand %xmm1, %xmm9, %xmm9
834 vmovdqa SHUF_MASK(%rip), %xmm1
835 vpshufb %xmm1, %xmm9, %xmm9
836 vpshufb %xmm2, %xmm9, %xmm9
837 vpxor %xmm9, \AAD_HASH, \AAD_HASH
840 jl _partial_incomplete_2_\@
842 # GHASH computation for the last <16 Byte block
843 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
846 mov %rax, PBlockLen(arg2)
848 _partial_incomplete_2_\@:
849 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
851 vmovdqu \AAD_HASH, AadHash(arg2)
853 vmovdqa SHUF_MASK(%rip), %xmm10
854 # shuffle xmm9 back to output as ciphertext
855 vpshufb %xmm10, %xmm9, %xmm9
856 vpshufb %xmm2, %xmm9, %xmm9
858 # output encrypted Bytes
863 # Set r13 to be the number of bytes to write out
867 mov \PLAIN_CYPH_LEN, %r13
872 jle _less_than_8_bytes_left_\@
874 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
879 _less_than_8_bytes_left_\@:
880 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
884 jne _less_than_8_bytes_left_\@
885 _partial_block_done_\@:
886 .endm # PARTIAL_BLOCK
888 ###############################################################################
889 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
890 # Input: A and B (128-bits each, bit-reflected)
891 # Output: C = A*B*x mod poly, (i.e. >>1 )
892 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
893 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
894 ###############################################################################
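# Note on the extra factor of x: with bit-reflected operands, a carry-less
# multiply returns the reflected product shifted right by one bit, i.e.
#
#       reflect(A) x reflect(B) = reflect(A*B) >> 1   (viewed as 256 bits)
#
# Supplying HK = HashKey<<1 mod poly folds that >>1 away once, which is why
# the result below is GH*HashKey mod poly with no per-block correction shift.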
895 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
897 vpshufd $0b01001110, \GH, \T2
898 vpshufd $0b01001110, \HK, \T3
899 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
900 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
902 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
903 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
904 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
906 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
908 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
909 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
911 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
913 #first phase of the reduction
914 vpslld $31, \GH, \T2 # packed left shift << 31
915 vpslld $30, \GH, \T3 # packed left shift << 30
916 vpslld $25, \GH, \T4 # packed left shift << 25
918 vpxor \T3, \T2, \T2 # xor the shifted versions
921 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
923 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
924 vpxor \T2, \GH, \GH # first phase of the reduction complete
926 #second phase of the reduction
928 vpsrld $1,\GH, \T2 # packed right shift >> 1
929 vpsrld $2,\GH, \T3 # packed right shift >> 2
930 vpsrld $7,\GH, \T4 # packed right shift >> 7
931 vpxor \T3, \T2, \T2 # xor the shifted versions
936 vpxor \T1, \GH, \GH # the result is in GH
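# A bit-serial reference for the same GF(2^128) product (NIST SP 800-38D,
# Algorithm 1) is useful when validating this macro. Hedged C sketch,
# illustrative only and far slower than the PCLMULQDQ path:
#
#       #include <stdint.h>
#       #include <string.h>
#
#       static void gf128_mul_ref(const uint8_t X[16], const uint8_t Y[16],
#                                 uint8_t out[16])
#       {
#               uint8_t Z[16] = { 0 }, V[16];
#
#               memcpy(V, Y, 16);
#               for (int i = 0; i < 128; i++) {
#                       if (X[i / 8] & (0x80 >> (i % 8)))       /* bit i of X, MSB first */
#                               for (int j = 0; j < 16; j++)
#                                       Z[j] ^= V[j];
#                       int lsb = V[15] & 1;
#                       for (int j = 15; j > 0; j--)            /* V = V*x: shift right */
#                               V[j] = (V[j] >> 1) | (V[j - 1] << 7);
#                       V[0] >>= 1;
#                       if (lsb)
#                               V[0] ^= 0xE1;   /* reduce by x^128 + x^7 + x^2 + x + 1 */
#               }
#               memcpy(out, Z, 16);
#       }
#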
941 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
943 # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
946 vpshufd $0b01001110, \T5, \T1
948 vmovdqu \T1, HashKey_k(arg2)
950 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
951 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
952 vpshufd $0b01001110, \T5, \T1
954 vmovdqu \T1, HashKey_2_k(arg2)
956 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
957 vmovdqu \T5, HashKey_3(arg2)
958 vpshufd $0b01001110, \T5, \T1
960 vmovdqu \T1, HashKey_3_k(arg2)
962 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
963 vmovdqu \T5, HashKey_4(arg2)
964 vpshufd $0b01001110, \T5, \T1
966 vmovdqu \T1, HashKey_4_k(arg2)
968 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
969 vmovdqu \T5, HashKey_5(arg2)
970 vpshufd $0b01001110, \T5, \T1
972 vmovdqu \T1, HashKey_5_k(arg2)
974 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
975 vmovdqu \T5, HashKey_6(arg2)
976 vpshufd $0b01001110, \T5, \T1
978 vmovdqu \T1, HashKey_6_k(arg2)
980 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
981 vmovdqu \T5, HashKey_7(arg2)
982 vpshufd $0b01001110, \T5, \T1
984 vmovdqu \T1, HashKey_7_k(arg2)
986 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
987 vmovdqu \T5, HashKey_8(arg2)
988 vpshufd $0b01001110, \T5, \T1
990 vmovdqu \T1, HashKey_8_k(arg2)
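# At this point the context (arg2) holds HashKey_i = (HashKey^i)<<1 mod poly
# for i = 1..8, built by repeated GHASH_MUL_AVX against HK, plus the matching
# HashKey_i_k = high64 ^ low64 values consumed by the Karatsuba multiplies in
# GHASH_8_ENCRYPT_8_PARALLEL_AVX and GHASH_LAST_8_AVX below.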
994 ## if a = number of total plaintext bytes and b = floor(a/16), then
996 ## num_initial_blocks = b mod 8
997 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
998 ## r10, r11, r12, rax are clobbered
999 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1001 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1002 i = (8-\num_initial_blocks)
1004 vmovdqu AadHash(arg2), reg_i
1006 # start AES for num_initial_blocks blocks
1007 vmovdqu CurCount(arg2), \CTR
1009 i = (9-\num_initial_blocks)
1011 .rep \num_initial_blocks
1012 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1014 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1019 vmovdqa (arg1), \T_key
1020 i = (9-\num_initial_blocks)
1022 .rep \num_initial_blocks
1023 vpxor \T_key, reg_i, reg_i
1031 vmovdqa 16*j(arg1), \T_key
1032 i = (9-\num_initial_blocks)
1034 .rep \num_initial_blocks
1035 vaesenc \T_key, reg_i, reg_i
1044 vmovdqa 16*j(arg1), \T_key
1045 i = (9-\num_initial_blocks)
1047 .rep \num_initial_blocks
1048 vaesenclast \T_key, reg_i, reg_i
1053 i = (9-\num_initial_blocks)
1055 .rep \num_initial_blocks
1056 vmovdqu (arg4, %r11), \T1
1057 vpxor \T1, reg_i, reg_i
1058 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1063 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1069 i = (8-\num_initial_blocks)
1070 j = (9-\num_initial_blocks)
1073 .rep \num_initial_blocks
1074 vpxor reg_i, reg_j, reg_j
1075 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1080 # XMM8 has the combined result here
1082 vmovdqa \XMM8, TMP1(%rsp)
1086 jl _initial_blocks_done\@ # no need for precomputed constants
1088 ###############################################################################
1089 # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
1090 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1092 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1094 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1096 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1098 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1100 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1102 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1104 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1106 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1108 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1110 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1112 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1114 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1116 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1118 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1120 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1122 vmovdqa (arg1), \T_key
1123 vpxor \T_key, \XMM1, \XMM1
1124 vpxor \T_key, \XMM2, \XMM2
1125 vpxor \T_key, \XMM3, \XMM3
1126 vpxor \T_key, \XMM4, \XMM4
1127 vpxor \T_key, \XMM5, \XMM5
1128 vpxor \T_key, \XMM6, \XMM6
1129 vpxor \T_key, \XMM7, \XMM7
1130 vpxor \T_key, \XMM8, \XMM8
1134 .rep \REP # do REP rounds
1135 vmovdqa 16*i(arg1), \T_key
1136 vaesenc \T_key, \XMM1, \XMM1
1137 vaesenc \T_key, \XMM2, \XMM2
1138 vaesenc \T_key, \XMM3, \XMM3
1139 vaesenc \T_key, \XMM4, \XMM4
1140 vaesenc \T_key, \XMM5, \XMM5
1141 vaesenc \T_key, \XMM6, \XMM6
1142 vaesenc \T_key, \XMM7, \XMM7
1143 vaesenc \T_key, \XMM8, \XMM8
1148 vmovdqa 16*i(arg1), \T_key
1149 vaesenclast \T_key, \XMM1, \XMM1
1150 vaesenclast \T_key, \XMM2, \XMM2
1151 vaesenclast \T_key, \XMM3, \XMM3
1152 vaesenclast \T_key, \XMM4, \XMM4
1153 vaesenclast \T_key, \XMM5, \XMM5
1154 vaesenclast \T_key, \XMM6, \XMM6
1155 vaesenclast \T_key, \XMM7, \XMM7
1156 vaesenclast \T_key, \XMM8, \XMM8
1158 vmovdqu (arg4, %r11), \T1
1159 vpxor \T1, \XMM1, \XMM1
1160 vmovdqu \XMM1, (arg3 , %r11)
1165 vmovdqu 16*1(arg4, %r11), \T1
1166 vpxor \T1, \XMM2, \XMM2
1167 vmovdqu \XMM2, 16*1(arg3 , %r11)
1172 vmovdqu 16*2(arg4, %r11), \T1
1173 vpxor \T1, \XMM3, \XMM3
1174 vmovdqu \XMM3, 16*2(arg3 , %r11)
1179 vmovdqu 16*3(arg4, %r11), \T1
1180 vpxor \T1, \XMM4, \XMM4
1181 vmovdqu \XMM4, 16*3(arg3 , %r11)
1186 vmovdqu 16*4(arg4, %r11), \T1
1187 vpxor \T1, \XMM5, \XMM5
1188 vmovdqu \XMM5, 16*4(arg3 , %r11)
1193 vmovdqu 16*5(arg4, %r11), \T1
1194 vpxor \T1, \XMM6, \XMM6
1195 vmovdqu \XMM6, 16*5(arg3 , %r11)
1200 vmovdqu 16*6(arg4, %r11), \T1
1201 vpxor \T1, \XMM7, \XMM7
1202 vmovdqu \XMM7, 16*6(arg3 , %r11)
1207 vmovdqu 16*7(arg4, %r11), \T1
1208 vpxor \T1, \XMM8, \XMM8
1209 vmovdqu \XMM8, 16*7(arg3 , %r11)
1216 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1217 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1218 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1219 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1220 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1221 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1222 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1223 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1224 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1226 ###############################################################################
1228 _initial_blocks_done\@:
1232 # encrypt 8 blocks at a time
1233 # ghash the 8 previously encrypted ciphertext blocks
1234 # arg1, arg3, arg4 are used as pointers only, not modified
1235 # r11 is the data offset value
1236 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1239 vmovdqa \XMM2, TMP2(%rsp)
1240 vmovdqa \XMM3, TMP3(%rsp)
1241 vmovdqa \XMM4, TMP4(%rsp)
1242 vmovdqa \XMM5, TMP5(%rsp)
1243 vmovdqa \XMM6, TMP6(%rsp)
1244 vmovdqa \XMM7, TMP7(%rsp)
1245 vmovdqa \XMM8, TMP8(%rsp)
1247 .if \loop_idx == in_order
1248 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1249 vpaddd ONE(%rip), \XMM1, \XMM2
1250 vpaddd ONE(%rip), \XMM2, \XMM3
1251 vpaddd ONE(%rip), \XMM3, \XMM4
1252 vpaddd ONE(%rip), \XMM4, \XMM5
1253 vpaddd ONE(%rip), \XMM5, \XMM6
1254 vpaddd ONE(%rip), \XMM6, \XMM7
1255 vpaddd ONE(%rip), \XMM7, \XMM8
1258 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1259 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1260 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1261 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1262 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1263 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1264 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1265 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1267 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1268 vpaddd ONEf(%rip), \XMM1, \XMM2
1269 vpaddd ONEf(%rip), \XMM2, \XMM3
1270 vpaddd ONEf(%rip), \XMM3, \XMM4
1271 vpaddd ONEf(%rip), \XMM4, \XMM5
1272 vpaddd ONEf(%rip), \XMM5, \XMM6
1273 vpaddd ONEf(%rip), \XMM6, \XMM7
1274 vpaddd ONEf(%rip), \XMM7, \XMM8
1279 #######################################################################
1282 vpxor \T1, \XMM1, \XMM1
1283 vpxor \T1, \XMM2, \XMM2
1284 vpxor \T1, \XMM3, \XMM3
1285 vpxor \T1, \XMM4, \XMM4
1286 vpxor \T1, \XMM5, \XMM5
1287 vpxor \T1, \XMM6, \XMM6
1288 vpxor \T1, \XMM7, \XMM7
1289 vpxor \T1, \XMM8, \XMM8
1291 #######################################################################
1297 vmovdqu 16*1(arg1), \T1
1298 vaesenc \T1, \XMM1, \XMM1
1299 vaesenc \T1, \XMM2, \XMM2
1300 vaesenc \T1, \XMM3, \XMM3
1301 vaesenc \T1, \XMM4, \XMM4
1302 vaesenc \T1, \XMM5, \XMM5
1303 vaesenc \T1, \XMM6, \XMM6
1304 vaesenc \T1, \XMM7, \XMM7
1305 vaesenc \T1, \XMM8, \XMM8
1307 vmovdqu 16*2(arg1), \T1
1308 vaesenc \T1, \XMM1, \XMM1
1309 vaesenc \T1, \XMM2, \XMM2
1310 vaesenc \T1, \XMM3, \XMM3
1311 vaesenc \T1, \XMM4, \XMM4
1312 vaesenc \T1, \XMM5, \XMM5
1313 vaesenc \T1, \XMM6, \XMM6
1314 vaesenc \T1, \XMM7, \XMM7
1315 vaesenc \T1, \XMM8, \XMM8
1318 #######################################################################
1320 vmovdqu HashKey_8(arg2), \T5
1321 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1322 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1324 vpshufd $0b01001110, \T2, \T6
1327 vmovdqu HashKey_8_k(arg2), \T5
1328 vpclmulqdq $0x00, \T5, \T6, \T6
1330 vmovdqu 16*3(arg1), \T1
1331 vaesenc \T1, \XMM1, \XMM1
1332 vaesenc \T1, \XMM2, \XMM2
1333 vaesenc \T1, \XMM3, \XMM3
1334 vaesenc \T1, \XMM4, \XMM4
1335 vaesenc \T1, \XMM5, \XMM5
1336 vaesenc \T1, \XMM6, \XMM6
1337 vaesenc \T1, \XMM7, \XMM7
1338 vaesenc \T1, \XMM8, \XMM8
1340 vmovdqa TMP2(%rsp), \T1
1341 vmovdqu HashKey_7(arg2), \T5
1342 vpclmulqdq $0x11, \T5, \T1, \T3
1344 vpclmulqdq $0x00, \T5, \T1, \T3
1347 vpshufd $0b01001110, \T1, \T3
1349 vmovdqu HashKey_7_k(arg2), \T5
1350 vpclmulqdq $0x10, \T5, \T3, \T3
1353 vmovdqu 16*4(arg1), \T1
1354 vaesenc \T1, \XMM1, \XMM1
1355 vaesenc \T1, \XMM2, \XMM2
1356 vaesenc \T1, \XMM3, \XMM3
1357 vaesenc \T1, \XMM4, \XMM4
1358 vaesenc \T1, \XMM5, \XMM5
1359 vaesenc \T1, \XMM6, \XMM6
1360 vaesenc \T1, \XMM7, \XMM7
1361 vaesenc \T1, \XMM8, \XMM8
1363 #######################################################################
1365 vmovdqa TMP3(%rsp), \T1
1366 vmovdqu HashKey_6(arg2), \T5
1367 vpclmulqdq $0x11, \T5, \T1, \T3
1369 vpclmulqdq $0x00, \T5, \T1, \T3
1372 vpshufd $0b01001110, \T1, \T3
1374 vmovdqu HashKey_6_k(arg2), \T5
1375 vpclmulqdq $0x10, \T5, \T3, \T3
1378 vmovdqu 16*5(arg1), \T1
1379 vaesenc \T1, \XMM1, \XMM1
1380 vaesenc \T1, \XMM2, \XMM2
1381 vaesenc \T1, \XMM3, \XMM3
1382 vaesenc \T1, \XMM4, \XMM4
1383 vaesenc \T1, \XMM5, \XMM5
1384 vaesenc \T1, \XMM6, \XMM6
1385 vaesenc \T1, \XMM7, \XMM7
1386 vaesenc \T1, \XMM8, \XMM8
1388 vmovdqa TMP4(%rsp), \T1
1389 vmovdqu HashKey_5(arg2), \T5
1390 vpclmulqdq $0x11, \T5, \T1, \T3
1392 vpclmulqdq $0x00, \T5, \T1, \T3
1395 vpshufd $0b01001110, \T1, \T3
1397 vmovdqu HashKey_5_k(arg2), \T5
1398 vpclmulqdq $0x10, \T5, \T3, \T3
1401 vmovdqu 16*6(arg1), \T1
1402 vaesenc \T1, \XMM1, \XMM1
1403 vaesenc \T1, \XMM2, \XMM2
1404 vaesenc \T1, \XMM3, \XMM3
1405 vaesenc \T1, \XMM4, \XMM4
1406 vaesenc \T1, \XMM5, \XMM5
1407 vaesenc \T1, \XMM6, \XMM6
1408 vaesenc \T1, \XMM7, \XMM7
1409 vaesenc \T1, \XMM8, \XMM8
1412 vmovdqa TMP5(%rsp), \T1
1413 vmovdqu HashKey_4(arg2), \T5
1414 vpclmulqdq $0x11, \T5, \T1, \T3
1416 vpclmulqdq $0x00, \T5, \T1, \T3
1419 vpshufd $0b01001110, \T1, \T3
1421 vmovdqu HashKey_4_k(arg2), \T5
1422 vpclmulqdq $0x10, \T5, \T3, \T3
1425 vmovdqu 16*7(arg1), \T1
1426 vaesenc \T1, \XMM1, \XMM1
1427 vaesenc \T1, \XMM2, \XMM2
1428 vaesenc \T1, \XMM3, \XMM3
1429 vaesenc \T1, \XMM4, \XMM4
1430 vaesenc \T1, \XMM5, \XMM5
1431 vaesenc \T1, \XMM6, \XMM6
1432 vaesenc \T1, \XMM7, \XMM7
1433 vaesenc \T1, \XMM8, \XMM8
1435 vmovdqa TMP6(%rsp), \T1
1436 vmovdqu HashKey_3(arg2), \T5
1437 vpclmulqdq $0x11, \T5, \T1, \T3
1439 vpclmulqdq $0x00, \T5, \T1, \T3
1442 vpshufd $0b01001110, \T1, \T3
1444 vmovdqu HashKey_3_k(arg2), \T5
1445 vpclmulqdq $0x10, \T5, \T3, \T3
1449 vmovdqu 16*8(arg1), \T1
1450 vaesenc \T1, \XMM1, \XMM1
1451 vaesenc \T1, \XMM2, \XMM2
1452 vaesenc \T1, \XMM3, \XMM3
1453 vaesenc \T1, \XMM4, \XMM4
1454 vaesenc \T1, \XMM5, \XMM5
1455 vaesenc \T1, \XMM6, \XMM6
1456 vaesenc \T1, \XMM7, \XMM7
1457 vaesenc \T1, \XMM8, \XMM8
1459 vmovdqa TMP7(%rsp), \T1
1460 vmovdqu HashKey_2(arg2), \T5
1461 vpclmulqdq $0x11, \T5, \T1, \T3
1463 vpclmulqdq $0x00, \T5, \T1, \T3
1466 vpshufd $0b01001110, \T1, \T3
1468 vmovdqu HashKey_2_k(arg2), \T5
1469 vpclmulqdq $0x10, \T5, \T3, \T3
1472 #######################################################################
1474 vmovdqu 16*9(arg1), \T5
1475 vaesenc \T5, \XMM1, \XMM1
1476 vaesenc \T5, \XMM2, \XMM2
1477 vaesenc \T5, \XMM3, \XMM3
1478 vaesenc \T5, \XMM4, \XMM4
1479 vaesenc \T5, \XMM5, \XMM5
1480 vaesenc \T5, \XMM6, \XMM6
1481 vaesenc \T5, \XMM7, \XMM7
1482 vaesenc \T5, \XMM8, \XMM8
1484 vmovdqa TMP8(%rsp), \T1
1485 vmovdqu HashKey(arg2), \T5
1486 vpclmulqdq $0x11, \T5, \T1, \T3
1488 vpclmulqdq $0x00, \T5, \T1, \T3
1491 vpshufd $0b01001110, \T1, \T3
1493 vmovdqu HashKey_k(arg2), \T5
1494 vpclmulqdq $0x10, \T5, \T3, \T3
1500 vmovdqu 16*10(arg1), \T5
1506 vaesenc \T5, \XMM1, \XMM1
1507 vaesenc \T5, \XMM2, \XMM2
1508 vaesenc \T5, \XMM3, \XMM3
1509 vaesenc \T5, \XMM4, \XMM4
1510 vaesenc \T5, \XMM5, \XMM5
1511 vaesenc \T5, \XMM6, \XMM6
1512 vaesenc \T5, \XMM7, \XMM7
1513 vaesenc \T5, \XMM8, \XMM8
1515 vmovdqu 16*i(arg1), \T5
1524 vpxor 16*i(arg4, %r11), \T5, \T2
1526 vaesenclast \T2, reg_j, reg_j
1528 vaesenclast \T2, reg_j, \T3
1529 vmovdqu 16*i(arg4, %r11), reg_j
1530 vmovdqu \T3, 16*i(arg3, %r11)
1536 #######################################################################
1539 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1540 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1542 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1546 #######################################################################
1547 #first phase of the reduction
1548 #######################################################################
1549 vpslld $31, \T7, \T2 # packed left shift << 31
1550 vpslld $30, \T7, \T3 # packed left shift << 30
1551 vpslld $25, \T7, \T4 # packed left shift << 25
1553 vpxor \T3, \T2, \T2 # xor the shifted versions
1556 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1558 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1559 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1560 #######################################################################
1562 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1563 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1564 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1565 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1566 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1567 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1568 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1569 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1572 #######################################################################
1573 #second phase of the reduction
1574 vpsrld $1, \T7, \T2 # packed right shift >> 1
1575 vpsrld $2, \T7, \T3 # packed right shift >> 2
1576 vpsrld $7, \T7, \T4 # packed right shift >> 7
1577 vpxor \T3, \T2, \T2 # xor the shifted versions
1582 vpxor \T7, \T6, \T6 # the result is in T6
1583 #######################################################################
1585 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1586 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1587 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1588 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1589 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1590 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1591 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1592 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1595 vpxor \T6, \XMM1, \XMM1
1602 # GHASH the last 8 ciphertext blocks.
1603 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1608 vpshufd $0b01001110, \XMM1, \T2
1609 vpxor \XMM1, \T2, \T2
1610 vmovdqu HashKey_8(arg2), \T5
1611 vpclmulqdq $0x11, \T5, \XMM1, \T6
1612 vpclmulqdq $0x00, \T5, \XMM1, \T7
1614 vmovdqu HashKey_8_k(arg2), \T3
1615 vpclmulqdq $0x00, \T3, \T2, \XMM1
1617 ######################
1619 vpshufd $0b01001110, \XMM2, \T2
1620 vpxor \XMM2, \T2, \T2
1621 vmovdqu HashKey_7(arg2), \T5
1622 vpclmulqdq $0x11, \T5, \XMM2, \T4
1625 vpclmulqdq $0x00, \T5, \XMM2, \T4
1628 vmovdqu HashKey_7_k(arg2), \T3
1629 vpclmulqdq $0x00, \T3, \T2, \T2
1630 vpxor \T2, \XMM1, \XMM1
1632 ######################
1634 vpshufd $0b01001110, \XMM3, \T2
1635 vpxor \XMM3, \T2, \T2
1636 vmovdqu HashKey_6(arg2), \T5
1637 vpclmulqdq $0x11, \T5, \XMM3, \T4
1640 vpclmulqdq $0x00, \T5, \XMM3, \T4
1643 vmovdqu HashKey_6_k(arg2), \T3
1644 vpclmulqdq $0x00, \T3, \T2, \T2
1645 vpxor \T2, \XMM1, \XMM1
1647 ######################
1649 vpshufd $0b01001110, \XMM4, \T2
1650 vpxor \XMM4, \T2, \T2
1651 vmovdqu HashKey_5(arg2), \T5
1652 vpclmulqdq $0x11, \T5, \XMM4, \T4
1655 vpclmulqdq $0x00, \T5, \XMM4, \T4
1658 vmovdqu HashKey_5_k(arg2), \T3
1659 vpclmulqdq $0x00, \T3, \T2, \T2
1660 vpxor \T2, \XMM1, \XMM1
1662 ######################
1664 vpshufd $0b01001110, \XMM5, \T2
1665 vpxor \XMM5, \T2, \T2
1666 vmovdqu HashKey_4(arg2), \T5
1667 vpclmulqdq $0x11, \T5, \XMM5, \T4
1670 vpclmulqdq $0x00, \T5, \XMM5, \T4
1673 vmovdqu HashKey_4_k(arg2), \T3
1674 vpclmulqdq $0x00, \T3, \T2, \T2
1675 vpxor \T2, \XMM1, \XMM1
1677 ######################
1679 vpshufd $0b01001110, \XMM6, \T2
1680 vpxor \XMM6, \T2, \T2
1681 vmovdqu HashKey_3(arg2), \T5
1682 vpclmulqdq $0x11, \T5, \XMM6, \T4
1685 vpclmulqdq $0x00, \T5, \XMM6, \T4
1688 vmovdqu HashKey_3_k(arg2), \T3
1689 vpclmulqdq $0x00, \T3, \T2, \T2
1690 vpxor \T2, \XMM1, \XMM1
1692 ######################
1694 vpshufd $0b01001110, \XMM7, \T2
1695 vpxor \XMM7, \T2, \T2
1696 vmovdqu HashKey_2(arg2), \T5
1697 vpclmulqdq $0x11, \T5, \XMM7, \T4
1700 vpclmulqdq $0x00, \T5, \XMM7, \T4
1703 vmovdqu HashKey_2_k(arg2), \T3
1704 vpclmulqdq $0x00, \T3, \T2, \T2
1705 vpxor \T2, \XMM1, \XMM1
1707 ######################
1709 vpshufd $0b01001110, \XMM8, \T2
1710 vpxor \XMM8, \T2, \T2
1711 vmovdqu HashKey(arg2), \T5
1712 vpclmulqdq $0x11, \T5, \XMM8, \T4
1715 vpclmulqdq $0x00, \T5, \XMM8, \T4
1718 vmovdqu HashKey_k(arg2), \T3
1719 vpclmulqdq $0x00, \T3, \T2, \T2
1721 vpxor \T2, \XMM1, \XMM1
1722 vpxor \T6, \XMM1, \XMM1
1723 vpxor \T7, \XMM1, \T2
1728 vpslldq $8, \T2, \T4
1729 vpsrldq $8, \T2, \T2
1732 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1733 # the accumulated carry-less multiplications
1735 #######################################################################
1736 #first phase of the reduction
1737 vpslld $31, \T7, \T2 # packed left shift << 31
1738 vpslld $30, \T7, \T3 # packed left shift << 30
1739 vpslld $25, \T7, \T4 # packed left shift << 25
1741 vpxor \T3, \T2, \T2 # xor the shifted versions
1744 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1746 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1747 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1748 #######################################################################
1751 #second phase of the reduction
1752 vpsrld $1, \T7, \T2 # packed right shift >> 1
1753 vpsrld $2, \T7, \T3 # packed right shift >> 2
1754 vpsrld $7, \T7, \T4 # packed right shift >> 7
1755 vpxor \T3, \T2, \T2 # xor the shifted versions
1760 vpxor \T7, \T6, \T6 # the result is in T6
1764 #############################################################
1765 #void aesni_gcm_init_avx_gen2
1766 # (gcm_data *my_ctx_data,
1767 # gcm_context_data *data,
1768 # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1769 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1770 # (from Security Association) concatenated with 8 byte
1771 # Initialisation Vector (from IPSec ESP Payload)
1772 # concatenated with 0x00000001. 16-byte aligned pointer. */
1773 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1774 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1775 #############################################################
1776 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1778 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1781 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1783 ###############################################################################
1784 #void aesni_gcm_enc_update_avx_gen2(
1785 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1786 # gcm_context_data *data,
1787 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1788 # const u8 *in, /* Plaintext input */
1789 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
1790 ###############################################################################
1791 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1795 je key_256_enc_update
1797 je key_128_enc_update
1799 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1803 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1807 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1810 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1812 ###############################################################################
1813 #void aesni_gcm_dec_update_avx_gen2(
1814 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1815 # gcm_context_data *data,
1816 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1817 # const u8 *in, /* Ciphertext input */
1818 # u64 plaintext_len) /* Length of data in Bytes for decryption. */
1819 ###############################################################################
1820 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1824 je key_256_dec_update
1826 je key_128_dec_update
1828 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1832 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1836 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1839 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1841 ###############################################################################
1842 #void aesni_gcm_finalize_avx_gen2(
1843 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1844 # gcm_context_data *data,
1845 # u8 *auth_tag, /* Authenticated Tag output. */
1846 # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1847 # Valid values are 16 (most likely), 12 or 8. */
1848 ###############################################################################
1849 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1857 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1861 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1865 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1868 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
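###############################################################################
# Typical use of the gen2 entry points above, as a hedged C sketch (assumes
# matching C declarations as in the kernel glue code; aes_ctx is the expanded
# key context passed as arg1, data the gcm_context_data updated throughout):
#
#       after aesni_gcm_init_avx_gen2() has stored the hash subkey, counter
#       block and AAD hash, bulk data may be processed in one or more update
#       calls and then finalized:
#
#               aesni_gcm_enc_update_avx_gen2(aes_ctx, &data, out, in, len1);
#               aesni_gcm_enc_update_avx_gen2(aes_ctx, &data, out + len1,
#                                             in + len1, len2);
#               aesni_gcm_finalize_avx_gen2(aes_ctx, &data, auth_tag, 16);
###############################################################################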
1870 ###############################################################################
1871 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1872 # Input: A and B (128-bits each, bit-reflected)
1873 # Output: C = A*B*x mod poly, (i.e. >>1 )
1874 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1875 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1876 ###############################################################################
1877 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1879 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1880 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1881 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1882 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1886 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1887 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1892 #######################################################################
1893 #first phase of the reduction
1894 vmovdqa POLY2(%rip), \T3
1896 vpclmulqdq $0x01, \GH, \T3, \T2
1897 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1899 vpxor \T2, \GH, \GH # first phase of the reduction complete
1900 #######################################################################
1901 #second phase of the reduction
1902 vpclmulqdq $0x00, \GH, \T3, \T2
1903 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1905 vpclmulqdq $0x10, \GH, \T3, \GH
1906 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1908 vpxor \T2, \GH, \GH # second phase of the reduction complete
1909 #######################################################################
1910 vpxor \T1, \GH, \GH # the result is in GH
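# Unlike GHASH_MUL_AVX above, the reduction here uses two further carry-less
# multiplies against POLY2 (the folded reflected reduction constant) instead
# of the shift/xor sequence: the low quadword of the product is folded up in
# the first phase and folded again in the second, trading vector shifts for
# extra PCLMULQDQ work.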
1915 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1917 # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
1919 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1920 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1922 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1923 vmovdqu \T5, HashKey_3(arg2)
1925 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1926 vmovdqu \T5, HashKey_4(arg2)
1928 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1929 vmovdqu \T5, HashKey_5(arg2)
1931 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1932 vmovdqu \T5, HashKey_6(arg2)
1934 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1935 vmovdqu \T5, HashKey_7(arg2)
1937 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1938 vmovdqu \T5, HashKey_8(arg2)
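# Note: no HashKey_i_k values are stored in this variant. The AVX2 path
# computes all four 64x64 partial products directly (vpclmulqdq with
# immediates 0x11, 0x00, 0x01 and 0x10), so the Karatsuba helper table used
# by the AVX path above is not needed.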
1942 ## if a = number of total plaintext bytes and b = floor(a/16), then
1944 ## num_initial_blocks = b mod 8
1945 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1946 ## r10, r11, r12, rax are clobbered
1947 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1949 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1950 i = (8-\num_initial_blocks)
1952 vmovdqu AadHash(arg2), reg_i
1954 # start AES for num_initial_blocks blocks
1955 vmovdqu CurCount(arg2), \CTR
1957 i = (9-\num_initial_blocks)
1959 .rep \num_initial_blocks
1960 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1962 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1967 vmovdqa (arg1), \T_key
1968 i = (9-\num_initial_blocks)
1970 .rep \num_initial_blocks
1971 vpxor \T_key, reg_i, reg_i
1979 vmovdqa 16*j(arg1), \T_key
1980 i = (9-\num_initial_blocks)
1982 .rep \num_initial_blocks
1983 vaesenc \T_key, reg_i, reg_i
1993 vmovdqa 16*j(arg1), \T_key
1994 i = (9-\num_initial_blocks)
1996 .rep \num_initial_blocks
1997 vaesenclast \T_key, reg_i, reg_i
2002 i = (9-\num_initial_blocks)
2004 .rep \num_initial_blocks
2005 vmovdqu (arg4, %r11), \T1
2006 vpxor \T1, reg_i, reg_i
2007 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
2008 # num_initial_blocks blocks
2013 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2019 i = (8-\num_initial_blocks)
2020 j = (9-\num_initial_blocks)
2023 .rep \num_initial_blocks
2024 vpxor reg_i, reg_j, reg_j
2025 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2030 # XMM8 has the combined result here
2032 vmovdqa \XMM8, TMP1(%rsp)
2036 jl _initial_blocks_done\@ # no need for precomputed constants
2038 ###############################################################################
2039 # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
2040 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2042 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2044 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2046 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2048 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2050 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2052 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2054 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2056 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2058 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2060 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2062 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2064 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2066 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2068 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2070 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2072 vmovdqa (arg1), \T_key
2073 vpxor \T_key, \XMM1, \XMM1
2074 vpxor \T_key, \XMM2, \XMM2
2075 vpxor \T_key, \XMM3, \XMM3
2076 vpxor \T_key, \XMM4, \XMM4
2077 vpxor \T_key, \XMM5, \XMM5
2078 vpxor \T_key, \XMM6, \XMM6
2079 vpxor \T_key, \XMM7, \XMM7
2080 vpxor \T_key, \XMM8, \XMM8
2084 .rep \REP # do REP rounds
2085 vmovdqa 16*i(arg1), \T_key
2086 vaesenc \T_key, \XMM1, \XMM1
2087 vaesenc \T_key, \XMM2, \XMM2
2088 vaesenc \T_key, \XMM3, \XMM3
2089 vaesenc \T_key, \XMM4, \XMM4
2090 vaesenc \T_key, \XMM5, \XMM5
2091 vaesenc \T_key, \XMM6, \XMM6
2092 vaesenc \T_key, \XMM7, \XMM7
2093 vaesenc \T_key, \XMM8, \XMM8
2099 vmovdqa 16*i(arg1), \T_key
2100 vaesenclast \T_key, \XMM1, \XMM1
2101 vaesenclast \T_key, \XMM2, \XMM2
2102 vaesenclast \T_key, \XMM3, \XMM3
2103 vaesenclast \T_key, \XMM4, \XMM4
2104 vaesenclast \T_key, \XMM5, \XMM5
2105 vaesenclast \T_key, \XMM6, \XMM6
2106 vaesenclast \T_key, \XMM7, \XMM7
2107 vaesenclast \T_key, \XMM8, \XMM8
2109 vmovdqu (arg4, %r11), \T1
2110 vpxor \T1, \XMM1, \XMM1
2111 vmovdqu \XMM1, (arg3 , %r11)
2116 vmovdqu 16*1(arg4, %r11), \T1
2117 vpxor \T1, \XMM2, \XMM2
2118 vmovdqu \XMM2, 16*1(arg3 , %r11)
2123 vmovdqu 16*2(arg4, %r11), \T1
2124 vpxor \T1, \XMM3, \XMM3
2125 vmovdqu \XMM3, 16*2(arg3 , %r11)
2130 vmovdqu 16*3(arg4, %r11), \T1
2131 vpxor \T1, \XMM4, \XMM4
2132 vmovdqu \XMM4, 16*3(arg3 , %r11)
2137 vmovdqu 16*4(arg4, %r11), \T1
2138 vpxor \T1, \XMM5, \XMM5
2139 vmovdqu \XMM5, 16*4(arg3 , %r11)
2144 vmovdqu 16*5(arg4, %r11), \T1
2145 vpxor \T1, \XMM6, \XMM6
2146 vmovdqu \XMM6, 16*5(arg3 , %r11)
2151 vmovdqu 16*6(arg4, %r11), \T1
2152 vpxor \T1, \XMM7, \XMM7
2153 vmovdqu \XMM7, 16*6(arg3 , %r11)
2158 vmovdqu 16*7(arg4, %r11), \T1
2159 vpxor \T1, \XMM8, \XMM8
2160 vmovdqu \XMM8, 16*7(arg3 , %r11)
2167 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2168 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2169 # the corresponding ciphertext
2170 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2171 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2172 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2173 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2174 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2175 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2176 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2178 ###############################################################################
2180 _initial_blocks_done\@:
2187 # encrypt 8 blocks at a time
2188 # ghash the 8 previously encrypted ciphertext blocks
2189 # arg1, arg3, arg4 are used as pointers only, not modified
2190 # r11 is the data offset value
2191 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2194 vmovdqa \XMM2, TMP2(%rsp)
2195 vmovdqa \XMM3, TMP3(%rsp)
2196 vmovdqa \XMM4, TMP4(%rsp)
2197 vmovdqa \XMM5, TMP5(%rsp)
2198 vmovdqa \XMM6, TMP6(%rsp)
2199 vmovdqa \XMM7, TMP7(%rsp)
2200 vmovdqa \XMM8, TMP8(%rsp)
.if \loop_idx == in_order
        vpaddd          ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd          ONE(%rip), \XMM1, \XMM2
        vpaddd          ONE(%rip), \XMM2, \XMM3
        vpaddd          ONE(%rip), \XMM3, \XMM4
        vpaddd          ONE(%rip), \XMM4, \XMM5
        vpaddd          ONE(%rip), \XMM5, \XMM6
        vpaddd          ONE(%rip), \XMM6, \XMM7
        vpaddd          ONE(%rip), \XMM7, \XMM8

        vpshufb         SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd          ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd          ONEf(%rip), \XMM1, \XMM2
        vpaddd          ONEf(%rip), \XMM2, \XMM3
        vpaddd          ONEf(%rip), \XMM3, \XMM4
        vpaddd          ONEf(%rip), \XMM4, \XMM5
        vpaddd          ONEf(%rip), \XMM5, \XMM6
        vpaddd          ONEf(%rip), \XMM6, \XMM7
        vpaddd          ONEf(%rip), \XMM7, \XMM8
.endif
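        # In the in_order case the counters are kept in little-endian form,
        # incremented with ONE and then byte-reflected with SHUF_MASK before
        # they are fed to the AES rounds; otherwise the counters are already
        # byte-reflected and ONEf increments them in place, so no shuffle is
        # needed.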
        #######################################################################

        vmovdqu         (arg1), \T1                     # round-0 key: initial AddRoundKey
        vpxor           \T1, \XMM1, \XMM1
        vpxor           \T1, \XMM2, \XMM2
        vpxor           \T1, \XMM3, \XMM3
        vpxor           \T1, \XMM4, \XMM4
        vpxor           \T1, \XMM5, \XMM5
        vpxor           \T1, \XMM6, \XMM6
        vpxor           \T1, \XMM7, \XMM7
        vpxor           \T1, \XMM8, \XMM8

        #######################################################################
        vmovdqu         16*1(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        vmovdqu         16*2(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4            # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7            # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6            # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5            # T5 = a0*b1
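        # Each 128x128-bit GF(2) multiply of a ciphertext block with a
        # hash-key power is built from four 64x64-bit vpclmulqdq products:
        # hi*hi, lo*lo and the two cross terms. Block 1 (saved in \T2 above)
        # starts the accumulation; the remaining seven blocks are multiplied
        # against HashKey_7 .. HashKey below and folded into the same running
        # sums, so a single reduction at the end suffices.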
        vmovdqu         16*3(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu         16*4(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu         16*5(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu         16*6(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu         16*7(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu         16*8(arg1), \T1
        vaesenc         \T1, \XMM1, \XMM1
        vaesenc         \T1, \XMM2, \XMM2
        vaesenc         \T1, \XMM3, \XMM3
        vaesenc         \T1, \XMM4, \XMM4
        vaesenc         \T1, \XMM5, \XMM5
        vaesenc         \T1, \XMM6, \XMM6
        vaesenc         \T1, \XMM7, \XMM7
        vaesenc         \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3

        #######################################################################

        vmovdqu         16*9(arg1), \T5
        vaesenc         \T5, \XMM1, \XMM1
        vaesenc         \T5, \XMM2, \XMM2
        vaesenc         \T5, \XMM3, \XMM3
        vaesenc         \T5, \XMM4, \XMM4
        vaesenc         \T5, \XMM5, \XMM5
        vaesenc         \T5, \XMM6, \XMM6
        vaesenc         \T5, \XMM7, \XMM7
        vaesenc         \T5, \XMM8, \XMM8

        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vmovdqu         16*10(arg1), \T5

        vaesenc         \T5, \XMM1, \XMM1
        vaesenc         \T5, \XMM2, \XMM2
        vaesenc         \T5, \XMM3, \XMM3
        vaesenc         \T5, \XMM4, \XMM4
        vaesenc         \T5, \XMM5, \XMM5
        vaesenc         \T5, \XMM6, \XMM6
        vaesenc         \T5, \XMM7, \XMM7
        vaesenc         \T5, \XMM8, \XMM8

        vmovdqu         16*i(arg1), \T5

        vpxor           16*i(arg4, %r11), \T5, \T2
        vaesenclast     \T2, reg_j, reg_j
        vaesenclast     \T2, reg_j, \T3
        vmovdqu         16*i(arg4, %r11), reg_j
        vmovdqu         \T3, 16*i(arg3, %r11)

        #######################################################################
        vpslldq         $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq         $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor           \T6, \T1, \T1                   # accumulate the results in T1:T7

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
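        # The 256-bit carry-less product accumulated in <T1:T7> is reduced
        # modulo the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1. The
        # two phases (above and below) fold the low 128 bits (T7) back into
        # the high 128 bits (T1) using POLY2, interleaved with the ciphertext
        # stores to hide the pclmulqdq latency.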
        vmovdqu         \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu         \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu         \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu         \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu         \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu         \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu         \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu         \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T1, \T1                   # the result is in T1

        vpshufb         SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb         SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpxor           \T1, \XMM1, \XMM1               # fold the GHASH result into block 1 for the next round
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
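        # Unlike the parallel loop above, each block here is multiplied with
        # its hash-key power using Karatsuba: the vpshufd/vpxor pairs form the
        # (hi xor lo) halves of the block and of the key, so only three
        # vpclmulqdq products are needed per block. The high products
        # accumulate in \T6, the low products in \T7 and the middle products
        # in \XMM1; the middle term is split and folded into <T6:T7> before
        # the final reduction.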
        vmovdqu         HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqu         HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpclmulqdq      $0x00, \T5, \XMM2, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpclmulqdq      $0x00, \T5, \XMM3, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpclmulqdq      $0x00, \T5, \XMM4, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpclmulqdq      $0x00, \T5, \XMM5, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpclmulqdq      $0x00, \T5, \XMM6, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpclmulqdq      $0x00, \T5, \XMM7, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpclmulqdq      $0x00, \T5, \XMM8, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2

        vpslldq         $8, \T2, \T4
        vpsrldq         $8, \T2, \T2

        vpxor           \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                        # accumulated carry-less multiplications
        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6                   # the result is in T6
#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#                         (from Security Association) concatenated with 8 byte
#                         Initialisation Vector (from IPSec ESP Payload)
#                         concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const u8 *aad, /* Additional Authentication Data (AAD) */
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
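# The three entry points below are typically used as a sequence: init sets up
# the counter block and the hash-subkey powers, one or more *_update calls
# process the bulk data, and finalize produces the authentication tag.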
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
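# The last argument to GCM_ENC_DEC is the number of full AES rounds to run:
# 9 for AES-128, 11 for AES-192 and 13 for AES-256 (the initial AddRoundKey
# and the final aesenclast round are handled separately), selected by the
# key-length checks below.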
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        je      key_256_enc_update4
        je      key_128_enc_update4
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
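# Decryption reuses the same macros with the DEC selector; the only difference
# is that GHASH is computed over the ciphertext input rather than over the
# newly produced output blocks.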
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        je      key_256_dec_update4
        je      key_128_dec_update4
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
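# GCM_COMPLETE folds the AAD and ciphertext bit lengths into the GHASH,
# combines the result with the encrypted pre-counter block J0, and stores the
# first auth_tag_len bytes as the authentication tag.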
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        je      key_256_finalize4
        je      key_128_finalize4
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)