1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
27 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
47 ## This code was derived and highly optimized from the code described in the paper:
48 ## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation are explained in:
51 ## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
118 ## throughout the code, one-tab and two-tab indentations are used. one tab is
119 ## for the GHASH part, two tabs are for the AES part.
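#
# Reference sketch (C, illustrative only): the GHASH field multiplication this
# file implements with PCLMULQDQ, written as the bit-serial algorithm of
# NIST SP 800-38D. The assembly below works on byte/bit-reflected data (hence
# the reflected polynomial statement above); the sketch uses the standard GCM
# bit order (bit 0 = MSB of byte 0) and the equivalent 0xE1 reduction constant.
#
#	#include <stdint.h>
#
#	typedef struct { uint64_t hi, lo; } be128;  /* hi = bytes 0..7 */
#
#	static be128 gf128_mul(be128 X, be128 Y)    /* Z = X * Y in GF(2^128) */
#	{
#		be128 Z = { 0, 0 };
#		be128 V = X;
#
#		for (int i = 0; i < 128; i++) {
#			uint64_t bit = (i < 64) ? (Y.hi >> (63 - i)) & 1
#						: (Y.lo >> (127 - i)) & 1;
#			if (bit) {                  /* Z ^= V when bit i of Y is set */
#				Z.hi ^= V.hi;
#				Z.lo ^= V.lo;
#			}
#			/* V *= x: a right shift in GCM bit order, then reduce */
#			uint64_t lsb = V.lo & 1;
#			V.lo = (V.lo >> 1) | (V.hi << 63);
#			V.hi >>= 1;
#			if (lsb)
#				V.hi ^= 0xE100000000000000ULL; /* fold x^128 back in */
#		}
#		return Z;
#	}
#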
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
128 POLY: .octa 0xC2000000000000000000000000000001
129 POLY2: .octa 0xC20000000000000000000001C2000000
130 TWOONE: .octa 0x00000001000000000000000000000001
132 # order of these constants should not change.
133 # more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
135 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
136 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
137 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
138 ZERO: .octa 0x00000000000000000000000000000000
139 ONE: .octa 0x00000000000000000000000000000001
140 ONEf: .octa 0x01000000000000000000000000000000
145 ## define the fields of the gcm aes context
147 # u8 expanded_keys[16*11] store expanded keys
148 # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
149 # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
150 # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
151 # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
152 # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
153 # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
154 # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
155 # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
156 # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
157 # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
158 # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
159 # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
160 # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
161 # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
162 # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
163 # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
166 HashKey = 16*11 # store HashKey <<1 mod poly here
167 HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
168 HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
169 HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
170 HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
171 HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
172 HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
173 HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
174 HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
175 HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
176 HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
177 HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
178 HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
179 HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
180 HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
181 HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
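#
# For orientation only, a hypothetical C view of the layout implied by the
# offsets above (field names are illustrative, not the kernel's definition):
#
#	#include <stdint.h>
#
#	struct gcm_data_layout {
#		uint8_t expanded_keys[16 * 11];  /* AES-128 round keys, offset 0         */
#		uint8_t shifted_hkey[8][16];     /* HashKey^i <<1 mod poly, offset 16*11 */
#		uint8_t shifted_hkey_k[8][16];   /* XORed halves (Karatsuba), offset 16*19 */
#	};
#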
189 #define arg7 STACK_OFFSET+8*1(%r14)
190 #define arg8 STACK_OFFSET+8*2(%r14)
191 #define arg9 STACK_OFFSET+8*3(%r14)
201 .macro define_reg r n
212 # need to push 4 registers into stack to maintain
215 TMP1 = 16*0 # Temporary storage for AAD
216 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
217 TMP3 = 16*2 # Temporary storage for AES State 3
218 TMP4 = 16*3 # Temporary storage for AES State 4
219 TMP5 = 16*4 # Temporary storage for AES State 5
220 TMP6 = 16*5 # Temporary storage for AES State 6
221 TMP7 = 16*6 # Temporary storage for AES State 7
222 TMP8 = 16*7 # Temporary storage for AES State 8
224 VARIABLE_OFFSET = 16*8
226 ################################
228 ################################
230 # Encryption of a single block
231 .macro ENCRYPT_SINGLE_BLOCK XMM0
232 vpxor (arg1), \XMM0, \XMM0
236 vaesenc 16*i(arg1), \XMM0, \XMM0
240 vaesenclast 16*10(arg1), \XMM0, \XMM0
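#
# The same single-block encryption with AES-NI intrinsics, as a sketch
# (assumes a 10-round AES-128 key schedule laid out like the one at arg1):
#
#	#include <immintrin.h>
#
#	static __m128i aes128_encrypt_block(const __m128i rk[11], __m128i b)
#	{
#		b = _mm_xor_si128(b, rk[0]);             /* vpxor  (arg1), XMM0, XMM0 */
#		for (int i = 1; i <= 9; i++)
#			b = _mm_aesenc_si128(b, rk[i]);  /* vaesenc 16*i(arg1), ...   */
#		return _mm_aesenclast_si128(b, rk[10]);  /* vaesenclast 16*10(arg1)   */
#	}
#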
244 ###############################################################################
245 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
246 # Input: A and B (128-bits each, bit-reflected)
247 # Output: C = A*B*x mod poly (i.e. >>1)
248 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
249 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
250 ###############################################################################
251 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
253 vpshufd $0b01001110, \GH, \T2
254 vpshufd $0b01001110, \HK, \T3
255 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
256 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
258 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
259 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
260 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
262 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
264 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
265 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
267 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
269 #first phase of the reduction
270 vpslld $31, \GH, \T2 # packed right shifting << 31
271 vpslld $30, \GH, \T3 # packed right shifting << 30
272 vpslld $25, \GH, \T4 # packed right shifting << 25
274 vpxor \T3, \T2, \T2 # xor the shifted versions
277 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
279 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
280 vpxor \T2, \GH, \GH # first phase of the reduction complete
282 #second phase of the reduction
284 vpsrld $1,\GH, \T2 # packed left shifting >> 1
285 vpsrld $2,\GH, \T3 # packed left shifting >> 2
286 vpsrld $7,\GH, \T4 # packed left shifting >> 7
287 vpxor \T3, \T2, \T2 # xor the shifted versions
292 vpxor \T1, \GH, \GH # the result is in GH
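#
# The Karatsuba structure used above, sketched in plain C (clmul64() is a
# bit-serial, non-constant-time stand-in for vpclmulqdq; the shift-based
# reduction that follows the multiply is not repeated here):
#
#	#include <stdint.h>
#
#	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
#	{
#		uint64_t h = 0, l = 0;
#
#		for (int i = 0; i < 64; i++) {
#			if ((b >> i) & 1) {
#				l ^= a << i;
#				if (i)
#					h ^= a >> (64 - i);
#			}
#		}
#		*hi = h;
#		*lo = l;
#	}
#
#	/* 128x128 -> 256-bit carry-less product with three 64x64 multiplies */
#	static void karatsuba_clmul128(const uint64_t a[2], const uint64_t b[2],
#				       uint64_t r[4])
#	{
#		uint64_t hh_hi, hh_lo, ll_hi, ll_lo, mm_hi, mm_lo;
#
#		clmul64(a[1], b[1], &hh_hi, &hh_lo);                /* T1 = a1*b1 */
#		clmul64(a[0], b[0], &ll_hi, &ll_lo);                /* GH = a0*b0 */
#		clmul64(a[1] ^ a[0], b[1] ^ b[0], &mm_hi, &mm_lo);  /* T2         */
#
#		mm_hi ^= hh_hi ^ ll_hi;          /* middle term = T2 ^ T1 ^ GH */
#		mm_lo ^= hh_lo ^ ll_lo;
#
#		r[0] = ll_lo;
#		r[1] = ll_hi ^ mm_lo;
#		r[2] = hh_lo ^ mm_hi;
#		r[3] = hh_hi;
#	}
#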
297 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
299 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
302 vpshufd $0b01001110, \T5, \T1
304 vmovdqa \T1, HashKey_k(arg1)
306 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
307 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
308 vpshufd $0b01001110, \T5, \T1
310 vmovdqa \T1, HashKey_2_k(arg1)
312 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
313 vmovdqa \T5, HashKey_3(arg1)
314 vpshufd $0b01001110, \T5, \T1
316 vmovdqa \T1, HashKey_3_k(arg1)
318 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
319 vmovdqa \T5, HashKey_4(arg1)
320 vpshufd $0b01001110, \T5, \T1
322 vmovdqa \T1, HashKey_4_k(arg1)
324 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
325 vmovdqa \T5, HashKey_5(arg1)
326 vpshufd $0b01001110, \T5, \T1
328 vmovdqa \T1, HashKey_5_k(arg1)
330 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
331 vmovdqa \T5, HashKey_6(arg1)
332 vpshufd $0b01001110, \T5, \T1
334 vmovdqa \T1, HashKey_6_k(arg1)
336 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
337 vmovdqa \T5, HashKey_7(arg1)
338 vpshufd $0b01001110, \T5, \T1
340 vmovdqa \T1, HashKey_7_k(arg1)
342 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
343 vmovdqa \T5, HashKey_8(arg1)
344 vpshufd $0b01001110, \T5, \T1
346 vmovdqa \T1, HashKey_8_k(arg1)
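#
# Conceptual view of what PRECOMPUTE_AVX stores (a sketch reusing the
# gf128_mul()/be128 reference near the top of the file; the real code keeps
# every power in the HashKey<<1 mod poly, bit-reflected domain):
#
#	static void precompute_hash_keys(be128 hkey, be128 pow[9], uint64_t pow_k[9])
#	{
#		pow[1] = hkey;
#		for (int i = 2; i <= 8; i++)
#			pow[i] = gf128_mul(pow[i - 1], hkey);  /* HashKey^i        */
#		for (int i = 1; i <= 8; i++)
#			pow_k[i] = pow[i].hi ^ pow[i].lo;      /* Karatsuba halves */
#	}
#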
350 ## if a = number of total plaintext bytes
352 ## num_initial_blocks = b mod 8
353 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
354 ## r10, r11, r12, rax are clobbered
355 ## arg1, arg2, arg3, r14 are used as pointers only, not modified
357 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
358 i = (8-\num_initial_blocks)
361 mov arg6, %r10 # r10 = AAD
362 mov arg7, %r12 # r12 = aadLen
367 vpxor reg_i, reg_i, reg_i
370 vpslldq $12, \T1, \T1
371 vpsrldq $4, reg_i, reg_i
372 vpxor \T1, reg_i, reg_i
380 je _get_AAD_loop2_done\@
384 vpsrldq $4, reg_i, reg_i
389 _get_AAD_loop2_done\@:
391 #byte-reflect the AAD data
392 vpshufb SHUF_MASK(%rip), reg_i, reg_i
394 # initialize the data pointer offset as zero
397 # start AES for num_initial_blocks blocks
398 mov arg5, %rax # rax = *Y0
399 vmovdqu (%rax), \CTR # CTR = Y0
400 vpshufb SHUF_MASK(%rip), \CTR, \CTR
403 i = (9-\num_initial_blocks)
405 .rep \num_initial_blocks
406 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
408 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
413 vmovdqa (arg1), \T_key
414 i = (9-\num_initial_blocks)
416 .rep \num_initial_blocks
417 vpxor \T_key, reg_i, reg_i
425 vmovdqa 16*j(arg1), \T_key
426 i = (9-\num_initial_blocks)
428 .rep \num_initial_blocks
429 vaesenc \T_key, reg_i, reg_i
439 vmovdqa 16*10(arg1), \T_key
440 i = (9-\num_initial_blocks)
442 .rep \num_initial_blocks
443 vaesenclast \T_key, reg_i, reg_i
448 i = (9-\num_initial_blocks)
450 .rep \num_initial_blocks
451 vmovdqu (arg3, %r11), \T1
452 vpxor \T1, reg_i, reg_i
453 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
458 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
464 i = (8-\num_initial_blocks)
465 j = (9-\num_initial_blocks)
467 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
469 .rep \num_initial_blocks
470 vpxor reg_i, reg_j, reg_j
471 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
476 # XMM8 has the combined result here
478 vmovdqa \XMM8, TMP1(%rsp)
482 jl _initial_blocks_done\@ # no need for precomputed constants
484 ###############################################################################
485 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
486 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
488 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
490 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
492 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
494 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
496 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
498 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
500 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
502 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
504 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
506 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
508 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
510 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
512 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
514 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
516 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
518 vmovdqa (arg1), \T_key
519 vpxor \T_key, \XMM1, \XMM1
520 vpxor \T_key, \XMM2, \XMM2
521 vpxor \T_key, \XMM3, \XMM3
522 vpxor \T_key, \XMM4, \XMM4
523 vpxor \T_key, \XMM5, \XMM5
524 vpxor \T_key, \XMM6, \XMM6
525 vpxor \T_key, \XMM7, \XMM7
526 vpxor \T_key, \XMM8, \XMM8
531 vmovdqa 16*i(arg1), \T_key
532 vaesenc \T_key, \XMM1, \XMM1
533 vaesenc \T_key, \XMM2, \XMM2
534 vaesenc \T_key, \XMM3, \XMM3
535 vaesenc \T_key, \XMM4, \XMM4
536 vaesenc \T_key, \XMM5, \XMM5
537 vaesenc \T_key, \XMM6, \XMM6
538 vaesenc \T_key, \XMM7, \XMM7
539 vaesenc \T_key, \XMM8, \XMM8
545 vmovdqa 16*i(arg1), \T_key
546 vaesenclast \T_key, \XMM1, \XMM1
547 vaesenclast \T_key, \XMM2, \XMM2
548 vaesenclast \T_key, \XMM3, \XMM3
549 vaesenclast \T_key, \XMM4, \XMM4
550 vaesenclast \T_key, \XMM5, \XMM5
551 vaesenclast \T_key, \XMM6, \XMM6
552 vaesenclast \T_key, \XMM7, \XMM7
553 vaesenclast \T_key, \XMM8, \XMM8
555 vmovdqu (arg3, %r11), \T1
556 vpxor \T1, \XMM1, \XMM1
557 vmovdqu \XMM1, (arg2 , %r11)
562 vmovdqu 16*1(arg3, %r11), \T1
563 vpxor \T1, \XMM2, \XMM2
564 vmovdqu \XMM2, 16*1(arg2 , %r11)
569 vmovdqu 16*2(arg3, %r11), \T1
570 vpxor \T1, \XMM3, \XMM3
571 vmovdqu \XMM3, 16*2(arg2 , %r11)
576 vmovdqu 16*3(arg3, %r11), \T1
577 vpxor \T1, \XMM4, \XMM4
578 vmovdqu \XMM4, 16*3(arg2 , %r11)
583 vmovdqu 16*4(arg3, %r11), \T1
584 vpxor \T1, \XMM5, \XMM5
585 vmovdqu \XMM5, 16*4(arg2 , %r11)
590 vmovdqu 16*5(arg3, %r11), \T1
591 vpxor \T1, \XMM6, \XMM6
592 vmovdqu \XMM6, 16*5(arg2 , %r11)
597 vmovdqu 16*6(arg3, %r11), \T1
598 vpxor \T1, \XMM7, \XMM7
599 vmovdqu \XMM7, 16*6(arg2 , %r11)
604 vmovdqu 16*7(arg3, %r11), \T1
605 vpxor \T1, \XMM8, \XMM8
606 vmovdqu \XMM8, 16*7(arg2 , %r11)
613 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
614 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
615 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
616 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
617 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
618 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
619 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
620 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
621 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
623 ###############################################################################
625 _initial_blocks_done\@:
629 # encrypt 8 blocks at a time
630 # ghash the 8 previously encrypted ciphertext blocks
631 # arg1, arg2, arg3 are used as pointers only, not modified
632 # r11 is the data offset value
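#
# Counter handling note: GCM increments only the low 32 bits of the big-endian
# counter block ("inc32").  The vpaddd ONE(%rip) additions below do this on
# the byte-reflected counter; the ONEf form is the analogous update when the
# blocks are kept in AES byte order.  A plain-C reference of inc32 (sketch):
#
#	#include <stdint.h>
#
#	static void inc32(uint8_t ctr[16])      /* big-endian counter block */
#	{
#		for (int i = 15; i >= 12; i--)
#			if (++ctr[i])           /* stop once a byte does not wrap */
#				break;
#	}
#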
633 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
636 vmovdqa \XMM2, TMP2(%rsp)
637 vmovdqa \XMM3, TMP3(%rsp)
638 vmovdqa \XMM4, TMP4(%rsp)
639 vmovdqa \XMM5, TMP5(%rsp)
640 vmovdqa \XMM6, TMP6(%rsp)
641 vmovdqa \XMM7, TMP7(%rsp)
642 vmovdqa \XMM8, TMP8(%rsp)
644 .if \loop_idx == in_order
645 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
646 vpaddd ONE(%rip), \XMM1, \XMM2
647 vpaddd ONE(%rip), \XMM2, \XMM3
648 vpaddd ONE(%rip), \XMM3, \XMM4
649 vpaddd ONE(%rip), \XMM4, \XMM5
650 vpaddd ONE(%rip), \XMM5, \XMM6
651 vpaddd ONE(%rip), \XMM6, \XMM7
652 vpaddd ONE(%rip), \XMM7, \XMM8
655 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
656 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
657 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
658 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
659 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
660 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
661 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
662 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
664 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
665 vpaddd ONEf(%rip), \XMM1, \XMM2
666 vpaddd ONEf(%rip), \XMM2, \XMM3
667 vpaddd ONEf(%rip), \XMM3, \XMM4
668 vpaddd ONEf(%rip), \XMM4, \XMM5
669 vpaddd ONEf(%rip), \XMM5, \XMM6
670 vpaddd ONEf(%rip), \XMM6, \XMM7
671 vpaddd ONEf(%rip), \XMM7, \XMM8
676 #######################################################################
679 vpxor \T1, \XMM1, \XMM1
680 vpxor \T1, \XMM2, \XMM2
681 vpxor \T1, \XMM3, \XMM3
682 vpxor \T1, \XMM4, \XMM4
683 vpxor \T1, \XMM5, \XMM5
684 vpxor \T1, \XMM6, \XMM6
685 vpxor \T1, \XMM7, \XMM7
686 vpxor \T1, \XMM8, \XMM8
688 #######################################################################
694 vmovdqu 16*1(arg1), \T1
695 vaesenc \T1, \XMM1, \XMM1
696 vaesenc \T1, \XMM2, \XMM2
697 vaesenc \T1, \XMM3, \XMM3
698 vaesenc \T1, \XMM4, \XMM4
699 vaesenc \T1, \XMM5, \XMM5
700 vaesenc \T1, \XMM6, \XMM6
701 vaesenc \T1, \XMM7, \XMM7
702 vaesenc \T1, \XMM8, \XMM8
704 vmovdqu 16*2(arg1), \T1
705 vaesenc \T1, \XMM1, \XMM1
706 vaesenc \T1, \XMM2, \XMM2
707 vaesenc \T1, \XMM3, \XMM3
708 vaesenc \T1, \XMM4, \XMM4
709 vaesenc \T1, \XMM5, \XMM5
710 vaesenc \T1, \XMM6, \XMM6
711 vaesenc \T1, \XMM7, \XMM7
712 vaesenc \T1, \XMM8, \XMM8
715 #######################################################################
717 vmovdqa HashKey_8(arg1), \T5
718 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
719 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
721 vpshufd $0b01001110, \T2, \T6
724 vmovdqa HashKey_8_k(arg1), \T5
725 vpclmulqdq $0x00, \T5, \T6, \T6
727 vmovdqu 16*3(arg1), \T1
728 vaesenc \T1, \XMM1, \XMM1
729 vaesenc \T1, \XMM2, \XMM2
730 vaesenc \T1, \XMM3, \XMM3
731 vaesenc \T1, \XMM4, \XMM4
732 vaesenc \T1, \XMM5, \XMM5
733 vaesenc \T1, \XMM6, \XMM6
734 vaesenc \T1, \XMM7, \XMM7
735 vaesenc \T1, \XMM8, \XMM8
737 vmovdqa TMP2(%rsp), \T1
738 vmovdqa HashKey_7(arg1), \T5
739 vpclmulqdq $0x11, \T5, \T1, \T3
741 vpclmulqdq $0x00, \T5, \T1, \T3
744 vpshufd $0b01001110, \T1, \T3
746 vmovdqa HashKey_7_k(arg1), \T5
747 vpclmulqdq $0x10, \T5, \T3, \T3
750 vmovdqu 16*4(arg1), \T1
751 vaesenc \T1, \XMM1, \XMM1
752 vaesenc \T1, \XMM2, \XMM2
753 vaesenc \T1, \XMM3, \XMM3
754 vaesenc \T1, \XMM4, \XMM4
755 vaesenc \T1, \XMM5, \XMM5
756 vaesenc \T1, \XMM6, \XMM6
757 vaesenc \T1, \XMM7, \XMM7
758 vaesenc \T1, \XMM8, \XMM8
760 #######################################################################
762 vmovdqa TMP3(%rsp), \T1
763 vmovdqa HashKey_6(arg1), \T5
764 vpclmulqdq $0x11, \T5, \T1, \T3
766 vpclmulqdq $0x00, \T5, \T1, \T3
769 vpshufd $0b01001110, \T1, \T3
771 vmovdqa HashKey_6_k(arg1), \T5
772 vpclmulqdq $0x10, \T5, \T3, \T3
775 vmovdqu 16*5(arg1), \T1
776 vaesenc \T1, \XMM1, \XMM1
777 vaesenc \T1, \XMM2, \XMM2
778 vaesenc \T1, \XMM3, \XMM3
779 vaesenc \T1, \XMM4, \XMM4
780 vaesenc \T1, \XMM5, \XMM5
781 vaesenc \T1, \XMM6, \XMM6
782 vaesenc \T1, \XMM7, \XMM7
783 vaesenc \T1, \XMM8, \XMM8
785 vmovdqa TMP4(%rsp), \T1
786 vmovdqa HashKey_5(arg1), \T5
787 vpclmulqdq $0x11, \T5, \T1, \T3
789 vpclmulqdq $0x00, \T5, \T1, \T3
792 vpshufd $0b01001110, \T1, \T3
794 vmovdqa HashKey_5_k(arg1), \T5
795 vpclmulqdq $0x10, \T5, \T3, \T3
798 vmovdqu 16*6(arg1), \T1
799 vaesenc \T1, \XMM1, \XMM1
800 vaesenc \T1, \XMM2, \XMM2
801 vaesenc \T1, \XMM3, \XMM3
802 vaesenc \T1, \XMM4, \XMM4
803 vaesenc \T1, \XMM5, \XMM5
804 vaesenc \T1, \XMM6, \XMM6
805 vaesenc \T1, \XMM7, \XMM7
806 vaesenc \T1, \XMM8, \XMM8
809 vmovdqa TMP5(%rsp), \T1
810 vmovdqa HashKey_4(arg1), \T5
811 vpclmulqdq $0x11, \T5, \T1, \T3
813 vpclmulqdq $0x00, \T5, \T1, \T3
816 vpshufd $0b01001110, \T1, \T3
818 vmovdqa HashKey_4_k(arg1), \T5
819 vpclmulqdq $0x10, \T5, \T3, \T3
822 vmovdqu 16*7(arg1), \T1
823 vaesenc \T1, \XMM1, \XMM1
824 vaesenc \T1, \XMM2, \XMM2
825 vaesenc \T1, \XMM3, \XMM3
826 vaesenc \T1, \XMM4, \XMM4
827 vaesenc \T1, \XMM5, \XMM5
828 vaesenc \T1, \XMM6, \XMM6
829 vaesenc \T1, \XMM7, \XMM7
830 vaesenc \T1, \XMM8, \XMM8
832 vmovdqa TMP6(%rsp), \T1
833 vmovdqa HashKey_3(arg1), \T5
834 vpclmulqdq $0x11, \T5, \T1, \T3
836 vpclmulqdq $0x00, \T5, \T1, \T3
839 vpshufd $0b01001110, \T1, \T3
841 vmovdqa HashKey_3_k(arg1), \T5
842 vpclmulqdq $0x10, \T5, \T3, \T3
846 vmovdqu 16*8(arg1), \T1
847 vaesenc \T1, \XMM1, \XMM1
848 vaesenc \T1, \XMM2, \XMM2
849 vaesenc \T1, \XMM3, \XMM3
850 vaesenc \T1, \XMM4, \XMM4
851 vaesenc \T1, \XMM5, \XMM5
852 vaesenc \T1, \XMM6, \XMM6
853 vaesenc \T1, \XMM7, \XMM7
854 vaesenc \T1, \XMM8, \XMM8
856 vmovdqa TMP7(%rsp), \T1
857 vmovdqa HashKey_2(arg1), \T5
858 vpclmulqdq $0x11, \T5, \T1, \T3
860 vpclmulqdq $0x00, \T5, \T1, \T3
863 vpshufd $0b01001110, \T1, \T3
865 vmovdqa HashKey_2_k(arg1), \T5
866 vpclmulqdq $0x10, \T5, \T3, \T3
869 #######################################################################
871 vmovdqu 16*9(arg1), \T5
872 vaesenc \T5, \XMM1, \XMM1
873 vaesenc \T5, \XMM2, \XMM2
874 vaesenc \T5, \XMM3, \XMM3
875 vaesenc \T5, \XMM4, \XMM4
876 vaesenc \T5, \XMM5, \XMM5
877 vaesenc \T5, \XMM6, \XMM6
878 vaesenc \T5, \XMM7, \XMM7
879 vaesenc \T5, \XMM8, \XMM8
881 vmovdqa TMP8(%rsp), \T1
882 vmovdqa HashKey(arg1), \T5
883 vpclmulqdq $0x11, \T5, \T1, \T3
885 vpclmulqdq $0x00, \T5, \T1, \T3
888 vpshufd $0b01001110, \T1, \T3
890 vmovdqa HashKey_k(arg1), \T5
891 vpclmulqdq $0x10, \T5, \T3, \T3
897 vmovdqu 16*10(arg1), \T5
903 vpxor 16*i(arg3, %r11), \T5, \T2
905 vaesenclast \T2, reg_j, reg_j
907 vaesenclast \T2, reg_j, \T3
908 vmovdqu 16*i(arg3, %r11), reg_j
909 vmovdqu \T3, 16*i(arg2, %r11)
915 #######################################################################
918 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
919 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
921 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
925 #######################################################################
926 #first phase of the reduction
927 #######################################################################
928 vpslld $31, \T7, \T2 # packed right shifting << 31
929 vpslld $30, \T7, \T3 # packed right shifting << 30
930 vpslld $25, \T7, \T4 # packed right shifting << 25
932 vpxor \T3, \T2, \T2 # xor the shifted versions
935 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
937 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
938 vpxor \T2, \T7, \T7 # first phase of the reduction complete
939 #######################################################################
941 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
942 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
943 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
944 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
945 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
946 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
947 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
948 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
951 #######################################################################
952 #second phase of the reduction
953 vpsrld $1, \T7, \T2 # packed left shifting >> 1
954 vpsrld $2, \T7, \T3 # packed left shifting >> 2
955 vpsrld $7, \T7, \T4 # packed left shifting >> 7
956 vpxor \T3, \T2, \T2 # xor the shifted versions
961 vpxor \T7, \T6, \T6 # the result is in T6
962 #######################################################################
964 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
965 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
966 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
967 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
968 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
969 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
970 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
971 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
974 vpxor \T6, \XMM1, \XMM1
981 # GHASH the last 8 ciphertext blocks.
982 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
987 vpshufd $0b01001110, \XMM1, \T2
988 vpxor \XMM1, \T2, \T2
989 vmovdqa HashKey_8(arg1), \T5
990 vpclmulqdq $0x11, \T5, \XMM1, \T6
991 vpclmulqdq $0x00, \T5, \XMM1, \T7
993 vmovdqa HashKey_8_k(arg1), \T3
994 vpclmulqdq $0x00, \T3, \T2, \XMM1
996 ######################
998 vpshufd $0b01001110, \XMM2, \T2
999 vpxor \XMM2, \T2, \T2
1000 vmovdqa HashKey_7(arg1), \T5
1001 vpclmulqdq $0x11, \T5, \XMM2, \T4
1004 vpclmulqdq $0x00, \T5, \XMM2, \T4
1007 vmovdqa HashKey_7_k(arg1), \T3
1008 vpclmulqdq $0x00, \T3, \T2, \T2
1009 vpxor \T2, \XMM1, \XMM1
1011 ######################
1013 vpshufd $0b01001110, \XMM3, \T2
1014 vpxor \XMM3, \T2, \T2
1015 vmovdqa HashKey_6(arg1), \T5
1016 vpclmulqdq $0x11, \T5, \XMM3, \T4
1019 vpclmulqdq $0x00, \T5, \XMM3, \T4
1022 vmovdqa HashKey_6_k(arg1), \T3
1023 vpclmulqdq $0x00, \T3, \T2, \T2
1024 vpxor \T2, \XMM1, \XMM1
1026 ######################
1028 vpshufd $0b01001110, \XMM4, \T2
1029 vpxor \XMM4, \T2, \T2
1030 vmovdqa HashKey_5(arg1), \T5
1031 vpclmulqdq $0x11, \T5, \XMM4, \T4
1034 vpclmulqdq $0x00, \T5, \XMM4, \T4
1037 vmovdqa HashKey_5_k(arg1), \T3
1038 vpclmulqdq $0x00, \T3, \T2, \T2
1039 vpxor \T2, \XMM1, \XMM1
1041 ######################
1043 vpshufd $0b01001110, \XMM5, \T2
1044 vpxor \XMM5, \T2, \T2
1045 vmovdqa HashKey_4(arg1), \T5
1046 vpclmulqdq $0x11, \T5, \XMM5, \T4
1049 vpclmulqdq $0x00, \T5, \XMM5, \T4
1052 vmovdqa HashKey_4_k(arg1), \T3
1053 vpclmulqdq $0x00, \T3, \T2, \T2
1054 vpxor \T2, \XMM1, \XMM1
1056 ######################
1058 vpshufd $0b01001110, \XMM6, \T2
1059 vpxor \XMM6, \T2, \T2
1060 vmovdqa HashKey_3(arg1), \T5
1061 vpclmulqdq $0x11, \T5, \XMM6, \T4
1064 vpclmulqdq $0x00, \T5, \XMM6, \T4
1067 vmovdqa HashKey_3_k(arg1), \T3
1068 vpclmulqdq $0x00, \T3, \T2, \T2
1069 vpxor \T2, \XMM1, \XMM1
1071 ######################
1073 vpshufd $0b01001110, \XMM7, \T2
1074 vpxor \XMM7, \T2, \T2
1075 vmovdqa HashKey_2(arg1), \T5
1076 vpclmulqdq $0x11, \T5, \XMM7, \T4
1079 vpclmulqdq $0x00, \T5, \XMM7, \T4
1082 vmovdqa HashKey_2_k(arg1), \T3
1083 vpclmulqdq $0x00, \T3, \T2, \T2
1084 vpxor \T2, \XMM1, \XMM1
1086 ######################
1088 vpshufd $0b01001110, \XMM8, \T2
1089 vpxor \XMM8, \T2, \T2
1090 vmovdqa HashKey(arg1), \T5
1091 vpclmulqdq $0x11, \T5, \XMM8, \T4
1094 vpclmulqdq $0x00, \T5, \XMM8, \T4
1097 vmovdqa HashKey_k(arg1), \T3
1098 vpclmulqdq $0x00, \T3, \T2, \T2
1100 vpxor \T2, \XMM1, \XMM1
1101 vpxor \T6, \XMM1, \XMM1
1102 vpxor \T7, \XMM1, \T2
1107 vpslldq $8, \T2, \T4
1108 vpsrldq $8, \T2, \T2
1111 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1112 # the accumulated carry-less multiplications
1114 #######################################################################
1115 #first phase of the reduction
1116 vpslld $31, \T7, \T2 # packed right shifting << 31
1117 vpslld $30, \T7, \T3 # packed right shifting << 30
1118 vpslld $25, \T7, \T4 # packed right shifting << 25
1120 vpxor \T3, \T2, \T2 # xor the shifted versions
1123 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1125 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1126 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1127 #######################################################################
1130 #second phase of the reduction
1131 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1132 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1133 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1134 vpxor \T3, \T2, \T2 # xor the shifted versions
1139 vpxor \T7, \T6, \T6 # the result is in T6
1144 # combined for GCM encrypt and decrypt functions
1145 # clobbering all xmm registers
1146 # clobbering r10, r11, r12, r13, r14, r15
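#
# Rough block schedule implemented below, sketched in C (names illustrative):
# 0-7 "initial" blocks bring the count to a multiple of eight, the bulk runs
# eight blocks per iteration, and any sub-16-byte tail is handled last.
#
#	struct gcm_schedule { unsigned long initial, eights, tail_bytes; };
#
#	static struct gcm_schedule gcm_plan(unsigned long len)
#	{
#		struct gcm_schedule s;
#		unsigned long blocks = len / 16;
#
#		s.initial    = blocks % 8;                /* _initial_num_blocks_is_0..7 */
#		s.eights     = (blocks - s.initial) / 8;  /* _encrypt_by_8 loop passes   */
#		s.tail_bytes = len % 16;                  /* _zero_cipher_left path      */
#		return s;
#	}
#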
1147 .macro GCM_ENC_DEC_AVX ENC_DEC
1149 #the number of pushes must equal STACK_OFFSET
1160 sub $VARIABLE_OFFSET, %rsp
1161 and $~63, %rsp # align rsp to 64 bytes
1164 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1166 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1167 and $-16, %r13 # r13 = r13 - (r13 mod 16)
1172 jz _initial_num_blocks_is_0\@
1175 je _initial_num_blocks_is_7\@
1177 je _initial_num_blocks_is_6\@
1179 je _initial_num_blocks_is_5\@
1181 je _initial_num_blocks_is_4\@
1183 je _initial_num_blocks_is_3\@
1185 je _initial_num_blocks_is_2\@
1187 jmp _initial_num_blocks_is_1\@
1189 _initial_num_blocks_is_7\@:
1190 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1192 jmp _initial_blocks_encrypted\@
1194 _initial_num_blocks_is_6\@:
1195 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1197 jmp _initial_blocks_encrypted\@
1199 _initial_num_blocks_is_5\@:
1200 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1202 jmp _initial_blocks_encrypted\@
1204 _initial_num_blocks_is_4\@:
1205 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1207 jmp _initial_blocks_encrypted\@
1209 _initial_num_blocks_is_3\@:
1210 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1212 jmp _initial_blocks_encrypted\@
1214 _initial_num_blocks_is_2\@:
1215 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1217 jmp _initial_blocks_encrypted\@
1219 _initial_num_blocks_is_1\@:
1220 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1222 jmp _initial_blocks_encrypted\@
1224 _initial_num_blocks_is_0\@:
1225 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1228 _initial_blocks_encrypted\@:
1230 je _zero_cipher_left\@
1233 je _eight_cipher_left\@
1240 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1243 _encrypt_by_8_new\@:
1250 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1253 jne _encrypt_by_8_new\@
1255 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1256 jmp _eight_cipher_left\@
1259 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1261 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1262 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1265 jne _encrypt_by_8_new\@
1267 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1272 _eight_cipher_left\@:
1273 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1276 _zero_cipher_left\@:
1278 jl _only_less_than_16\@
1281 and $15, %r13 # r13 = (arg4 mod 16)
1283 je _multiple_of_16_bytes\@
1285 # handle the last <16 Byte block separately
1288 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1289 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1290 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1294 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1296 lea SHIFT_MASK+16(%rip), %r12
1297 sub %r13, %r12 # adjust the shuffle mask pointer to be
1298 # able to shift 16-r13 bytes (r13 is the
1299 # number of bytes in plaintext mod 16)
1300 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1301 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
1302 jmp _final_ghash_mul\@
1304 _only_less_than_16\@:
1305 # check for 0 length
1307 and $15, %r13 # r13 = (arg4 mod 16)
1309 je _multiple_of_16_bytes\@
1311 # handle the last <16 Byte block separately
1314 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1315 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1316 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1319 lea SHIFT_MASK+16(%rip), %r12
1320 sub %r13, %r12 # adjust the shuffle mask pointer to be
1321 # able to shift 16-r13 bytes (r13 is the
1322 # number of bytes in plaintext mod 16)
1324 _get_last_16_byte_loop\@:
1325 movb (arg3, %r11), %al
1326 movb %al, TMP1 (%rsp , %r11)
1329 jne _get_last_16_byte_loop\@
1331 vmovdqu TMP1(%rsp), %xmm1
1337 vmovdqa %xmm1, %xmm2
1338 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1339 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1340 # mask out top 16-r13 bytes of xmm9
1341 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1342 vpand %xmm1, %xmm2, %xmm2
1343 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1344 vpxor %xmm2, %xmm14, %xmm14
1345 #GHASH computation for the last <16 Byte block
1346 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1350 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1351 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1352 # mask out top 16-r13 bytes of xmm9
1353 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1354 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1355 vpxor %xmm9, %xmm14, %xmm14
1356 #GHASH computation for the last <16 Byte block
1357 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1360 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1364 #############################
1368 jle _less_than_8_bytes_left\@
1370 mov %rax, (arg2 , %r11)
1372 vpsrldq $8, %xmm9, %xmm9
1376 _less_than_8_bytes_left\@:
1377 movb %al, (arg2 , %r11)
1381 jne _less_than_8_bytes_left\@
1382 #############################
1384 _multiple_of_16_bytes\@:
1385 mov arg7, %r12 # r12 = aadLen (number of bytes)
1386 shl $3, %r12 # convert into number of bits
1387 vmovd %r12d, %xmm15 # len(A) in xmm15
1389 shl $3, arg4 # len(C) in bits (*8)
1391 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1392 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
1394 vpxor %xmm15, %xmm14, %xmm14
1395 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1396 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1398 mov arg5, %rax # rax = *Y0
1399 vmovdqu (%rax), %xmm9 # xmm9 = Y0
1401 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1403 vpxor %xmm14, %xmm9, %xmm9
1408 mov arg8, %r10 # r10 = authTag
1409 mov arg9, %r11 # r11 = auth_tag_len
1420 jmp _return_T_done\@
1424 vpsrldq $8, %xmm9, %xmm9
1427 jmp _return_T_done\@
1430 vmovdqu %xmm9, (%r10)
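#
# The block GHASHed in the _multiple_of_16_bytes path above is GCM's "lengths"
# block: len(A) || len(C) as 64-bit big-endian bit counts (the shl $3 /
# vpslldq / vpxor sequence builds the equivalent value directly in the working
# byte order).  A plain-C sketch:
#
#	#include <stdint.h>
#
#	static void build_lengths_block(uint64_t aad_bytes, uint64_t text_bytes,
#					uint8_t out[16])
#	{
#		uint64_t abits = aad_bytes * 8, cbits = text_bytes * 8;
#
#		for (int i = 0; i < 8; i++) {
#			out[i]     = (uint8_t)(abits >> (56 - 8 * i)); /* len(A) */
#			out[8 + i] = (uint8_t)(cbits >> (56 - 8 * i)); /* len(C) */
#		}
#	}
#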
1442 #############################################################
1443 #void aesni_gcm_precomp_avx_gen2
1444 # (gcm_data *my_ctx_data,
1445 # u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1446 #############################################################
1447 ENTRY(aesni_gcm_precomp_avx_gen2)
1448 #the number of pushes must equal STACK_OFFSET
1458 sub $VARIABLE_OFFSET, %rsp
1459 and $~63, %rsp # align rsp to 64 bytes
1461 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1463 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1464 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1465 vmovdqa %xmm6, %xmm2
1466 vpsllq $1, %xmm6, %xmm6
1467 vpsrlq $63, %xmm2, %xmm2
1468 vmovdqa %xmm2, %xmm1
1469 vpslldq $8, %xmm2, %xmm2
1470 vpsrldq $8, %xmm1, %xmm1
1471 vpor %xmm2, %xmm6, %xmm6
1473 vpshufd $0b00100100, %xmm1, %xmm2
1474 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1475 vpand POLY(%rip), %xmm2, %xmm2
1476 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1477 #######################################################################
1478 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
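#
# Net effect of the sequence above, sketched in C: shift the (byte-reflected)
# hash subkey left by one bit across all 128 bits and, if a bit fell off the
# top, fold the POLY constant back in (h[0] = low qword, h[1] = high qword):
#
#	#include <stdint.h>
#
#	static void hashkey_shl1_mod_poly(uint64_t h[2])
#	{
#		uint64_t carry = h[1] >> 63;
#
#		h[1] = (h[1] << 1) | (h[0] >> 63);
#		h[0] <<= 1;
#		if (carry) {
#			h[1] ^= 0xC200000000000000ULL;  /* POLY, high qword */
#			h[0] ^= 0x0000000000000001ULL;  /* POLY, low qword  */
#		}
#	}
#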
1481 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1490 ENDPROC(aesni_gcm_precomp_avx_gen2)
1492 ###############################################################################
1493 #void aesni_gcm_enc_avx_gen2(
1494 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1495 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1496 # const u8 *in, /* Plaintext input */
1497 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1498 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1499 # (from Security Association) concatenated with 8 byte
1500 # Initialisation Vector (from IPSec ESP Payload)
1501 # concatenated with 0x00000001. 16-byte aligned pointer. */
1502 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1503 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1504 # u8 *auth_tag, /* Authenticated Tag output. */
1505 # u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1506 # Valid values are 16 (most likely), 12 or 8. */
1507 ###############################################################################
1508 ENTRY(aesni_gcm_enc_avx_gen2)
1511 ENDPROC(aesni_gcm_enc_avx_gen2)
1513 ###############################################################################
1514 #void aesni_gcm_dec_avx_gen2(
1515 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1516 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1517 # const u8 *in, /* Ciphertext input */
1518 # u64 plaintext_len, /* Length of data in Bytes for decryption. */
1519 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1520 # (from Security Association) concatenated with 8 byte
1521 # Initialisation Vector (from IPSec ESP Payload)
1522 # concatenated with 0x00000001. 16-byte aligned pointer. */
1523 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1524 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1525 # u8 *auth_tag, /* Authenticated Tag output. */
1526 # u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1527 # Valid values are 16 (most likely), 12 or 8. */
1528 ###############################################################################
1529 ENTRY(aesni_gcm_dec_avx_gen2)
1532 ENDPROC(aesni_gcm_dec_avx_gen2)
1533 #endif /* CONFIG_AS_AVX */
1535 #ifdef CONFIG_AS_AVX2
1536 ###############################################################################
1537 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1538 # Input: A and B (128-bits each, bit-reflected)
1539 # Output: C = A*B*x mod poly (i.e. >>1)
1540 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1541 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1542 ###############################################################################
1543 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1545 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1546 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1547 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1548 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1552 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1553 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1558 #######################################################################
1559 #first phase of the reduction
1560 vmovdqa POLY2(%rip), \T3
1562 vpclmulqdq $0x01, \GH, \T3, \T2
1563 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1565 vpxor \T2, \GH, \GH # first phase of the reduction complete
1566 #######################################################################
1567 #second phase of the reduction
1568 vpclmulqdq $0x00, \GH, \T3, \T2
1569 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1571 vpclmulqdq $0x10, \GH, \T3, \GH
1572 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1574 vpxor \T2, \GH, \GH # second phase of the reduction complete
1575 #######################################################################
1576 vpxor \T1, \GH, \GH # the result is in GH
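#
# Unlike GHASH_MUL_AVX, this variant uses four vpclmulqdq products (no
# Karatsuba) and a PCLMUL-based reduction via POLY2.  The four-product
# combination, sketched in C (reuses the clmul64() reference shown after
# GHASH_MUL_AVX; the reduction is omitted):
#
#	static void schoolbook_clmul128(const uint64_t a[2], const uint64_t b[2],
#					uint64_t r[4])
#	{
#		uint64_t hh_hi, hh_lo, ll_hi, ll_lo, hl_hi, hl_lo, lh_hi, lh_lo;
#
#		clmul64(a[1], b[1], &hh_hi, &hh_lo);  /* $0x11: a1*b1 */
#		clmul64(a[0], b[0], &ll_hi, &ll_lo);  /* $0x00: a0*b0 */
#		clmul64(a[1], b[0], &hl_hi, &hl_lo);  /* $0x01: a1*b0 */
#		clmul64(a[0], b[1], &lh_hi, &lh_lo);  /* $0x10: a0*b1 */
#
#		r[0] = ll_lo;
#		r[1] = ll_hi ^ hl_lo ^ lh_lo;         /* middle, low half  */
#		r[2] = hh_lo ^ hl_hi ^ lh_hi;         /* middle, high half */
#		r[3] = hh_hi;
#	}
#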
1581 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1583 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
1585 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1586 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1588 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1589 vmovdqa \T5, HashKey_3(arg1)
1591 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1592 vmovdqa \T5, HashKey_4(arg1)
1594 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1595 vmovdqa \T5, HashKey_5(arg1)
1597 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1598 vmovdqa \T5, HashKey_6(arg1)
1600 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1601 vmovdqa \T5, HashKey_7(arg1)
1603 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1604 vmovdqa \T5, HashKey_8(arg1)
1609 ## if a = number of total plaintext bytes
1611 ## num_initial_blocks = b mod 8
1612 ## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1613 ## r10, r11, r12, rax are clobbered
1614 ## arg1, arg2, arg3, r14 are used as pointers only, not modified
1616 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1617 i = (8-\num_initial_blocks)
1620 mov arg6, %r10 # r10 = AAD
1621 mov arg7, %r12 # r12 = aadLen
1626 vpxor reg_i, reg_i, reg_i
1629 vpslldq $12, \T1, \T1
1630 vpsrldq $4, reg_i, reg_i
1631 vpxor \T1, reg_i, reg_i
1639 je _get_AAD_loop2_done\@
1643 vpsrldq $4, reg_i, reg_i
1648 _get_AAD_loop2_done\@:
1650 #byte-reflect the AAD data
1651 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1653 # initialize the data pointer offset as zero
1656 # start AES for num_initial_blocks blocks
1657 mov arg5, %rax # rax = *Y0
1658 vmovdqu (%rax), \CTR # CTR = Y0
1659 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1662 i = (9-\num_initial_blocks)
1664 .rep \num_initial_blocks
1665 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1667 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1672 vmovdqa (arg1), \T_key
1673 i = (9-\num_initial_blocks)
1675 .rep \num_initial_blocks
1676 vpxor \T_key, reg_i, reg_i
1684 vmovdqa 16*j(arg1), \T_key
1685 i = (9-\num_initial_blocks)
1687 .rep \num_initial_blocks
1688 vaesenc \T_key, reg_i, reg_i
1698 vmovdqa 16*10(arg1), \T_key
1699 i = (9-\num_initial_blocks)
1701 .rep \num_initial_blocks
1702 vaesenclast \T_key, reg_i, reg_i
1707 i = (9-\num_initial_blocks)
1709 .rep \num_initial_blocks
1710 vmovdqu (arg3, %r11), \T1
1711 vpxor \T1, reg_i, reg_i
1712 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1713 # num_initial_blocks blocks
1718 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1724 i = (8-\num_initial_blocks)
1725 j = (9-\num_initial_blocks)
1727 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1729 .rep \num_initial_blocks
1730 vpxor reg_i, reg_j, reg_j
1731 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1736 # XMM8 has the combined result here
1738 vmovdqa \XMM8, TMP1(%rsp)
1742 jl _initial_blocks_done\@ # no need for precomputed constants
1744 ###############################################################################
1745 # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
1746 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1748 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1750 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1752 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1754 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1756 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1758 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1760 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1762 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1764 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1766 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1768 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1770 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1772 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1774 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1776 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1778 vmovdqa (arg1), \T_key
1779 vpxor \T_key, \XMM1, \XMM1
1780 vpxor \T_key, \XMM2, \XMM2
1781 vpxor \T_key, \XMM3, \XMM3
1782 vpxor \T_key, \XMM4, \XMM4
1783 vpxor \T_key, \XMM5, \XMM5
1784 vpxor \T_key, \XMM6, \XMM6
1785 vpxor \T_key, \XMM7, \XMM7
1786 vpxor \T_key, \XMM8, \XMM8
1790 .rep 9 # do 9 rounds
1791 vmovdqa 16*i(arg1), \T_key
1792 vaesenc \T_key, \XMM1, \XMM1
1793 vaesenc \T_key, \XMM2, \XMM2
1794 vaesenc \T_key, \XMM3, \XMM3
1795 vaesenc \T_key, \XMM4, \XMM4
1796 vaesenc \T_key, \XMM5, \XMM5
1797 vaesenc \T_key, \XMM6, \XMM6
1798 vaesenc \T_key, \XMM7, \XMM7
1799 vaesenc \T_key, \XMM8, \XMM8
1805 vmovdqa 16*i(arg1), \T_key
1806 vaesenclast \T_key, \XMM1, \XMM1
1807 vaesenclast \T_key, \XMM2, \XMM2
1808 vaesenclast \T_key, \XMM3, \XMM3
1809 vaesenclast \T_key, \XMM4, \XMM4
1810 vaesenclast \T_key, \XMM5, \XMM5
1811 vaesenclast \T_key, \XMM6, \XMM6
1812 vaesenclast \T_key, \XMM7, \XMM7
1813 vaesenclast \T_key, \XMM8, \XMM8
1815 vmovdqu (arg3, %r11), \T1
1816 vpxor \T1, \XMM1, \XMM1
1817 vmovdqu \XMM1, (arg2 , %r11)
1822 vmovdqu 16*1(arg3, %r11), \T1
1823 vpxor \T1, \XMM2, \XMM2
1824 vmovdqu \XMM2, 16*1(arg2 , %r11)
1829 vmovdqu 16*2(arg3, %r11), \T1
1830 vpxor \T1, \XMM3, \XMM3
1831 vmovdqu \XMM3, 16*2(arg2 , %r11)
1836 vmovdqu 16*3(arg3, %r11), \T1
1837 vpxor \T1, \XMM4, \XMM4
1838 vmovdqu \XMM4, 16*3(arg2 , %r11)
1843 vmovdqu 16*4(arg3, %r11), \T1
1844 vpxor \T1, \XMM5, \XMM5
1845 vmovdqu \XMM5, 16*4(arg2 , %r11)
1850 vmovdqu 16*5(arg3, %r11), \T1
1851 vpxor \T1, \XMM6, \XMM6
1852 vmovdqu \XMM6, 16*5(arg2 , %r11)
1857 vmovdqu 16*6(arg3, %r11), \T1
1858 vpxor \T1, \XMM7, \XMM7
1859 vmovdqu \XMM7, 16*6(arg2 , %r11)
1864 vmovdqu 16*7(arg3, %r11), \T1
1865 vpxor \T1, \XMM8, \XMM8
1866 vmovdqu \XMM8, 16*7(arg2 , %r11)
1873 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1874 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1875 # the corresponding ciphertext
1876 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1877 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1878 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1879 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1880 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1881 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1882 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1884 ###############################################################################
1886 _initial_blocks_done\@:
1893 # encrypt 8 blocks at a time
1894 # ghash the 8 previously encrypted ciphertext blocks
1895 # arg1, arg2, arg3 are used as pointers only, not modified
1896 # r11 is the data offset value
1897 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1900 vmovdqa \XMM2, TMP2(%rsp)
1901 vmovdqa \XMM3, TMP3(%rsp)
1902 vmovdqa \XMM4, TMP4(%rsp)
1903 vmovdqa \XMM5, TMP5(%rsp)
1904 vmovdqa \XMM6, TMP6(%rsp)
1905 vmovdqa \XMM7, TMP7(%rsp)
1906 vmovdqa \XMM8, TMP8(%rsp)
1908 .if \loop_idx == in_order
1909 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1910 vpaddd ONE(%rip), \XMM1, \XMM2
1911 vpaddd ONE(%rip), \XMM2, \XMM3
1912 vpaddd ONE(%rip), \XMM3, \XMM4
1913 vpaddd ONE(%rip), \XMM4, \XMM5
1914 vpaddd ONE(%rip), \XMM5, \XMM6
1915 vpaddd ONE(%rip), \XMM6, \XMM7
1916 vpaddd ONE(%rip), \XMM7, \XMM8
1919 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1920 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1921 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1922 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1923 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1924 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1925 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1926 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1928 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1929 vpaddd ONEf(%rip), \XMM1, \XMM2
1930 vpaddd ONEf(%rip), \XMM2, \XMM3
1931 vpaddd ONEf(%rip), \XMM3, \XMM4
1932 vpaddd ONEf(%rip), \XMM4, \XMM5
1933 vpaddd ONEf(%rip), \XMM5, \XMM6
1934 vpaddd ONEf(%rip), \XMM6, \XMM7
1935 vpaddd ONEf(%rip), \XMM7, \XMM8
1940 #######################################################################
1943 vpxor \T1, \XMM1, \XMM1
1944 vpxor \T1, \XMM2, \XMM2
1945 vpxor \T1, \XMM3, \XMM3
1946 vpxor \T1, \XMM4, \XMM4
1947 vpxor \T1, \XMM5, \XMM5
1948 vpxor \T1, \XMM6, \XMM6
1949 vpxor \T1, \XMM7, \XMM7
1950 vpxor \T1, \XMM8, \XMM8
1952 #######################################################################
1958 vmovdqu 16*1(arg1), \T1
1959 vaesenc \T1, \XMM1, \XMM1
1960 vaesenc \T1, \XMM2, \XMM2
1961 vaesenc \T1, \XMM3, \XMM3
1962 vaesenc \T1, \XMM4, \XMM4
1963 vaesenc \T1, \XMM5, \XMM5
1964 vaesenc \T1, \XMM6, \XMM6
1965 vaesenc \T1, \XMM7, \XMM7
1966 vaesenc \T1, \XMM8, \XMM8
1968 vmovdqu 16*2(arg1), \T1
1969 vaesenc \T1, \XMM1, \XMM1
1970 vaesenc \T1, \XMM2, \XMM2
1971 vaesenc \T1, \XMM3, \XMM3
1972 vaesenc \T1, \XMM4, \XMM4
1973 vaesenc \T1, \XMM5, \XMM5
1974 vaesenc \T1, \XMM6, \XMM6
1975 vaesenc \T1, \XMM7, \XMM7
1976 vaesenc \T1, \XMM8, \XMM8
1979 #######################################################################
1981 vmovdqa HashKey_8(arg1), \T5
1982 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1983 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1984 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
1985 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
1988 vmovdqu 16*3(arg1), \T1
1989 vaesenc \T1, \XMM1, \XMM1
1990 vaesenc \T1, \XMM2, \XMM2
1991 vaesenc \T1, \XMM3, \XMM3
1992 vaesenc \T1, \XMM4, \XMM4
1993 vaesenc \T1, \XMM5, \XMM5
1994 vaesenc \T1, \XMM6, \XMM6
1995 vaesenc \T1, \XMM7, \XMM7
1996 vaesenc \T1, \XMM8, \XMM8
1998 vmovdqa TMP2(%rsp), \T1
1999 vmovdqa HashKey_7(arg1), \T5
2000 vpclmulqdq $0x11, \T5, \T1, \T3
2003 vpclmulqdq $0x00, \T5, \T1, \T3
2006 vpclmulqdq $0x01, \T5, \T1, \T3
2009 vpclmulqdq $0x10, \T5, \T1, \T3
2012 vmovdqu 16*4(arg1), \T1
2013 vaesenc \T1, \XMM1, \XMM1
2014 vaesenc \T1, \XMM2, \XMM2
2015 vaesenc \T1, \XMM3, \XMM3
2016 vaesenc \T1, \XMM4, \XMM4
2017 vaesenc \T1, \XMM5, \XMM5
2018 vaesenc \T1, \XMM6, \XMM6
2019 vaesenc \T1, \XMM7, \XMM7
2020 vaesenc \T1, \XMM8, \XMM8
2022 #######################################################################
2024 vmovdqa TMP3(%rsp), \T1
2025 vmovdqa HashKey_6(arg1), \T5
2026 vpclmulqdq $0x11, \T5, \T1, \T3
2029 vpclmulqdq $0x00, \T5, \T1, \T3
2032 vpclmulqdq $0x01, \T5, \T1, \T3
2035 vpclmulqdq $0x10, \T5, \T1, \T3
2038 vmovdqu 16*5(arg1), \T1
2039 vaesenc \T1, \XMM1, \XMM1
2040 vaesenc \T1, \XMM2, \XMM2
2041 vaesenc \T1, \XMM3, \XMM3
2042 vaesenc \T1, \XMM4, \XMM4
2043 vaesenc \T1, \XMM5, \XMM5
2044 vaesenc \T1, \XMM6, \XMM6
2045 vaesenc \T1, \XMM7, \XMM7
2046 vaesenc \T1, \XMM8, \XMM8
2048 vmovdqa TMP4(%rsp), \T1
2049 vmovdqa HashKey_5(arg1), \T5
2050 vpclmulqdq $0x11, \T5, \T1, \T3
2053 vpclmulqdq $0x00, \T5, \T1, \T3
2056 vpclmulqdq $0x01, \T5, \T1, \T3
2059 vpclmulqdq $0x10, \T5, \T1, \T3
2062 vmovdqu 16*6(arg1), \T1
2063 vaesenc \T1, \XMM1, \XMM1
2064 vaesenc \T1, \XMM2, \XMM2
2065 vaesenc \T1, \XMM3, \XMM3
2066 vaesenc \T1, \XMM4, \XMM4
2067 vaesenc \T1, \XMM5, \XMM5
2068 vaesenc \T1, \XMM6, \XMM6
2069 vaesenc \T1, \XMM7, \XMM7
2070 vaesenc \T1, \XMM8, \XMM8
2073 vmovdqa TMP5(%rsp), \T1
2074 vmovdqa HashKey_4(arg1), \T5
2075 vpclmulqdq $0x11, \T5, \T1, \T3
2078 vpclmulqdq $0x00, \T5, \T1, \T3
2081 vpclmulqdq $0x01, \T5, \T1, \T3
2084 vpclmulqdq $0x10, \T5, \T1, \T3
2087 vmovdqu 16*7(arg1), \T1
2088 vaesenc \T1, \XMM1, \XMM1
2089 vaesenc \T1, \XMM2, \XMM2
2090 vaesenc \T1, \XMM3, \XMM3
2091 vaesenc \T1, \XMM4, \XMM4
2092 vaesenc \T1, \XMM5, \XMM5
2093 vaesenc \T1, \XMM6, \XMM6
2094 vaesenc \T1, \XMM7, \XMM7
2095 vaesenc \T1, \XMM8, \XMM8
2097 vmovdqa TMP6(%rsp), \T1
2098 vmovdqa HashKey_3(arg1), \T5
2099 vpclmulqdq $0x11, \T5, \T1, \T3
2102 vpclmulqdq $0x00, \T5, \T1, \T3
2105 vpclmulqdq $0x01, \T5, \T1, \T3
2108 vpclmulqdq $0x10, \T5, \T1, \T3
2111 vmovdqu 16*8(arg1), \T1
2112 vaesenc \T1, \XMM1, \XMM1
2113 vaesenc \T1, \XMM2, \XMM2
2114 vaesenc \T1, \XMM3, \XMM3
2115 vaesenc \T1, \XMM4, \XMM4
2116 vaesenc \T1, \XMM5, \XMM5
2117 vaesenc \T1, \XMM6, \XMM6
2118 vaesenc \T1, \XMM7, \XMM7
2119 vaesenc \T1, \XMM8, \XMM8
2121 vmovdqa TMP7(%rsp), \T1
2122 vmovdqa HashKey_2(arg1), \T5
2123 vpclmulqdq $0x11, \T5, \T1, \T3
2126 vpclmulqdq $0x00, \T5, \T1, \T3
2129 vpclmulqdq $0x01, \T5, \T1, \T3
2132 vpclmulqdq $0x10, \T5, \T1, \T3
2136 #######################################################################
2138 vmovdqu 16*9(arg1), \T5
2139 vaesenc \T5, \XMM1, \XMM1
2140 vaesenc \T5, \XMM2, \XMM2
2141 vaesenc \T5, \XMM3, \XMM3
2142 vaesenc \T5, \XMM4, \XMM4
2143 vaesenc \T5, \XMM5, \XMM5
2144 vaesenc \T5, \XMM6, \XMM6
2145 vaesenc \T5, \XMM7, \XMM7
2146 vaesenc \T5, \XMM8, \XMM8
2148 vmovdqa TMP8(%rsp), \T1
2149 vmovdqa HashKey(arg1), \T5
2151 vpclmulqdq $0x00, \T5, \T1, \T3
2154 vpclmulqdq $0x01, \T5, \T1, \T3
2157 vpclmulqdq $0x10, \T5, \T1, \T3
2160 vpclmulqdq $0x11, \T5, \T1, \T3
2164 vmovdqu 16*10(arg1), \T5
2170 vpxor 16*i(arg3, %r11), \T5, \T2
2172 vaesenclast \T2, reg_j, reg_j
2174 vaesenclast \T2, reg_j, \T3
2175 vmovdqu 16*i(arg3, %r11), reg_j
2176 vmovdqu \T3, 16*i(arg2, %r11)
2182 #######################################################################
2185 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
2186 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
        vpxor \T6, \T1, \T1 # accumulate the results in T1:T7

        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2 # shift-L T2 2 DWs

        vpxor \T2, \T7, \T7 # first phase of the reduction complete
        #######################################################################

        vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer

        #######################################################################
        #second phase of the reduction
        vpclmulqdq $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor \T2, \T4, \T4 # second phase of the reduction complete
        #######################################################################
        vpxor \T4, \T1, \T1 # the result is in T1

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap

        vpxor \T1, \XMM1, \XMM1
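
        # The pclmulqdq sequence above, together with the two reduction phases,
        # performs one batch of GHASH multiplications on byte-reflected data
        # (hence the vpshufb swaps).  For reference only, a minimal C sketch of
        # the same GF(2^128) multiply done bit-by-bit in the byte order of the
        # GCM specification (NIST SP 800-38D); gf128_mul is an illustrative name
        # and the sketch is not part of this file's build:
        #
        #       #include <stdint.h>
        #       #include <string.h>
        #
        #       /* Z = X * H with the GCM polynomial x^128 + x^7 + x^2 + x + 1 */
        #       static void gf128_mul(const uint8_t x[16], const uint8_t h[16],
        #                             uint8_t z[16])
        #       {
        #               uint8_t v[16], acc[16] = { 0 };
        #               int i, j, lsb;
        #
        #               memcpy(v, h, 16);
        #               for (i = 0; i < 128; i++) {
        #                       if ((x[i / 8] >> (7 - (i % 8))) & 1)
        #                               for (j = 0; j < 16; j++)
        #                                       acc[j] ^= v[j];
        #                       /* V >>= 1; reduce with R = 0xE1 || 0^120 on carry-out */
        #                       lsb = v[15] & 1;
        #                       for (j = 15; j > 0; j--)
        #                               v[j] = (v[j] >> 1) | (v[j - 1] << 7);
        #                       v[0] >>= 1;
        #                       if (lsb)
        #                               v[0] ^= 0xE1;
        #               }
        #               memcpy(z, acc, 16);
        #       }
        #
        # Running this slow reference against the assembly on the same inputs is
        # a convenient way to cross-check the carry-less-multiply paths.
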
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        vmovdqa HashKey_8(arg1), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM1, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM1, \T6
        vpclmulqdq $0x00, \T5, \XMM1, \T7
        vpclmulqdq $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqa HashKey_7(arg1), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM2, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM2, \T4
        vpclmulqdq $0x00, \T5, \XMM2, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2
        vpxor \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_6(arg1), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM3, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM3, \T4
        vpclmulqdq $0x00, \T5, \XMM3, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2
        vpxor \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_5(arg1), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM4, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM4, \T4
        vpclmulqdq $0x00, \T5, \XMM4, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2
        vpxor \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_4(arg1), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM5, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM5, \T4
        vpclmulqdq $0x00, \T5, \XMM5, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2
        vpxor \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_3(arg1), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM6, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM6, \T4
        vpclmulqdq $0x00, \T5, \XMM6, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2
        vpxor \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey_2(arg1), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM7, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM7, \T4
        vpclmulqdq $0x00, \T5, \XMM7, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2
        vpxor \T2, \XMM1, \XMM1

        ######################

        vmovdqa HashKey(arg1), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor \XMM8, \T2, \T2

        vpclmulqdq $0x11, \T5, \XMM8, \T4
        vpclmulqdq $0x00, \T5, \XMM8, \T4
        vpclmulqdq $0x00, \T3, \T2, \T2
        vpxor \T2, \XMM1, \XMM1
        vpxor \T6, \XMM1, \XMM1
        vpxor \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
                            # accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq $0x01, \T7, \T3, \T2
        vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
        vpxor \T2, \T7, \T7 # first phase of the reduction complete
        #######################################################################

        #second phase of the reduction
        vpclmulqdq $0x00, \T7, \T3, \T2
        vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq $0x10, \T7, \T3, \T4
        vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor \T2, \T4, \T4 # second phase of the reduction complete
        #######################################################################
        vpxor \T4, \T6, \T6 # the result is in T6
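
        # Each per-block step above uses the Karatsuba trick: the vpshufd/vpxor
        # pair builds the (high XOR low) halves so that three vpclmulqdq
        # operations replace four.  A minimal C sketch of that split; clmul64
        # and clmul128 are illustrative names, not part of this file:
        #
        #       #include <stdint.h>
        #
        #       /* 64x64 -> 128 carry-less multiply, i.e. one vpclmulqdq lane */
        #       static void clmul64(uint64_t a, uint64_t b,
        #                           uint64_t *hi, uint64_t *lo)
        #       {
        #               uint64_t h = 0, l = 0;
        #               int i;
        #
        #               for (i = 0; i < 64; i++) {
        #                       if ((b >> i) & 1) {
        #                               l ^= a << i;
        #                               if (i)
        #                                       h ^= a >> (64 - i);
        #                       }
        #               }
        #               *hi = h;
        #               *lo = l;
        #       }
        #
        #       /* 128x128 carry-less multiply from three 64x64 products */
        #       static void clmul128(const uint64_t a[2], const uint64_t b[2],
        #                            uint64_t r[4])
        #       {
        #               uint64_t hh[2], ll[2], mm[2];
        #
        #               clmul64(a[1], b[1], &hh[1], &hh[0]);  /* a1*b1 */
        #               clmul64(a[0], b[0], &ll[1], &ll[0]);  /* a0*b0 */
        #               clmul64(a[0] ^ a[1], b[0] ^ b[1], &mm[1], &mm[0]);
        #               mm[0] ^= hh[0] ^ ll[0];  /* recover the middle term */
        #               mm[1] ^= hh[1] ^ ll[1];
        #               r[0] = ll[0];
        #               r[1] = ll[1] ^ mm[0];    /* fold the middle term in, as  */
        #               r[2] = hh[0] ^ mm[1];    /* the vpslldq/vpsrldq shuffle  */
        #               r[3] = hh[1];            /* does just before reduction   */
        #       }
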
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC_AVX2 ENC_DEC

        #the number of pushes must equal STACK_OFFSET

        sub $VARIABLE_OFFSET, %rsp
        and $~63, %rsp # align rsp to 64 bytes

        vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey

        mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
        and $-16, %r13 # r13 = r13 - (r13 mod 16)

        jz _initial_num_blocks_is_0\@
        je _initial_num_blocks_is_7\@
        je _initial_num_blocks_is_6\@
        je _initial_num_blocks_is_5\@
        je _initial_num_blocks_is_4\@
        je _initial_num_blocks_is_3\@
        je _initial_num_blocks_is_2\@
        jmp _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC

_initial_blocks_encrypted\@:
        je _zero_cipher_left\@
        je _eight_cipher_left\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_encrypt_by_8_new\@:
        GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        jne _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp _eight_cipher_left\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jne _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_eight_cipher_left\@:
        GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8

_zero_cipher_left\@:
        jl _only_less_than_16\@

        and $15, %r13 # r13 = (arg4 mod 16)
        je _multiple_of_16_bytes\@
        # handle the last <16 Byte block separately

        vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)

        vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block

        lea SHIFT_MASK+16(%rip), %r12
        sub %r13, %r12 # adjust the shuffle mask pointer
                       # to be able to shift 16-r13 bytes
                       # (r13 is the number of bytes in plaintext mod 16)
        vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
        vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
        jmp _final_ghash_mul\@

_only_less_than_16\@:
        # check for 0 length
        and $15, %r13 # r13 = (arg4 mod 16)
        je _multiple_of_16_bytes\@
        # handle the last <16 Byte block separately

        vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)

        lea SHIFT_MASK+16(%rip), %r12
        sub %r13, %r12 # adjust the shuffle mask pointer to be
                       # able to shift 16-r13 bytes (r13 is the
                       # number of bytes in plaintext mod 16)

_get_last_16_byte_loop\@:
        movb (arg3, %r11), %al
        movb %al, TMP1(%rsp, %r11)
        jne _get_last_16_byte_loop\@

        vmovdqu TMP1(%rsp), %xmm1

        vmovdqa %xmm1, %xmm2
        vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
        vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
        vpand %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor %xmm2, %xmm14, %xmm14
        #GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

        vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
        vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor %xmm9, %xmm14, %xmm14
        #GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext

        #############################
        jle _less_than_8_bytes_left\@

        mov %rax, (arg2, %r11)
        vpsrldq $8, %xmm9, %xmm9

_less_than_8_bytes_left\@:
        movb %al, (arg2, %r11)
        jne _less_than_8_bytes_left\@
        #############################
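
        # Both tails above handle a final block of fewer than 16 bytes: the
        # counter block is still encrypted in full, only the valid bytes are
        # XORed and stored, and the block is masked (zero-padded) before it is
        # folded into GHASH.  A minimal C sketch of the encrypt-side logic;
        # gcm_partial_block and its parameters are illustrative, not kernel API:
        #
        #       #include <stddef.h>
        #       #include <stdint.h>
        #       #include <string.h>
        #
        #       /* ks: E(K, Yn); rem < 16 trailing bytes; ghash_in: padded block */
        #       static void gcm_partial_block(const uint8_t ks[16], const uint8_t *in,
        #                                     uint8_t *out, size_t rem,
        #                                     uint8_t ghash_in[16])
        #       {
        #               size_t i;
        #
        #               memset(ghash_in, 0, 16);
        #               for (i = 0; i < rem; i++) {
        #                       out[i] = in[i] ^ ks[i];
        #                       ghash_in[i] = out[i];  /* encrypt hashes ciphertext; */
        #               }                              /* decrypt would hash in[i]   */
        #       }
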
_multiple_of_16_bytes\@:
        mov arg7, %r12 # r12 = aadLen (number of bytes)
        shl $3, %r12 # convert into number of bits
        vmovd %r12d, %xmm15 # len(A) in xmm15

        shl $3, arg4 # len(C) in bits (*8)
        vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A) || 0x0000000000000000
        vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A) || len(C)

        vpxor %xmm15, %xmm14, %xmm14
        GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap

        mov arg5, %rax # rax = *Y0
        vmovdqu (%rax), %xmm9 # xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)

        vpxor %xmm14, %xmm9, %xmm9

        mov arg8, %r10 # r10 = authTag
        mov arg9, %r11 # r11 = auth_tag_len

        jmp _return_T_done\@
        vpsrldq $8, %xmm9, %xmm9
        jmp _return_T_done\@
        vmovdqu %xmm9, (%r10)
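
        # The tag path above is a direct restatement of the GCM definition:
        #
        #       S   = GHASH(H, AAD || pad || C || pad || len(AAD)_64 || len(C)_64)
        #       Tag = MSB_{auth_tag_len}( E(K, Y0) XOR S )
        #
        # with both lengths expressed in bits, which is why aadLen and arg4 are
        # shifted left by 3 before being packed into the final length block.
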
#############################################################
#void   aesni_gcm_precomp_avx_gen4
#        (gcm_data *my_ctx_data,
#         u8 *hash_subkey)# /* H, the Hash sub key input.
#                              Data starts on a 16-byte boundary. */
#############################################################
ENTRY(aesni_gcm_precomp_avx_gen4)
        #the number of pushes must equal STACK_OFFSET

        sub $VARIABLE_OFFSET, %rsp
        and $~63, %rsp # align rsp to 64 bytes

        vmovdqu (arg2), %xmm6 # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa %xmm6, %xmm2
        vpsllq $1, %xmm6, %xmm6
        vpsrlq $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor %xmm2, %xmm6, %xmm6

        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand POLY(%rip), %xmm2, %xmm2
        vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly

        PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

ENDPROC(aesni_gcm_precomp_avx_gen4)
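
        # The precompute sequence above byte-swaps H and multiplies it by x in
        # GF(2^128): a 128-bit left shift by one with a conditional XOR of POLY
        # when the top bit falls off.  A minimal C sketch of that step, assuming
        # POLY is the 128-bit constant 0xC2000000000000000000000000000001 used
        # elsewhere in this file (the function name is illustrative):
        #
        #       #include <stdint.h>
        #
        #       /* h[0]/h[1]: low/high 64 bits of the byte-reflected hash subkey */
        #       static void hashkey_shl1_mod_poly(const uint64_t h[2], uint64_t out[2])
        #       {
        #               uint64_t carry = h[1] >> 63;    /* bit shifted off the top */
        #
        #               out[1] = (h[1] << 1) | (h[0] >> 63);
        #               out[0] = h[0] << 1;
        #               if (carry) {                    /* reduce with POLY */
        #                       out[1] ^= 0xC200000000000000ULL;
        #                       out[0] ^= 0x0000000000000001ULL;
        #               }
        #       }
        #
        # PRECOMPUTE_AVX2 then derives HashKey_2 .. HashKey_8 (powers of the
        # subkey) from this value for the 8-way parallel GHASH used above.
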
###############################################################################
#void aesni_gcm_enc_avx_gen4(
#        gcm_data *my_ctx_data,  /* aligned to 16 Bytes */
#        u8 *out,                /* Ciphertext output. Encrypt in-place is allowed. */
#        const u8 *in,           /* Plaintext input */
#        u64 plaintext_len,      /* Length of data in Bytes for encryption. */
#        u8 *iv,                 /* Pre-counter block j0: 4 byte salt
#                                   (from Security Association) concatenated with 8 byte
#                                   Initialisation Vector (from IPSec ESP Payload)
#                                   concatenated with 0x00000001. 16-byte aligned pointer. */
#        const u8 *aad,          /* Additional Authentication Data (AAD) */
#        u64 aad_len,            /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8 *auth_tag,           /* Authenticated Tag output. */
#        u64 auth_tag_len)#      /* Authenticated Tag Length in bytes.
#                                   Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
        GCM_ENC_DEC_AVX2 ENC
ENDPROC(aesni_gcm_enc_avx_gen4)
###############################################################################
#void aesni_gcm_dec_avx_gen4(
#        gcm_data *my_ctx_data,  /* aligned to 16 Bytes */
#        u8 *out,                /* Plaintext output. Decrypt in-place is allowed. */
#        const u8 *in,           /* Ciphertext input */
#        u64 plaintext_len,      /* Length of data in Bytes for decryption. */
#        u8 *iv,                 /* Pre-counter block j0: 4 byte salt
#                                   (from Security Association) concatenated with 8 byte
#                                   Initialisation Vector (from IPSec ESP Payload)
#                                   concatenated with 0x00000001. 16-byte aligned pointer. */
#        const u8 *aad,          /* Additional Authentication Data (AAD) */
#        u64 aad_len,            /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8 *auth_tag,           /* Authenticated Tag output. */
#        u64 auth_tag_len)#      /* Authenticated Tag Length in bytes.
#                                   Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
        GCM_ENC_DEC_AVX2 DEC
ENDPROC(aesni_gcm_dec_avx_gen4)
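
###############################################################################
# For illustration only: the expected call order for the gen4 entry points,
# per the prototypes documented above.  The gcm_data layout, buffer setup and
# the CPU-feature/FPU handling done by the C glue code are omitted, and the
# wrapper name and its arguments are placeholders, not kernel API:
#
#       void example_rfc4106_gcm_encrypt(gcm_data *ctx, u8 *hash_subkey,
#                                        u8 *dst, const u8 *src, u64 len,
#                                        u8 *iv, const u8 *aad, u64 aad_len,
#                                        u8 *tag)
#       {
#               /* one time per key: expand H into HashKey..HashKey_8 in ctx */
#               aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
#
#               /* per message: encrypt 'len' bytes and emit a 16-byte tag */
#               aesni_gcm_enc_avx_gen4(ctx, dst, src, len, iv,
#                                      aad, aad_len, tag, 16);
#
#               /* decryption mirrors this with aesni_gcm_dec_avx_gen4(); the
#                * caller compares the computed tag in constant time */
#       }
###############################################################################
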
#endif /* CONFIG_AS_AVX2 */