1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
## This code was derived and highly optimized from the code described in the paper:
## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
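##
## As a C sketch (illustrative only, not part of this file's build; it assumes
## Linux's u8 typedef and the <string.h> helpers), the pre-counter block
## described above is assembled as:
##
##      static void build_pre_counter_block(u8 j0[16], const u8 salt[4],
##                                          const u8 iv[8])
##      {
##              memcpy(j0, salt, 4);      /* Salt (from the SA)         */
##              memcpy(j0 + 4, iv, 8);    /* IV from the IPsec header   */
##              j0[12] = 0x00;            /* 32-bit counter portion ... */
##              j0[13] = 0x00;
##              j0[14] = 0x00;
##              j0[15] = 0x01;            /* ... initialized to 1       */
##      }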
73 ## AAD padded to 128 bits with 0
## for example, assume AAD is a u32 vector
##
## if AAD is 8 bytes:
## AAD[2] = {A0, A1}
## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2}
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
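##
## A C sketch of the AAD padding described above (illustrative only, not
## part of this file; aadLen is assumed to be 8, 12 or 16):
##
##      static void pad_aad_block(u8 block[16], const u8 *aad, u64 aadLen)
##      {
##              memset(block, 0, 16);         /* pad to 128 bits with 0 */
##              memcpy(block, aad, aadLen);   /* A0, A1, ...            */
##      }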
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one-tab and two-tab indentation is used. one tab is
## for the GHASH part, two tabs are for the AES part.
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
## define the fields of the gcm aes context
163 # u8 expanded_keys[16*11] store expanded keys
164 # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
165 # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
166 # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
167 # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
168 # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
169 # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
170 # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
171 # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
172 # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
173 # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
174 # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
175 # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
176 # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
177 # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
178 # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
179 # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
182 HashKey = 16*11 # store HashKey <<1 mod poly here
183 HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
184 HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
185 HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
186 HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
187 HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
188 HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
189 HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
190 HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
191 HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
192 HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
193 HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
194 HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
195 HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
196 HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
197 HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
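
# An equivalent C view of the layout above (illustrative only; this is not
# the kernel's struct definition, the field names follow the comments):
#
#       struct gcm_data_layout {
#               u8 expanded_keys[16*11];  /* AES round keys                     */
#               u8 shifted_hkey[8][16];   /* HashKey^i <<1 mod poly, i = 1..8   */
#               u8 shifted_hkey_k[8][16]; /* XOR of the 64-bit halves of each   */
#                                         /* HashKey^i <<1 mod poly (Karatsuba) */
#       };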
205 #define arg7 STACK_OFFSET+8*1(%r14)
206 #define arg8 STACK_OFFSET+8*2(%r14)
207 #define arg9 STACK_OFFSET+8*3(%r14)
217 .macro define_reg r n
# 4 registers are pushed onto the stack before %r14 is set, so STACK_OFFSET
# must account for them when addressing the stack arguments (arg7..arg9)
231 TMP1 = 16*0 # Temporary storage for AAD
232 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
233 TMP3 = 16*2 # Temporary storage for AES State 3
234 TMP4 = 16*3 # Temporary storage for AES State 4
235 TMP5 = 16*4 # Temporary storage for AES State 5
236 TMP6 = 16*5 # Temporary storage for AES State 6
237 TMP7 = 16*6 # Temporary storage for AES State 7
238 TMP8 = 16*7 # Temporary storage for AES State 8
240 VARIABLE_OFFSET = 16*8
242 ################################
244 ################################
246 # Encryption of a single block
247 .macro ENCRYPT_SINGLE_BLOCK XMM0
248 vpxor (arg1), \XMM0, \XMM0
252 vaesenc 16*i(arg1), \XMM0, \XMM0
256 vaesenclast 16*10(arg1), \XMM0, \XMM0
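
# In C-like pseudocode (illustrative only), ENCRYPT_SINGLE_BLOCK computes a
# plain AES-128 encryption of \XMM0 with the expanded keys at arg1:
#
#       block ^= round_key[0];
#       for (i = 1; i <= 9; i++)
#               block = aesenc(block, round_key[i]);
#       block = aesenclast(block, round_key[10]);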
260 ###############################################################################
261 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
262 # Input: A and B (128-bits each, bit-reflected)
263 # Output: C = A*B*x mod poly, (i.e. >>1 )
264 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
265 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
266 ###############################################################################
267 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
269 vpshufd $0b01001110, \GH, \T2
270 vpshufd $0b01001110, \HK, \T3
271 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
272 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
274 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
275 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
276 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
278 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
280 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
281 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
283 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
285 #first phase of the reduction
286 vpslld $31, \GH, \T2 # packed right shifting << 31
vpslld $30, \GH, \T3 # packed right shifting << 30
vpslld $25, \GH, \T4 # packed right shifting << 25
290 vpxor \T3, \T2, \T2 # xor the shifted versions
293 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
295 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
296 vpxor \T2, \GH, \GH # first phase of the reduction complete
298 #second phase of the reduction
300 vpsrld $1,\GH, \T2 # packed left shifting >> 1
301 vpsrld $2,\GH, \T3 # packed left shifting >> 2
302 vpsrld $7,\GH, \T4 # packed left shifting >> 7
303 vpxor \T3, \T2, \T2 # xor the shifted versions
308 vpxor \T1, \GH, \GH # the result is in GH
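
# For reference, each vpclmulqdq above is a 64x64 carry-less multiplication;
# a C sketch of that primitive (illustrative only, assuming a u64 typedef):
#
#       static void clmul64(u64 a, u64 b, u64 *hi, u64 *lo)
#       {
#               u64 h = 0, l = 0;
#               int i;
#
#               for (i = 0; i < 64; i++) {
#                       if ((b >> i) & 1) {
#                               l ^= a << i;
#                               if (i)
#                                       h ^= a >> (64 - i);
#                       }
#               }
#               *hi = h;
#               *lo = l;
#       }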
313 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
318 vpshufd $0b01001110, \T5, \T1
320 vmovdqa \T1, HashKey_k(arg1)
322 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
323 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
324 vpshufd $0b01001110, \T5, \T1
326 vmovdqa \T1, HashKey_2_k(arg1)
328 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
329 vmovdqa \T5, HashKey_3(arg1)
330 vpshufd $0b01001110, \T5, \T1
332 vmovdqa \T1, HashKey_3_k(arg1)
334 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
335 vmovdqa \T5, HashKey_4(arg1)
336 vpshufd $0b01001110, \T5, \T1
338 vmovdqa \T1, HashKey_4_k(arg1)
340 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
341 vmovdqa \T5, HashKey_5(arg1)
342 vpshufd $0b01001110, \T5, \T1
344 vmovdqa \T1, HashKey_5_k(arg1)
346 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
347 vmovdqa \T5, HashKey_6(arg1)
348 vpshufd $0b01001110, \T5, \T1
350 vmovdqa \T1, HashKey_6_k(arg1)
352 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
353 vmovdqa \T5, HashKey_7(arg1)
354 vpshufd $0b01001110, \T5, \T1
356 vmovdqa \T1, HashKey_7_k(arg1)
358 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
359 vmovdqa \T5, HashKey_8(arg1)
360 vpshufd $0b01001110, \T5, \T1
362 vmovdqa \T1, HashKey_8_k(arg1)
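
# As a C-like sketch (illustrative only; ghash_mul(), high64() and low64()
# are hypothetical helpers), PRECOMPUTE_AVX stores:
#
#       for (i = 2; i <= 8; i++)
#               hkey[i] = ghash_mul(hkey[i-1], hkey[1]);      /* HashKey^i <<1 mod poly */
#       for (i = 1; i <= 8; i++)
#               hkey_k[i] = high64(hkey[i]) ^ low64(hkey[i]); /* Karatsuba helpers      */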
## if a = number of total plaintext bytes
## b = floor(a/16) = number of complete 16-byte blocks
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as pointers only, not modified
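##
## a rough C-like sketch of the flow below (illustrative only; for ENC the
## freshly produced ciphertext is folded into GHASH, for DEC the input
## ciphertext is):
##
##      ghash = ghash_mul(byte_reflect(padded_aad), H);
##      for (k = 0; k < num_initial_blocks; k++) {
##              ctr = ctr + 1;
##              out[k] = in[k] ^ AES_K(ctr);
##              ghash = ghash_mul(ghash ^ byte_reflect(ctext[k]), H);
##      }
##      /* then 8 counter blocks are prepared and encrypted up front for
##         the 8-blocks-at-a-time main loop */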
373 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
374 i = (8-\num_initial_blocks)
377 mov arg6, %r10 # r10 = AAD
378 mov arg7, %r12 # r12 = aadLen
383 vpxor reg_i, reg_i, reg_i
386 vpslldq $12, \T1, \T1
387 vpsrldq $4, reg_i, reg_i
388 vpxor \T1, reg_i, reg_i
396 je _get_AAD_loop2_done\@
400 vpsrldq $4, reg_i, reg_i
405 _get_AAD_loop2_done\@:
407 #byte-reflect the AAD data
408 vpshufb SHUF_MASK(%rip), reg_i, reg_i
410 # initialize the data pointer offset as zero
413 # start AES for num_initial_blocks blocks
414 mov arg5, %rax # rax = *Y0
415 vmovdqu (%rax), \CTR # CTR = Y0
416 vpshufb SHUF_MASK(%rip), \CTR, \CTR
419 i = (9-\num_initial_blocks)
421 .rep \num_initial_blocks
422 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
424 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
429 vmovdqa (arg1), \T_key
430 i = (9-\num_initial_blocks)
432 .rep \num_initial_blocks
433 vpxor \T_key, reg_i, reg_i
441 vmovdqa 16*j(arg1), \T_key
442 i = (9-\num_initial_blocks)
444 .rep \num_initial_blocks
445 vaesenc \T_key, reg_i, reg_i
455 vmovdqa 16*10(arg1), \T_key
456 i = (9-\num_initial_blocks)
458 .rep \num_initial_blocks
459 vaesenclast \T_key, reg_i, reg_i
464 i = (9-\num_initial_blocks)
466 .rep \num_initial_blocks
467 vmovdqu (arg3, %r11), \T1
468 vpxor \T1, reg_i, reg_i
469 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
474 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
480 i = (8-\num_initial_blocks)
481 j = (9-\num_initial_blocks)
483 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
485 .rep \num_initial_blocks
486 vpxor reg_i, reg_j, reg_j
487 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
492 # XMM8 has the combined result here
494 vmovdqa \XMM8, TMP1(%rsp)
498 jl _initial_blocks_done\@ # no need for precomputed constants
500 ###############################################################################
# prepare 8 counter blocks (XMM1-XMM8) and encrypt them to process the
# first 8 blocks of the main loop
502 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
504 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
506 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
508 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
510 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
512 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
514 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
516 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
518 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
520 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
522 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
524 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
526 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
528 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
530 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
532 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
534 vmovdqa (arg1), \T_key
535 vpxor \T_key, \XMM1, \XMM1
536 vpxor \T_key, \XMM2, \XMM2
537 vpxor \T_key, \XMM3, \XMM3
538 vpxor \T_key, \XMM4, \XMM4
539 vpxor \T_key, \XMM5, \XMM5
540 vpxor \T_key, \XMM6, \XMM6
541 vpxor \T_key, \XMM7, \XMM7
542 vpxor \T_key, \XMM8, \XMM8
547 vmovdqa 16*i(arg1), \T_key
548 vaesenc \T_key, \XMM1, \XMM1
549 vaesenc \T_key, \XMM2, \XMM2
550 vaesenc \T_key, \XMM3, \XMM3
551 vaesenc \T_key, \XMM4, \XMM4
552 vaesenc \T_key, \XMM5, \XMM5
553 vaesenc \T_key, \XMM6, \XMM6
554 vaesenc \T_key, \XMM7, \XMM7
555 vaesenc \T_key, \XMM8, \XMM8
561 vmovdqa 16*i(arg1), \T_key
562 vaesenclast \T_key, \XMM1, \XMM1
563 vaesenclast \T_key, \XMM2, \XMM2
564 vaesenclast \T_key, \XMM3, \XMM3
565 vaesenclast \T_key, \XMM4, \XMM4
566 vaesenclast \T_key, \XMM5, \XMM5
567 vaesenclast \T_key, \XMM6, \XMM6
568 vaesenclast \T_key, \XMM7, \XMM7
569 vaesenclast \T_key, \XMM8, \XMM8
571 vmovdqu (arg3, %r11), \T1
572 vpxor \T1, \XMM1, \XMM1
573 vmovdqu \XMM1, (arg2 , %r11)
578 vmovdqu 16*1(arg3, %r11), \T1
579 vpxor \T1, \XMM2, \XMM2
580 vmovdqu \XMM2, 16*1(arg2 , %r11)
585 vmovdqu 16*2(arg3, %r11), \T1
586 vpxor \T1, \XMM3, \XMM3
587 vmovdqu \XMM3, 16*2(arg2 , %r11)
592 vmovdqu 16*3(arg3, %r11), \T1
593 vpxor \T1, \XMM4, \XMM4
594 vmovdqu \XMM4, 16*3(arg2 , %r11)
599 vmovdqu 16*4(arg3, %r11), \T1
600 vpxor \T1, \XMM5, \XMM5
601 vmovdqu \XMM5, 16*4(arg2 , %r11)
606 vmovdqu 16*5(arg3, %r11), \T1
607 vpxor \T1, \XMM6, \XMM6
608 vmovdqu \XMM6, 16*5(arg2 , %r11)
613 vmovdqu 16*6(arg3, %r11), \T1
614 vpxor \T1, \XMM7, \XMM7
615 vmovdqu \XMM7, 16*6(arg2 , %r11)
620 vmovdqu 16*7(arg3, %r11), \T1
621 vpxor \T1, \XMM8, \XMM8
622 vmovdqu \XMM8, 16*7(arg2 , %r11)
629 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
630 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
631 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
632 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
633 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
634 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
635 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
636 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
637 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
639 ###############################################################################
641 _initial_blocks_done\@:
645 # encrypt 8 blocks at a time
646 # ghash the 8 previously encrypted ciphertext blocks
647 # arg1, arg2, arg3 are used as pointers only, not modified
648 # r11 is the data offset value
649 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
652 vmovdqa \XMM2, TMP2(%rsp)
653 vmovdqa \XMM3, TMP3(%rsp)
654 vmovdqa \XMM4, TMP4(%rsp)
655 vmovdqa \XMM5, TMP5(%rsp)
656 vmovdqa \XMM6, TMP6(%rsp)
657 vmovdqa \XMM7, TMP7(%rsp)
658 vmovdqa \XMM8, TMP8(%rsp)
660 .if \loop_idx == in_order
661 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
662 vpaddd ONE(%rip), \XMM1, \XMM2
663 vpaddd ONE(%rip), \XMM2, \XMM3
664 vpaddd ONE(%rip), \XMM3, \XMM4
665 vpaddd ONE(%rip), \XMM4, \XMM5
666 vpaddd ONE(%rip), \XMM5, \XMM6
667 vpaddd ONE(%rip), \XMM6, \XMM7
668 vpaddd ONE(%rip), \XMM7, \XMM8
671 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
672 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
673 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
674 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
675 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
676 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
677 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
678 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
680 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
681 vpaddd ONEf(%rip), \XMM1, \XMM2
682 vpaddd ONEf(%rip), \XMM2, \XMM3
683 vpaddd ONEf(%rip), \XMM3, \XMM4
684 vpaddd ONEf(%rip), \XMM4, \XMM5
685 vpaddd ONEf(%rip), \XMM5, \XMM6
686 vpaddd ONEf(%rip), \XMM6, \XMM7
687 vpaddd ONEf(%rip), \XMM7, \XMM8
692 #######################################################################
695 vpxor \T1, \XMM1, \XMM1
696 vpxor \T1, \XMM2, \XMM2
697 vpxor \T1, \XMM3, \XMM3
698 vpxor \T1, \XMM4, \XMM4
699 vpxor \T1, \XMM5, \XMM5
700 vpxor \T1, \XMM6, \XMM6
701 vpxor \T1, \XMM7, \XMM7
702 vpxor \T1, \XMM8, \XMM8
704 #######################################################################
710 vmovdqu 16*1(arg1), \T1
711 vaesenc \T1, \XMM1, \XMM1
712 vaesenc \T1, \XMM2, \XMM2
713 vaesenc \T1, \XMM3, \XMM3
714 vaesenc \T1, \XMM4, \XMM4
715 vaesenc \T1, \XMM5, \XMM5
716 vaesenc \T1, \XMM6, \XMM6
717 vaesenc \T1, \XMM7, \XMM7
718 vaesenc \T1, \XMM8, \XMM8
720 vmovdqu 16*2(arg1), \T1
721 vaesenc \T1, \XMM1, \XMM1
722 vaesenc \T1, \XMM2, \XMM2
723 vaesenc \T1, \XMM3, \XMM3
724 vaesenc \T1, \XMM4, \XMM4
725 vaesenc \T1, \XMM5, \XMM5
726 vaesenc \T1, \XMM6, \XMM6
727 vaesenc \T1, \XMM7, \XMM7
728 vaesenc \T1, \XMM8, \XMM8
731 #######################################################################
733 vmovdqa HashKey_8(arg1), \T5
734 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
735 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
737 vpshufd $0b01001110, \T2, \T6
740 vmovdqa HashKey_8_k(arg1), \T5
741 vpclmulqdq $0x00, \T5, \T6, \T6
743 vmovdqu 16*3(arg1), \T1
744 vaesenc \T1, \XMM1, \XMM1
745 vaesenc \T1, \XMM2, \XMM2
746 vaesenc \T1, \XMM3, \XMM3
747 vaesenc \T1, \XMM4, \XMM4
748 vaesenc \T1, \XMM5, \XMM5
749 vaesenc \T1, \XMM6, \XMM6
750 vaesenc \T1, \XMM7, \XMM7
751 vaesenc \T1, \XMM8, \XMM8
753 vmovdqa TMP2(%rsp), \T1
754 vmovdqa HashKey_7(arg1), \T5
755 vpclmulqdq $0x11, \T5, \T1, \T3
757 vpclmulqdq $0x00, \T5, \T1, \T3
760 vpshufd $0b01001110, \T1, \T3
762 vmovdqa HashKey_7_k(arg1), \T5
763 vpclmulqdq $0x10, \T5, \T3, \T3
766 vmovdqu 16*4(arg1), \T1
767 vaesenc \T1, \XMM1, \XMM1
768 vaesenc \T1, \XMM2, \XMM2
769 vaesenc \T1, \XMM3, \XMM3
770 vaesenc \T1, \XMM4, \XMM4
771 vaesenc \T1, \XMM5, \XMM5
772 vaesenc \T1, \XMM6, \XMM6
773 vaesenc \T1, \XMM7, \XMM7
774 vaesenc \T1, \XMM8, \XMM8
776 #######################################################################
778 vmovdqa TMP3(%rsp), \T1
779 vmovdqa HashKey_6(arg1), \T5
780 vpclmulqdq $0x11, \T5, \T1, \T3
782 vpclmulqdq $0x00, \T5, \T1, \T3
785 vpshufd $0b01001110, \T1, \T3
787 vmovdqa HashKey_6_k(arg1), \T5
788 vpclmulqdq $0x10, \T5, \T3, \T3
791 vmovdqu 16*5(arg1), \T1
792 vaesenc \T1, \XMM1, \XMM1
793 vaesenc \T1, \XMM2, \XMM2
794 vaesenc \T1, \XMM3, \XMM3
795 vaesenc \T1, \XMM4, \XMM4
796 vaesenc \T1, \XMM5, \XMM5
797 vaesenc \T1, \XMM6, \XMM6
798 vaesenc \T1, \XMM7, \XMM7
799 vaesenc \T1, \XMM8, \XMM8
801 vmovdqa TMP4(%rsp), \T1
802 vmovdqa HashKey_5(arg1), \T5
803 vpclmulqdq $0x11, \T5, \T1, \T3
805 vpclmulqdq $0x00, \T5, \T1, \T3
808 vpshufd $0b01001110, \T1, \T3
810 vmovdqa HashKey_5_k(arg1), \T5
811 vpclmulqdq $0x10, \T5, \T3, \T3
814 vmovdqu 16*6(arg1), \T1
815 vaesenc \T1, \XMM1, \XMM1
816 vaesenc \T1, \XMM2, \XMM2
817 vaesenc \T1, \XMM3, \XMM3
818 vaesenc \T1, \XMM4, \XMM4
819 vaesenc \T1, \XMM5, \XMM5
820 vaesenc \T1, \XMM6, \XMM6
821 vaesenc \T1, \XMM7, \XMM7
822 vaesenc \T1, \XMM8, \XMM8
825 vmovdqa TMP5(%rsp), \T1
826 vmovdqa HashKey_4(arg1), \T5
827 vpclmulqdq $0x11, \T5, \T1, \T3
829 vpclmulqdq $0x00, \T5, \T1, \T3
832 vpshufd $0b01001110, \T1, \T3
834 vmovdqa HashKey_4_k(arg1), \T5
835 vpclmulqdq $0x10, \T5, \T3, \T3
838 vmovdqu 16*7(arg1), \T1
839 vaesenc \T1, \XMM1, \XMM1
840 vaesenc \T1, \XMM2, \XMM2
841 vaesenc \T1, \XMM3, \XMM3
842 vaesenc \T1, \XMM4, \XMM4
843 vaesenc \T1, \XMM5, \XMM5
844 vaesenc \T1, \XMM6, \XMM6
845 vaesenc \T1, \XMM7, \XMM7
846 vaesenc \T1, \XMM8, \XMM8
848 vmovdqa TMP6(%rsp), \T1
849 vmovdqa HashKey_3(arg1), \T5
850 vpclmulqdq $0x11, \T5, \T1, \T3
852 vpclmulqdq $0x00, \T5, \T1, \T3
855 vpshufd $0b01001110, \T1, \T3
857 vmovdqa HashKey_3_k(arg1), \T5
858 vpclmulqdq $0x10, \T5, \T3, \T3
862 vmovdqu 16*8(arg1), \T1
863 vaesenc \T1, \XMM1, \XMM1
864 vaesenc \T1, \XMM2, \XMM2
865 vaesenc \T1, \XMM3, \XMM3
866 vaesenc \T1, \XMM4, \XMM4
867 vaesenc \T1, \XMM5, \XMM5
868 vaesenc \T1, \XMM6, \XMM6
869 vaesenc \T1, \XMM7, \XMM7
870 vaesenc \T1, \XMM8, \XMM8
872 vmovdqa TMP7(%rsp), \T1
873 vmovdqa HashKey_2(arg1), \T5
874 vpclmulqdq $0x11, \T5, \T1, \T3
876 vpclmulqdq $0x00, \T5, \T1, \T3
879 vpshufd $0b01001110, \T1, \T3
881 vmovdqa HashKey_2_k(arg1), \T5
882 vpclmulqdq $0x10, \T5, \T3, \T3
885 #######################################################################
887 vmovdqu 16*9(arg1), \T5
888 vaesenc \T5, \XMM1, \XMM1
889 vaesenc \T5, \XMM2, \XMM2
890 vaesenc \T5, \XMM3, \XMM3
891 vaesenc \T5, \XMM4, \XMM4
892 vaesenc \T5, \XMM5, \XMM5
893 vaesenc \T5, \XMM6, \XMM6
894 vaesenc \T5, \XMM7, \XMM7
895 vaesenc \T5, \XMM8, \XMM8
897 vmovdqa TMP8(%rsp), \T1
898 vmovdqa HashKey(arg1), \T5
899 vpclmulqdq $0x11, \T5, \T1, \T3
901 vpclmulqdq $0x00, \T5, \T1, \T3
904 vpshufd $0b01001110, \T1, \T3
906 vmovdqa HashKey_k(arg1), \T5
907 vpclmulqdq $0x10, \T5, \T3, \T3
913 vmovdqu 16*10(arg1), \T5
919 vpxor 16*i(arg3, %r11), \T5, \T2
921 vaesenclast \T2, reg_j, reg_j
923 vaesenclast \T2, reg_j, \T3
924 vmovdqu 16*i(arg3, %r11), reg_j
925 vmovdqu \T3, 16*i(arg2, %r11)
931 #######################################################################
934 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
937 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
941 #######################################################################
942 #first phase of the reduction
943 #######################################################################
944 vpslld $31, \T7, \T2 # packed right shifting << 31
vpslld $30, \T7, \T3 # packed right shifting << 30
vpslld $25, \T7, \T4 # packed right shifting << 25
948 vpxor \T3, \T2, \T2 # xor the shifted versions
951 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
953 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
954 vpxor \T2, \T7, \T7 # first phase of the reduction complete
955 #######################################################################
957 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
958 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
959 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
960 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
961 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
962 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
963 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
964 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
967 #######################################################################
968 #second phase of the reduction
969 vpsrld $1, \T7, \T2 # packed left shifting >> 1
970 vpsrld $2, \T7, \T3 # packed left shifting >> 2
971 vpsrld $7, \T7, \T4 # packed left shifting >> 7
972 vpxor \T3, \T2, \T2 # xor the shifted versions
977 vpxor \T7, \T6, \T6 # the result is in T6
978 #######################################################################
980 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
981 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
982 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
983 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
984 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
985 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
986 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
987 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
990 vpxor \T6, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
998 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1003 vpshufd $0b01001110, \XMM1, \T2
1004 vpxor \XMM1, \T2, \T2
1005 vmovdqa HashKey_8(arg1), \T5
1006 vpclmulqdq $0x11, \T5, \XMM1, \T6
1007 vpclmulqdq $0x00, \T5, \XMM1, \T7
1009 vmovdqa HashKey_8_k(arg1), \T3
1010 vpclmulqdq $0x00, \T3, \T2, \XMM1
1012 ######################
1014 vpshufd $0b01001110, \XMM2, \T2
1015 vpxor \XMM2, \T2, \T2
1016 vmovdqa HashKey_7(arg1), \T5
1017 vpclmulqdq $0x11, \T5, \XMM2, \T4
1020 vpclmulqdq $0x00, \T5, \XMM2, \T4
1023 vmovdqa HashKey_7_k(arg1), \T3
1024 vpclmulqdq $0x00, \T3, \T2, \T2
1025 vpxor \T2, \XMM1, \XMM1
1027 ######################
1029 vpshufd $0b01001110, \XMM3, \T2
1030 vpxor \XMM3, \T2, \T2
1031 vmovdqa HashKey_6(arg1), \T5
1032 vpclmulqdq $0x11, \T5, \XMM3, \T4
1035 vpclmulqdq $0x00, \T5, \XMM3, \T4
1038 vmovdqa HashKey_6_k(arg1), \T3
1039 vpclmulqdq $0x00, \T3, \T2, \T2
1040 vpxor \T2, \XMM1, \XMM1
1042 ######################
1044 vpshufd $0b01001110, \XMM4, \T2
1045 vpxor \XMM4, \T2, \T2
1046 vmovdqa HashKey_5(arg1), \T5
1047 vpclmulqdq $0x11, \T5, \XMM4, \T4
1050 vpclmulqdq $0x00, \T5, \XMM4, \T4
1053 vmovdqa HashKey_5_k(arg1), \T3
1054 vpclmulqdq $0x00, \T3, \T2, \T2
1055 vpxor \T2, \XMM1, \XMM1
1057 ######################
1059 vpshufd $0b01001110, \XMM5, \T2
1060 vpxor \XMM5, \T2, \T2
1061 vmovdqa HashKey_4(arg1), \T5
1062 vpclmulqdq $0x11, \T5, \XMM5, \T4
1065 vpclmulqdq $0x00, \T5, \XMM5, \T4
1068 vmovdqa HashKey_4_k(arg1), \T3
1069 vpclmulqdq $0x00, \T3, \T2, \T2
1070 vpxor \T2, \XMM1, \XMM1
1072 ######################
1074 vpshufd $0b01001110, \XMM6, \T2
1075 vpxor \XMM6, \T2, \T2
1076 vmovdqa HashKey_3(arg1), \T5
1077 vpclmulqdq $0x11, \T5, \XMM6, \T4
1080 vpclmulqdq $0x00, \T5, \XMM6, \T4
1083 vmovdqa HashKey_3_k(arg1), \T3
1084 vpclmulqdq $0x00, \T3, \T2, \T2
1085 vpxor \T2, \XMM1, \XMM1
1087 ######################
1089 vpshufd $0b01001110, \XMM7, \T2
1090 vpxor \XMM7, \T2, \T2
1091 vmovdqa HashKey_2(arg1), \T5
1092 vpclmulqdq $0x11, \T5, \XMM7, \T4
1095 vpclmulqdq $0x00, \T5, \XMM7, \T4
1098 vmovdqa HashKey_2_k(arg1), \T3
1099 vpclmulqdq $0x00, \T3, \T2, \T2
1100 vpxor \T2, \XMM1, \XMM1
1102 ######################
1104 vpshufd $0b01001110, \XMM8, \T2
1105 vpxor \XMM8, \T2, \T2
1106 vmovdqa HashKey(arg1), \T5
1107 vpclmulqdq $0x11, \T5, \XMM8, \T4
1110 vpclmulqdq $0x00, \T5, \XMM8, \T4
1113 vmovdqa HashKey_k(arg1), \T3
1114 vpclmulqdq $0x00, \T3, \T2, \T2
1116 vpxor \T2, \XMM1, \XMM1
1117 vpxor \T6, \XMM1, \XMM1
1118 vpxor \T7, \XMM1, \T2
1123 vpslldq $8, \T2, \T4
1124 vpsrldq $8, \T2, \T2
1127 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1128 # the accumulated carry-less multiplications
1130 #######################################################################
1131 #first phase of the reduction
1132 vpslld $31, \T7, \T2 # packed right shifting << 31
vpslld $30, \T7, \T3 # packed right shifting << 30
vpslld $25, \T7, \T4 # packed right shifting << 25
1136 vpxor \T3, \T2, \T2 # xor the shifted versions
1139 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1141 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1142 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1143 #######################################################################
1146 #second phase of the reduction
1147 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1148 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1149 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1150 vpxor \T3, \T2, \T2 # xor the shifted versions
1155 vpxor \T7, \T6, \T6 # the result is in T6
1160 # combined for GCM encrypt and decrypt functions
1161 # clobbering all xmm registers
1162 # clobbering r10, r11, r12, r13, r14, r15
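#
# high-level flow as pseudocode (illustrative only):
#
#       encrypt/decrypt 0..7 initial blocks so the remaining length is a
#           multiple of 8 blocks, GHASHing the AAD and those blocks;
#       while (at least 8 blocks remain)
#               GHASH_8_ENCRYPT_8_PARALLEL_AVX;  /* AES and GHASH interleaved */
#       GHASH_LAST_8_AVX;                        /* fold the last 8 ciphertext blocks */
#       handle the final partial (<16 byte) block, if any;
#       fold len(A)||len(C) into GHASH, then tag = E(K, Y0) XOR final GHASH value;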
1163 .macro GCM_ENC_DEC_AVX ENC_DEC
1165 #the number of pushes must equal STACK_OFFSET
1176 sub $VARIABLE_OFFSET, %rsp
1177 and $~63, %rsp # align rsp to 64 bytes
1180 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1182 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1183 and $-16, %r13 # r13 = r13 - (r13 mod 16)
1188 jz _initial_num_blocks_is_0\@
1191 je _initial_num_blocks_is_7\@
1193 je _initial_num_blocks_is_6\@
1195 je _initial_num_blocks_is_5\@
1197 je _initial_num_blocks_is_4\@
1199 je _initial_num_blocks_is_3\@
1201 je _initial_num_blocks_is_2\@
1203 jmp _initial_num_blocks_is_1\@
1205 _initial_num_blocks_is_7\@:
1206 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1208 jmp _initial_blocks_encrypted\@
1210 _initial_num_blocks_is_6\@:
1211 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1213 jmp _initial_blocks_encrypted\@
1215 _initial_num_blocks_is_5\@:
1216 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1218 jmp _initial_blocks_encrypted\@
1220 _initial_num_blocks_is_4\@:
1221 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1223 jmp _initial_blocks_encrypted\@
1225 _initial_num_blocks_is_3\@:
1226 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1228 jmp _initial_blocks_encrypted\@
1230 _initial_num_blocks_is_2\@:
1231 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1233 jmp _initial_blocks_encrypted\@
1235 _initial_num_blocks_is_1\@:
1236 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1238 jmp _initial_blocks_encrypted\@
1240 _initial_num_blocks_is_0\@:
1241 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1244 _initial_blocks_encrypted\@:
1246 je _zero_cipher_left\@
1249 je _eight_cipher_left\@
1256 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1259 _encrypt_by_8_new\@:
1266 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1269 jne _encrypt_by_8_new\@
1271 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1272 jmp _eight_cipher_left\@
1275 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1277 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1278 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1281 jne _encrypt_by_8_new\@
1283 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1288 _eight_cipher_left\@:
1289 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1292 _zero_cipher_left\@:
1294 jl _only_less_than_16\@
1297 and $15, %r13 # r13 = (arg4 mod 16)
1299 je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
1304 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1305 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1306 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1310 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1312 lea SHIFT_MASK+16(%rip), %r12
1313 sub %r13, %r12 # adjust the shuffle mask pointer to be
1314 # able to shift 16-r13 bytes (r13 is the
1315 # number of bytes in plaintext mod 16)
1316 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1317 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
1318 jmp _final_ghash_mul\@
1320 _only_less_than_16\@:
1321 # check for 0 length
1323 and $15, %r13 # r13 = (arg4 mod 16)
1325 je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
1330 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1331 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1332 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1335 lea SHIFT_MASK+16(%rip), %r12
1336 sub %r13, %r12 # adjust the shuffle mask pointer to be
1337 # able to shift 16-r13 bytes (r13 is the
1338 # number of bytes in plaintext mod 16)
1340 _get_last_16_byte_loop\@:
1341 movb (arg3, %r11), %al
1342 movb %al, TMP1 (%rsp , %r11)
1345 jne _get_last_16_byte_loop\@
1347 vmovdqu TMP1(%rsp), %xmm1
1353 vmovdqa %xmm1, %xmm2
1354 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1355 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1356 # mask out top 16-r13 bytes of xmm9
1357 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1358 vpand %xmm1, %xmm2, %xmm2
1359 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1360 vpxor %xmm2, %xmm14, %xmm14
1361 #GHASH computation for the last <16 Byte block
1362 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1366 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1367 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1368 # mask out top 16-r13 bytes of xmm9
1369 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1370 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1371 vpxor %xmm9, %xmm14, %xmm14
1372 #GHASH computation for the last <16 Byte block
1373 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1376 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1380 #############################
1384 jle _less_than_8_bytes_left\@
1386 mov %rax, (arg2 , %r11)
1388 vpsrldq $8, %xmm9, %xmm9
1392 _less_than_8_bytes_left\@:
1393 movb %al, (arg2 , %r11)
1397 jne _less_than_8_bytes_left\@
1398 #############################
1400 _multiple_of_16_bytes\@:
1401 mov arg7, %r12 # r12 = aadLen (number of bytes)
1402 shl $3, %r12 # convert into number of bits
1403 vmovd %r12d, %xmm15 # len(A) in xmm15
shl $3, arg4 # len(C) in bits (*8)
1407 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1408 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
1410 vpxor %xmm15, %xmm14, %xmm14
1411 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1412 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1414 mov arg5, %rax # rax = *Y0
1415 vmovdqu (%rax), %xmm9 # xmm9 = Y0
1417 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1419 vpxor %xmm14, %xmm9, %xmm9
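# xmm9 now holds the complete 16-byte tag, E(K, Y0) XOR the final GHASH
# value; it is truncated to auth_tag_len (8, 12 or 16) bytes below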
1424 mov arg8, %r10 # r10 = authTag
1425 mov arg9, %r11 # r11 = auth_tag_len
1436 jmp _return_T_done\@
1440 vpsrldq $8, %xmm9, %xmm9
1443 jmp _return_T_done\@
1446 vmovdqu %xmm9, (%r10)
1458 #############################################################
1459 #void aesni_gcm_precomp_avx_gen2
1460 # (gcm_data *my_ctx_data,
# u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1462 #############################################################
1463 ENTRY(aesni_gcm_precomp_avx_gen2)
1464 #the number of pushes must equal STACK_OFFSET
1474 sub $VARIABLE_OFFSET, %rsp
1475 and $~63, %rsp # align rsp to 64 bytes
1477 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1479 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1480 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1481 vmovdqa %xmm6, %xmm2
1482 vpsllq $1, %xmm6, %xmm6
1483 vpsrlq $63, %xmm2, %xmm2
1484 vmovdqa %xmm2, %xmm1
1485 vpslldq $8, %xmm2, %xmm2
1486 vpsrldq $8, %xmm1, %xmm1
1487 vpor %xmm2, %xmm6, %xmm6
1489 vpshufd $0b00100100, %xmm1, %xmm2
1490 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1491 vpand POLY(%rip), %xmm2, %xmm2
1492 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1493 #######################################################################
1494 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
1497 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1506 ENDPROC(aesni_gcm_precomp_avx_gen2)
1508 ###############################################################################
1509 #void aesni_gcm_enc_avx_gen2(
1510 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1511 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1512 # const u8 *in, /* Plaintext input */
1513 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1514 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1515 # (from Security Association) concatenated with 8 byte
1516 # Initialisation Vector (from IPSec ESP Payload)
1517 # concatenated with 0x00000001. 16-byte aligned pointer. */
1518 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1519 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1520 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1522 # Valid values are 16 (most likely), 12 or 8. */
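#
# Example call from C (sketch only; the argument values here are made up and
# the real kernel caller may differ):
#
#       aesni_gcm_enc_avx_gen2(ctx, ciphertext, plaintext, plaintext_len,
#                              iv, aad, 12, auth_tag, 16);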
1523 ###############################################################################
1524 ENTRY(aesni_gcm_enc_avx_gen2)
1527 ENDPROC(aesni_gcm_enc_avx_gen2)
1529 ###############################################################################
1530 #void aesni_gcm_dec_avx_gen2(
1531 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1532 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1533 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len, /* Length of data in Bytes for decryption. */
1535 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1536 # (from Security Association) concatenated with 8 byte
1537 # Initialisation Vector (from IPSec ESP Payload)
1538 # concatenated with 0x00000001. 16-byte aligned pointer. */
1539 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1540 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1541 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1543 # Valid values are 16 (most likely), 12 or 8. */
1544 ###############################################################################
1545 ENTRY(aesni_gcm_dec_avx_gen2)
1548 ENDPROC(aesni_gcm_dec_avx_gen2)
1549 #endif /* CONFIG_AS_AVX */
1551 #ifdef CONFIG_AS_AVX2
1552 ###############################################################################
1553 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1554 # Input: A and B (128-bits each, bit-reflected)
1555 # Output: C = A*B*x mod poly, (i.e. >>1 )
1556 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1557 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1558 ###############################################################################
1559 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1561 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1562 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1563 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1564 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1568 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1569 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1574 #######################################################################
1575 #first phase of the reduction
1576 vmovdqa POLY2(%rip), \T3
1578 vpclmulqdq $0x01, \GH, \T3, \T2
1579 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1581 vpxor \T2, \GH, \GH # first phase of the reduction complete
1582 #######################################################################
1583 #second phase of the reduction
1584 vpclmulqdq $0x00, \GH, \T3, \T2
1585 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1587 vpclmulqdq $0x10, \GH, \T3, \GH
1588 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1590 vpxor \T2, \GH, \GH # second phase of the reduction complete
1591 #######################################################################
1592 vpxor \T1, \GH, \GH # the result is in GH
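
# Note: compared with GHASH_MUL_AVX above, this variant computes all four
# 64x64 products (a1*b1, a0*b0, a1*b0, a0*b1) directly instead of using the
# Karatsuba split, and does the reduction with two carry-less multiplications
# against POLY2 instead of the shift-and-xor sequence.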
1597 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# precompute HashKey^2 through HashKey^8, each <<1 mod poly (the AVX2 path
# does not store the Karatsuba helper HashKey_i_k values)
1601 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1602 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1604 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1605 vmovdqa \T5, HashKey_3(arg1)
1607 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1608 vmovdqa \T5, HashKey_4(arg1)
1610 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1611 vmovdqa \T5, HashKey_5(arg1)
1613 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1614 vmovdqa \T5, HashKey_6(arg1)
1616 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1617 vmovdqa \T5, HashKey_7(arg1)
1619 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1620 vmovdqa \T5, HashKey_8(arg1)
## if a = number of total plaintext bytes
## b = floor(a/16) = number of complete 16-byte blocks
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg2, arg3, r14 are used as pointers only, not modified
1632 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1633 i = (8-\num_initial_blocks)
1636 mov arg6, %r10 # r10 = AAD
1637 mov arg7, %r12 # r12 = aadLen
1642 vpxor reg_i, reg_i, reg_i
1645 vpslldq $12, \T1, \T1
1646 vpsrldq $4, reg_i, reg_i
1647 vpxor \T1, reg_i, reg_i
1655 je _get_AAD_loop2_done\@
1659 vpsrldq $4, reg_i, reg_i
1664 _get_AAD_loop2_done\@:
1666 #byte-reflect the AAD data
1667 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1669 # initialize the data pointer offset as zero
1672 # start AES for num_initial_blocks blocks
1673 mov arg5, %rax # rax = *Y0
1674 vmovdqu (%rax), \CTR # CTR = Y0
1675 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1678 i = (9-\num_initial_blocks)
1680 .rep \num_initial_blocks
1681 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1683 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1688 vmovdqa (arg1), \T_key
1689 i = (9-\num_initial_blocks)
1691 .rep \num_initial_blocks
1692 vpxor \T_key, reg_i, reg_i
1700 vmovdqa 16*j(arg1), \T_key
1701 i = (9-\num_initial_blocks)
1703 .rep \num_initial_blocks
1704 vaesenc \T_key, reg_i, reg_i
1714 vmovdqa 16*10(arg1), \T_key
1715 i = (9-\num_initial_blocks)
1717 .rep \num_initial_blocks
1718 vaesenclast \T_key, reg_i, reg_i
1723 i = (9-\num_initial_blocks)
1725 .rep \num_initial_blocks
1726 vmovdqu (arg3, %r11), \T1
1727 vpxor \T1, reg_i, reg_i
1728 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1729 # num_initial_blocks blocks
1734 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1740 i = (8-\num_initial_blocks)
1741 j = (9-\num_initial_blocks)
1743 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
1745 .rep \num_initial_blocks
1746 vpxor reg_i, reg_j, reg_j
1747 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1752 # XMM8 has the combined result here
1754 vmovdqa \XMM8, TMP1(%rsp)
1758 jl _initial_blocks_done\@ # no need for precomputed constants
1760 ###############################################################################
# prepare 8 counter blocks (XMM1-XMM8) and encrypt them to process the
# first 8 blocks of the main loop
1762 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1764 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1766 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1768 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1770 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1772 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1774 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1776 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1778 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1780 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1782 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1784 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1786 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1788 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1790 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1792 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1794 vmovdqa (arg1), \T_key
1795 vpxor \T_key, \XMM1, \XMM1
1796 vpxor \T_key, \XMM2, \XMM2
1797 vpxor \T_key, \XMM3, \XMM3
1798 vpxor \T_key, \XMM4, \XMM4
1799 vpxor \T_key, \XMM5, \XMM5
1800 vpxor \T_key, \XMM6, \XMM6
1801 vpxor \T_key, \XMM7, \XMM7
1802 vpxor \T_key, \XMM8, \XMM8
1806 .rep 9 # do 9 rounds
1807 vmovdqa 16*i(arg1), \T_key
1808 vaesenc \T_key, \XMM1, \XMM1
1809 vaesenc \T_key, \XMM2, \XMM2
1810 vaesenc \T_key, \XMM3, \XMM3
1811 vaesenc \T_key, \XMM4, \XMM4
1812 vaesenc \T_key, \XMM5, \XMM5
1813 vaesenc \T_key, \XMM6, \XMM6
1814 vaesenc \T_key, \XMM7, \XMM7
1815 vaesenc \T_key, \XMM8, \XMM8
1821 vmovdqa 16*i(arg1), \T_key
1822 vaesenclast \T_key, \XMM1, \XMM1
1823 vaesenclast \T_key, \XMM2, \XMM2
1824 vaesenclast \T_key, \XMM3, \XMM3
1825 vaesenclast \T_key, \XMM4, \XMM4
1826 vaesenclast \T_key, \XMM5, \XMM5
1827 vaesenclast \T_key, \XMM6, \XMM6
1828 vaesenclast \T_key, \XMM7, \XMM7
1829 vaesenclast \T_key, \XMM8, \XMM8
1831 vmovdqu (arg3, %r11), \T1
1832 vpxor \T1, \XMM1, \XMM1
1833 vmovdqu \XMM1, (arg2 , %r11)
1838 vmovdqu 16*1(arg3, %r11), \T1
1839 vpxor \T1, \XMM2, \XMM2
1840 vmovdqu \XMM2, 16*1(arg2 , %r11)
1845 vmovdqu 16*2(arg3, %r11), \T1
1846 vpxor \T1, \XMM3, \XMM3
1847 vmovdqu \XMM3, 16*2(arg2 , %r11)
1852 vmovdqu 16*3(arg3, %r11), \T1
1853 vpxor \T1, \XMM4, \XMM4
1854 vmovdqu \XMM4, 16*3(arg2 , %r11)
1859 vmovdqu 16*4(arg3, %r11), \T1
1860 vpxor \T1, \XMM5, \XMM5
1861 vmovdqu \XMM5, 16*4(arg2 , %r11)
1866 vmovdqu 16*5(arg3, %r11), \T1
1867 vpxor \T1, \XMM6, \XMM6
1868 vmovdqu \XMM6, 16*5(arg2 , %r11)
1873 vmovdqu 16*6(arg3, %r11), \T1
1874 vpxor \T1, \XMM7, \XMM7
1875 vmovdqu \XMM7, 16*6(arg2 , %r11)
1880 vmovdqu 16*7(arg3, %r11), \T1
1881 vpxor \T1, \XMM8, \XMM8
1882 vmovdqu \XMM8, 16*7(arg2 , %r11)
1889 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1890 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1891 # the corresponding ciphertext
1892 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1893 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1894 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1895 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1896 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1897 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1898 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1900 ###############################################################################
1902 _initial_blocks_done\@:
1909 # encrypt 8 blocks at a time
1910 # ghash the 8 previously encrypted ciphertext blocks
1911 # arg1, arg2, arg3 are used as pointers only, not modified
1912 # r11 is the data offset value
1913 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1916 vmovdqa \XMM2, TMP2(%rsp)
1917 vmovdqa \XMM3, TMP3(%rsp)
1918 vmovdqa \XMM4, TMP4(%rsp)
1919 vmovdqa \XMM5, TMP5(%rsp)
1920 vmovdqa \XMM6, TMP6(%rsp)
1921 vmovdqa \XMM7, TMP7(%rsp)
1922 vmovdqa \XMM8, TMP8(%rsp)
1924 .if \loop_idx == in_order
1925 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1926 vpaddd ONE(%rip), \XMM1, \XMM2
1927 vpaddd ONE(%rip), \XMM2, \XMM3
1928 vpaddd ONE(%rip), \XMM3, \XMM4
1929 vpaddd ONE(%rip), \XMM4, \XMM5
1930 vpaddd ONE(%rip), \XMM5, \XMM6
1931 vpaddd ONE(%rip), \XMM6, \XMM7
1932 vpaddd ONE(%rip), \XMM7, \XMM8
1935 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1936 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1937 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1938 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1939 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1940 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1941 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1942 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1944 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1945 vpaddd ONEf(%rip), \XMM1, \XMM2
1946 vpaddd ONEf(%rip), \XMM2, \XMM3
1947 vpaddd ONEf(%rip), \XMM3, \XMM4
1948 vpaddd ONEf(%rip), \XMM4, \XMM5
1949 vpaddd ONEf(%rip), \XMM5, \XMM6
1950 vpaddd ONEf(%rip), \XMM6, \XMM7
1951 vpaddd ONEf(%rip), \XMM7, \XMM8
1956 #######################################################################
1959 vpxor \T1, \XMM1, \XMM1
1960 vpxor \T1, \XMM2, \XMM2
1961 vpxor \T1, \XMM3, \XMM3
1962 vpxor \T1, \XMM4, \XMM4
1963 vpxor \T1, \XMM5, \XMM5
1964 vpxor \T1, \XMM6, \XMM6
1965 vpxor \T1, \XMM7, \XMM7
1966 vpxor \T1, \XMM8, \XMM8
1968 #######################################################################
1974 vmovdqu 16*1(arg1), \T1
1975 vaesenc \T1, \XMM1, \XMM1
1976 vaesenc \T1, \XMM2, \XMM2
1977 vaesenc \T1, \XMM3, \XMM3
1978 vaesenc \T1, \XMM4, \XMM4
1979 vaesenc \T1, \XMM5, \XMM5
1980 vaesenc \T1, \XMM6, \XMM6
1981 vaesenc \T1, \XMM7, \XMM7
1982 vaesenc \T1, \XMM8, \XMM8
1984 vmovdqu 16*2(arg1), \T1
1985 vaesenc \T1, \XMM1, \XMM1
1986 vaesenc \T1, \XMM2, \XMM2
1987 vaesenc \T1, \XMM3, \XMM3
1988 vaesenc \T1, \XMM4, \XMM4
1989 vaesenc \T1, \XMM5, \XMM5
1990 vaesenc \T1, \XMM6, \XMM6
1991 vaesenc \T1, \XMM7, \XMM7
1992 vaesenc \T1, \XMM8, \XMM8
1995 #######################################################################
1997 vmovdqa HashKey_8(arg1), \T5
1998 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1999 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2000 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2001 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2004 vmovdqu 16*3(arg1), \T1
2005 vaesenc \T1, \XMM1, \XMM1
2006 vaesenc \T1, \XMM2, \XMM2
2007 vaesenc \T1, \XMM3, \XMM3
2008 vaesenc \T1, \XMM4, \XMM4
2009 vaesenc \T1, \XMM5, \XMM5
2010 vaesenc \T1, \XMM6, \XMM6
2011 vaesenc \T1, \XMM7, \XMM7
2012 vaesenc \T1, \XMM8, \XMM8
2014 vmovdqa TMP2(%rsp), \T1
2015 vmovdqa HashKey_7(arg1), \T5
2016 vpclmulqdq $0x11, \T5, \T1, \T3
2019 vpclmulqdq $0x00, \T5, \T1, \T3
2022 vpclmulqdq $0x01, \T5, \T1, \T3
2025 vpclmulqdq $0x10, \T5, \T1, \T3
2028 vmovdqu 16*4(arg1), \T1
2029 vaesenc \T1, \XMM1, \XMM1
2030 vaesenc \T1, \XMM2, \XMM2
2031 vaesenc \T1, \XMM3, \XMM3
2032 vaesenc \T1, \XMM4, \XMM4
2033 vaesenc \T1, \XMM5, \XMM5
2034 vaesenc \T1, \XMM6, \XMM6
2035 vaesenc \T1, \XMM7, \XMM7
2036 vaesenc \T1, \XMM8, \XMM8
2038 #######################################################################
2040 vmovdqa TMP3(%rsp), \T1
2041 vmovdqa HashKey_6(arg1), \T5
2042 vpclmulqdq $0x11, \T5, \T1, \T3
2045 vpclmulqdq $0x00, \T5, \T1, \T3
2048 vpclmulqdq $0x01, \T5, \T1, \T3
2051 vpclmulqdq $0x10, \T5, \T1, \T3
2054 vmovdqu 16*5(arg1), \T1
2055 vaesenc \T1, \XMM1, \XMM1
2056 vaesenc \T1, \XMM2, \XMM2
2057 vaesenc \T1, \XMM3, \XMM3
2058 vaesenc \T1, \XMM4, \XMM4
2059 vaesenc \T1, \XMM5, \XMM5
2060 vaesenc \T1, \XMM6, \XMM6
2061 vaesenc \T1, \XMM7, \XMM7
2062 vaesenc \T1, \XMM8, \XMM8
2064 vmovdqa TMP4(%rsp), \T1
2065 vmovdqa HashKey_5(arg1), \T5
2066 vpclmulqdq $0x11, \T5, \T1, \T3
2069 vpclmulqdq $0x00, \T5, \T1, \T3
2072 vpclmulqdq $0x01, \T5, \T1, \T3
2075 vpclmulqdq $0x10, \T5, \T1, \T3
2078 vmovdqu 16*6(arg1), \T1
2079 vaesenc \T1, \XMM1, \XMM1
2080 vaesenc \T1, \XMM2, \XMM2
2081 vaesenc \T1, \XMM3, \XMM3
2082 vaesenc \T1, \XMM4, \XMM4
2083 vaesenc \T1, \XMM5, \XMM5
2084 vaesenc \T1, \XMM6, \XMM6
2085 vaesenc \T1, \XMM7, \XMM7
2086 vaesenc \T1, \XMM8, \XMM8
2089 vmovdqa TMP5(%rsp), \T1
2090 vmovdqa HashKey_4(arg1), \T5
2091 vpclmulqdq $0x11, \T5, \T1, \T3
2094 vpclmulqdq $0x00, \T5, \T1, \T3
2097 vpclmulqdq $0x01, \T5, \T1, \T3
2100 vpclmulqdq $0x10, \T5, \T1, \T3
2103 vmovdqu 16*7(arg1), \T1
2104 vaesenc \T1, \XMM1, \XMM1
2105 vaesenc \T1, \XMM2, \XMM2
2106 vaesenc \T1, \XMM3, \XMM3
2107 vaesenc \T1, \XMM4, \XMM4
2108 vaesenc \T1, \XMM5, \XMM5
2109 vaesenc \T1, \XMM6, \XMM6
2110 vaesenc \T1, \XMM7, \XMM7
2111 vaesenc \T1, \XMM8, \XMM8
2113 vmovdqa TMP6(%rsp), \T1
2114 vmovdqa HashKey_3(arg1), \T5
2115 vpclmulqdq $0x11, \T5, \T1, \T3
2118 vpclmulqdq $0x00, \T5, \T1, \T3
2121 vpclmulqdq $0x01, \T5, \T1, \T3
2124 vpclmulqdq $0x10, \T5, \T1, \T3
2127 vmovdqu 16*8(arg1), \T1
2128 vaesenc \T1, \XMM1, \XMM1
2129 vaesenc \T1, \XMM2, \XMM2
2130 vaesenc \T1, \XMM3, \XMM3
2131 vaesenc \T1, \XMM4, \XMM4
2132 vaesenc \T1, \XMM5, \XMM5
2133 vaesenc \T1, \XMM6, \XMM6
2134 vaesenc \T1, \XMM7, \XMM7
2135 vaesenc \T1, \XMM8, \XMM8
2137 vmovdqa TMP7(%rsp), \T1
2138 vmovdqa HashKey_2(arg1), \T5
2139 vpclmulqdq $0x11, \T5, \T1, \T3
2142 vpclmulqdq $0x00, \T5, \T1, \T3
2145 vpclmulqdq $0x01, \T5, \T1, \T3
2148 vpclmulqdq $0x10, \T5, \T1, \T3
2152 #######################################################################
2154 vmovdqu 16*9(arg1), \T5
2155 vaesenc \T5, \XMM1, \XMM1
2156 vaesenc \T5, \XMM2, \XMM2
2157 vaesenc \T5, \XMM3, \XMM3
2158 vaesenc \T5, \XMM4, \XMM4
2159 vaesenc \T5, \XMM5, \XMM5
2160 vaesenc \T5, \XMM6, \XMM6
2161 vaesenc \T5, \XMM7, \XMM7
2162 vaesenc \T5, \XMM8, \XMM8
2164 vmovdqa TMP8(%rsp), \T1
2165 vmovdqa HashKey(arg1), \T5
2167 vpclmulqdq $0x00, \T5, \T1, \T3
2170 vpclmulqdq $0x01, \T5, \T1, \T3
2173 vpclmulqdq $0x10, \T5, \T1, \T3
2176 vpclmulqdq $0x11, \T5, \T1, \T3
2180 vmovdqu 16*10(arg1), \T5
2186 vpxor 16*i(arg3, %r11), \T5, \T2
2188 vaesenclast \T2, reg_j, reg_j
2190 vaesenclast \T2, reg_j, \T3
2191 vmovdqu 16*i(arg3, %r11), reg_j
2192 vmovdqu \T3, 16*i(arg2, %r11)
2198 #######################################################################
2201 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2204 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2208 #######################################################################
2209 #first phase of the reduction
2210 vmovdqa POLY2(%rip), \T3
2212 vpclmulqdq $0x01, \T7, \T3, \T2
2213 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2215 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2216 #######################################################################
2218 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
2219 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
2220 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
2221 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
2222 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
2223 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
2224 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
2225 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
2228 #######################################################################
2229 #second phase of the reduction
2230 vpclmulqdq $0x00, \T7, \T3, \T2
2231 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2233 vpclmulqdq $0x10, \T7, \T3, \T4
2234 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2236 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2237 #######################################################################
2238 vpxor \T4, \T1, \T1 # the result is in T1
2240 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2241 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2242 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2243 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2244 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2245 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2246 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2247 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2250 vpxor \T1, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        vmovdqa         HashKey_8(arg1), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqa         HashKey_7(arg1), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqa         HashKey_6(arg1), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqa         HashKey_5(arg1), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqa         HashKey_4(arg1), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqa         HashKey_3(arg1), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqa         HashKey_2(arg1), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqa         HashKey(arg1), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2

        vpslldq         $8, \T2, \T4
        vpsrldq         $8, \T2, \T2

        vpxor           \T2, \T6, \T6           # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################

        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4           # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6           # the result is in T6
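
        #######################################################################
        # Note: the two "phases" above reduce the 256-bit carry-less product
        # held in <T6:T7> modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1,
        # using two multiplications by the precomputed constant POLY2 rather
        # than a chain of shifts and XORs.  The same two-phase pattern appears
        # in GHASH_8_ENCRYPT_8_PARALLEL_AVX2 above.
        #######################################################################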
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
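#
# Rough control flow of GCM_ENC_DEC_AVX2 (see the labels below):
#   1. encrypt/GHASH 0-7 initial blocks so the remaining length is a multiple
#      of 8 blocks (the _initial_num_blocks_is_* dispatch)
#   2. process 8 blocks per iteration, AES-CTR interleaved with GHASH
#      (_encrypt_by_8_new / GHASH_8_ENCRYPT_8_PARALLEL_AVX2)
#   3. GHASH the final 8 full ciphertext blocks (GHASH_LAST_8_AVX2)
#   4. handle a trailing partial (<16 byte) block, if any
#   5. GHASH the length block len(A)||len(C) and form the tag
#      T = GHASH xor E(K, Y0), then copy it to auth_tag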
.macro GCM_ENC_DEC_AVX2 ENC_DEC

        #the number of pushes must equal STACK_OFFSET

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                      # align rsp to 64 bytes

        vmovdqu HashKey(arg1), %xmm13           # xmm13 = HashKey

        mov     arg4, %r13                      # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                      # r13 = r13 - (r13 mod 16)

        jz      _initial_num_blocks_is_0\@

        je      _initial_num_blocks_is_7\@
        je      _initial_num_blocks_is_6\@
        je      _initial_num_blocks_is_5\@
        je      _initial_num_blocks_is_4\@
        je      _initial_num_blocks_is_3\@
        je      _initial_num_blocks_is_2\@
        jmp     _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC

_initial_blocks_encrypted\@:
        je      _zero_cipher_left\@

        je      _eight_cipher_left\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_encrypt_by_8_new\@:
        GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_eight_cipher_left\@:
        GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8

_zero_cipher_left\@:
        jl      _only_less_than_16\@

        and     $15, %r13                       # r13 = (arg4 mod 16)

        je      _multiple_of_16_bytes\@
        # handle the last <16 Byte block separately

        vpaddd  ONE(%rip), %xmm9, %xmm9         # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK %xmm9              # E(K, Yn)

        vmovdqu (arg3, %r11), %xmm1             # receive the last <16 Byte block

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                      # adjust the shuffle mask pointer
                                                # to be able to shift 16-r13 bytes
                                                # (r13 is the number of bytes in plaintext mod 16)
        vmovdqu (%r12), %xmm2                   # get the appropriate shuffle mask
        vpshufb %xmm2, %xmm1, %xmm1             # shift right 16-r13 bytes
        jmp     _final_ghash_mul\@

_only_less_than_16\@:
        # check for 0 length
        and     $15, %r13                       # r13 = (arg4 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        vpaddd  ONE(%rip), %xmm9, %xmm9         # INCR CNT to get Yn
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        ENCRYPT_SINGLE_BLOCK %xmm9              # E(K, Yn)

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                      # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)

_get_last_16_byte_loop\@:
        movb    (arg3, %r11), %al
        movb    %al, TMP1(%rsp, %r11)
        jne     _get_last_16_byte_loop\@

        vmovdqu TMP1(%rsp), %xmm1
        vmovdqa %xmm1, %xmm2
        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14
        #GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14
        #GHASH computation for the last <16 Byte block
        GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9   # shuffle xmm9 back to output as ciphertext

        #############################

        jle     _less_than_8_bytes_left\@

        mov     %rax, (arg2, %r11)

        vpsrldq $8, %xmm9, %xmm9

_less_than_8_bytes_left\@:
        movb    %al, (arg2, %r11)

        jne     _less_than_8_bytes_left\@
        #############################
_multiple_of_16_bytes\@:
        mov     arg7, %r12                      # r12 = aadLen (number of bytes)
        shl     $3, %r12                        # convert into number of bits
        vmovd   %r12d, %xmm15                   # len(A) in xmm15

        shl     $3, arg4                        # len(C) in bits (*8)

        vpslldq $8, %xmm15, %xmm15              # xmm15 = len(A) || 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15           # xmm15 = len(A) || len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6     # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap

        mov     arg5, %rax                      # rax = *Y0
        vmovdqu (%rax), %xmm9                   # xmm9 = Y0

        ENCRYPT_SINGLE_BLOCK %xmm9              # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9
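        # xmm9 now holds the GCM authentication tag:
        # T = E(K, Y0) xor GHASH(H, AAD || C || len(A) || len(C))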
        mov     arg8, %r10                      # r10 = authTag
        mov     arg9, %r11                      # r11 = auth_tag_len

        jmp     _return_T_done\@

        vpsrldq $8, %xmm9, %xmm9

        jmp     _return_T_done\@

        vmovdqu %xmm9, (%r10)

#############################################################
#void   aesni_gcm_precomp_avx_gen4
#       (gcm_data *my_ctx_data,
#        u8 *hash_subkey); /* H, the Hash sub key input.
#                             Data starts on a 16-byte boundary. */
#############################################################
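# A minimal C-side sketch of how this routine might be declared and used
# (hedged: the context type is left opaque and the caller-side helper that
# computes H is assumed; only the symbol name and the argument order come
# from the prototype comment above):
#
#       struct gcm_data;                        /* opaque, 16-byte aligned context */
#       void aesni_gcm_precomp_avx_gen4(struct gcm_data *my_ctx_data,
#                                       u8 *hash_subkey);
#
#       /* hash_subkey holds H = AES-ENC(key, 0^128), computed by the caller;
#        * the routine derives HashKey<<1 mod poly and its higher powers
#        * (HashKey_2 .. HashKey_8) and stores them in the context. */
#       aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
#############################################################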
ENTRY(aesni_gcm_precomp_avx_gen4)
        #the number of pushes must equal STACK_OFFSET

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                      # align rsp to 64 bytes

        vmovdqu (arg2), %xmm6                   # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6
        vpsrlq  $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor    %xmm2, %xmm6, %xmm6

        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand   POLY(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm6, %xmm6             # xmm6 holds the HashKey<<1 mod poly
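        # The sequence above computes HashKey<<1 mod poly: the 128-bit key is
        # shifted left by one bit (vpsllq/vpsrlq plus the vpslldq/vpsrldq/vpor
        # fix-up carries the bit across the two 64-bit lanes), and the mask
        # built via TWOONE/vpcmpeqd selects POLY for the XOR whenever a bit was
        # shifted out of position 127, reducing the result modulo the GHASH
        # polynomial.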
        #######################################################################
        vmovdqa %xmm6, HashKey(arg1)            # store HashKey<<1 mod poly

        PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5

ENDPROC(aesni_gcm_precomp_avx_gen4)

###############################################################################
#void   aesni_gcm_enc_avx_gen4(
#        gcm_data *my_ctx_data,  /* aligned to 16 Bytes */
#        u8 *out,                /* Ciphertext output. Encrypt in-place is allowed. */
#        const u8 *in,           /* Plaintext input */
#        u64 plaintext_len,      /* Length of data in Bytes for encryption. */
#        u8 *iv,                 /* Pre-counter block j0: 4 byte salt
#                                   (from Security Association) concatenated with 8 byte
#                                   Initialisation Vector (from IPSec ESP Payload)
#                                   concatenated with 0x00000001. 16-byte aligned pointer. */
#        const u8 *aad,          /* Additional Authentication Data (AAD) */
#        u64 aad_len,            /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8 *auth_tag,           /* Authenticated Tag output. */
#        u64 auth_tag_len);      /* Authenticated Tag Length in bytes.
#                                   Valid values are 16 (most likely), 12 or 8. */
###############################################################################
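# A minimal C-side sketch of an encryption call (hedged: the buffer and
# variable names are illustrative; the symbol name, the argument order and the
# salt || IV || 0x00000001 counter-block layout come from the comment above):
#
#       u8 iv_block[16] __attribute__((aligned(16)));
#
#       memcpy(iv_block, salt, 4);              /* 4-byte salt from the SA */
#       memcpy(iv_block + 4, esp_iv, 8);        /* 8-byte IV from the ESP payload */
#       iv_block[12] = 0x00;
#       iv_block[13] = 0x00;
#       iv_block[14] = 0x00;
#       iv_block[15] = 0x01;                    /* trailing 0x00000001 */
#
#       aesni_gcm_enc_avx_gen4(ctx, ciphertext, plaintext, plaintext_len,
#                              iv_block, aad, aad_len, auth_tag, 16);
###############################################################################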
ENTRY(aesni_gcm_enc_avx_gen4)
        GCM_ENC_DEC_AVX2 ENC
ENDPROC(aesni_gcm_enc_avx_gen4)

###############################################################################
#void   aesni_gcm_dec_avx_gen4(
#        gcm_data *my_ctx_data,  /* aligned to 16 Bytes */
#        u8 *out,                /* Plaintext output. Decrypt in-place is allowed. */
#        const u8 *in,           /* Ciphertext input */
#        u64 plaintext_len,      /* Length of data in Bytes for decryption. */
#        u8 *iv,                 /* Pre-counter block j0: 4 byte salt
#                                   (from Security Association) concatenated with 8 byte
#                                   Initialisation Vector (from IPSec ESP Payload)
#                                   concatenated with 0x00000001. 16-byte aligned pointer. */
#        const u8 *aad,          /* Additional Authentication Data (AAD) */
#        u64 aad_len,            /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#        u8 *auth_tag,           /* Authenticated Tag output. */
#        u64 auth_tag_len);      /* Authenticated Tag Length in bytes.
#                                   Valid values are 16 (most likely), 12 or 8. */
###############################################################################
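# Note for callers of the decrypt path (a hedged sketch; the helper and
# variable names below are illustrative): the routine writes the tag it
# computed to auth_tag and does not verify it, so the caller must compare it
# against the received tag in constant time, e.g.:
#
#       u8 computed_tag[16];
#
#       aesni_gcm_dec_avx_gen4(ctx, plaintext, ciphertext, ciphertext_len,
#                              iv_block, aad, aad_len, computed_tag, 16);
#       if (crypto_memneq(computed_tag, received_tag, 16))
#               return -EBADMSG;                /* authentication failed */
###############################################################################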
ENTRY(aesni_gcm_dec_avx_gen4)
        GCM_ENC_DEC_AVX2 DEC
ENDPROC(aesni_gcm_dec_avx_gen4)

#endif /* CONFIG_AS_AVX2 */