1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
## This code was derived and highly optimized from the code described in the paper:
## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
## From the definition of the spec, aadLen can only be 8 or 12 bytes.
## The code additionally supports an aadLen of 16 bytes.
## From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
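## (This is the bit-reflected form of the GCM polynomial
## g(x) = x^128 + x^7 + x^2 + x + 1: reflecting x^k -> x^(128-k) maps
## {x^0, x^1, x^2, x^7, x^128} to {x^128, x^127, x^126, x^121, x^0}.
## The POLY constant below, 0xC2000000000000000000000000000001, encodes the
## x^127 + x^126 + x^121 + 1 terms; the x^128 term is implicit in the
## 128-bit reduction.)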
## Throughout the code, one-tab and two-tab indentations are used: one tab
## is for the GHASH part, two tabs are for the AES part.
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
# Constants in mergeable sections; the linker can reorder and merge them
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
# The order of these constants must not change:
# ALL_F must immediately follow SHIFT_MASK, and the all-zero constant must follow ALL_F
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
## Define the fields of the GCM AES context:
187 # u8 expanded_keys[16*11] store expanded keys
188 # u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
189 # u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
190 # u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
191 # u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
192 # u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
193 # u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
194 # u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
195 # u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
196 # u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
197 # u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
198 # u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
199 # u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
200 # u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
201 # u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
202 # u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
203 # u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
206 HashKey = 16*11 # store HashKey <<1 mod poly here
207 HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
208 HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
209 HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
210 HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
211 HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
212 HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
213 HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
214 HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
215 HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
216 HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
217 HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
218 HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
219 HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
220 HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
221 HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
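#
# A hedged C-style sketch (illustration only; not the kernel's actual type
# definition) of the context layout implied by the fields and offsets above:
#
#       struct gcm_data_sketch {
#               u8 expanded_keys[16 * 11];   /* AES round keys, offset 0          */
#               u8 shifted_hkey[8][16];      /* HashKey^i << 1 mod poly, at 16*11 */
#               u8 shifted_hkey_k[8][16];    /* XOR of halves (Karatsuba), 16*19  */
#       };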
229 #define arg7 STACK_OFFSET+8*1(%r14)
230 #define arg8 STACK_OFFSET+8*2(%r14)
231 #define arg9 STACK_OFFSET+8*3(%r14)
241 .macro define_reg r n
# need to push 4 registers onto the stack to account for STACK_OFFSET
255 TMP1 = 16*0 # Temporary storage for AAD
256 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
257 TMP3 = 16*2 # Temporary storage for AES State 3
258 TMP4 = 16*3 # Temporary storage for AES State 4
259 TMP5 = 16*4 # Temporary storage for AES State 5
260 TMP6 = 16*5 # Temporary storage for AES State 6
261 TMP7 = 16*6 # Temporary storage for AES State 7
262 TMP8 = 16*7 # Temporary storage for AES State 8
264 VARIABLE_OFFSET = 16*8
266 ################################
268 ################################
270 # Encryption of a single block
271 .macro ENCRYPT_SINGLE_BLOCK XMM0
272 vpxor (arg1), \XMM0, \XMM0
276 vaesenc 16*i(arg1), \XMM0, \XMM0
280 vaesenclast 16*10(arg1), \XMM0, \XMM0
284 ###############################################################################
285 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
286 # Input: A and B (128-bits each, bit-reflected)
287 # Output: C = A*B*x mod poly, (i.e. >>1 )
288 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
289 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
290 ###############################################################################
291 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
293 vpshufd $0b01001110, \GH, \T2
294 vpshufd $0b01001110, \HK, \T3
295 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
296 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
298 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
299 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
300 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
302 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
304 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
305 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
307 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
309 #first phase of the reduction
vpslld $31, \GH, \T2 # packed left shift << 31
vpslld $30, \GH, \T3 # packed left shift << 30
vpslld $25, \GH, \T4 # packed left shift << 25
314 vpxor \T3, \T2, \T2 # xor the shifted versions
317 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
319 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
320 vpxor \T2, \GH, \GH # first phase of the reduction complete
322 #second phase of the reduction
vpsrld $1, \GH, \T2 # packed right shift >> 1
vpsrld $2, \GH, \T3 # packed right shift >> 2
vpsrld $7, \GH, \T4 # packed right shift >> 7
327 vpxor \T3, \T2, \T2 # xor the shifted versions
332 vpxor \T1, \GH, \GH # the result is in GH
337 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds the XOR of the low and high parts of HashKey_i
342 vpshufd $0b01001110, \T5, \T1
344 vmovdqa \T1, HashKey_k(arg1)
346 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
347 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
348 vpshufd $0b01001110, \T5, \T1
350 vmovdqa \T1, HashKey_2_k(arg1)
352 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
353 vmovdqa \T5, HashKey_3(arg1)
354 vpshufd $0b01001110, \T5, \T1
356 vmovdqa \T1, HashKey_3_k(arg1)
358 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
359 vmovdqa \T5, HashKey_4(arg1)
360 vpshufd $0b01001110, \T5, \T1
362 vmovdqa \T1, HashKey_4_k(arg1)
364 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
365 vmovdqa \T5, HashKey_5(arg1)
366 vpshufd $0b01001110, \T5, \T1
368 vmovdqa \T1, HashKey_5_k(arg1)
370 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
371 vmovdqa \T5, HashKey_6(arg1)
372 vpshufd $0b01001110, \T5, \T1
374 vmovdqa \T1, HashKey_6_k(arg1)
376 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
377 vmovdqa \T5, HashKey_7(arg1)
378 vpshufd $0b01001110, \T5, \T1
380 vmovdqa \T1, HashKey_7_k(arg1)
382 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
383 vmovdqa \T5, HashKey_8(arg1)
384 vpshufd $0b01001110, \T5, \T1
386 vmovdqa \T1, HashKey_8_k(arg1)
390 ## if a = number of total plaintext bytes
## num_initial_blocks = b mod 4
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
394 ## r10, r11, r12, rax are clobbered
395 ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
397 .macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
398 i = (8-\num_initial_blocks)
402 mov arg6, %r10 # r10 = AAD
403 mov arg7, %r12 # r12 = aadLen
408 vpxor reg_j, reg_j, reg_j
409 vpxor reg_i, reg_i, reg_i
413 vmovdqu (%r10), reg_i
414 vpshufb SHUF_MASK(%rip), reg_i, reg_i
415 vpxor reg_i, reg_j, reg_j
416 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
421 jge _get_AAD_blocks\@
426 vpxor reg_i, reg_i, reg_i
428 /* read the last <16B of AAD. since we have at least 4B of
429 data right after the AAD (the ICV, and maybe some CT), we can
430 read 4B/8B blocks safely, and then get rid of the extra stuff */
438 vpsrldq $8, reg_i, reg_i
439 vpxor \T1, reg_i, reg_i
448 vpslldq $12, \T1, \T1
449 vpsrldq $4, reg_i, reg_i
450 vpxor \T1, reg_i, reg_i
452 /* finalize: shift out the extra bytes we read, and align
453 left. since pslldq can only shift by an immediate, we use
454 vpshufb and an array of shuffle masks */
vmovdqu aad_shift_arr(%r11), \T1
458 vpshufb \T1, reg_i, reg_i
459 _get_AAD_rest_final\@:
460 vpshufb SHUF_MASK(%rip), reg_i, reg_i
461 vpxor reg_j, reg_i, reg_i
462 GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
# initialize the data pointer offset to zero
468 # start AES for num_initial_blocks blocks
469 mov arg5, %rax # rax = *Y0
470 vmovdqu (%rax), \CTR # CTR = Y0
471 vpshufb SHUF_MASK(%rip), \CTR, \CTR
474 i = (9-\num_initial_blocks)
476 .rep \num_initial_blocks
477 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
479 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
484 vmovdqa (arg1), \T_key
485 i = (9-\num_initial_blocks)
487 .rep \num_initial_blocks
488 vpxor \T_key, reg_i, reg_i
496 vmovdqa 16*j(arg1), \T_key
497 i = (9-\num_initial_blocks)
499 .rep \num_initial_blocks
500 vaesenc \T_key, reg_i, reg_i
510 vmovdqa 16*10(arg1), \T_key
511 i = (9-\num_initial_blocks)
513 .rep \num_initial_blocks
514 vaesenclast \T_key, reg_i, reg_i
519 i = (9-\num_initial_blocks)
521 .rep \num_initial_blocks
522 vmovdqu (arg3, %r11), \T1
523 vpxor \T1, reg_i, reg_i
524 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
529 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
535 i = (8-\num_initial_blocks)
536 j = (9-\num_initial_blocks)
539 .rep \num_initial_blocks
540 vpxor reg_i, reg_j, reg_j
541 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
546 # XMM8 has the combined result here
548 vmovdqa \XMM8, TMP1(%rsp)
552 jl _initial_blocks_done\@ # no need for precomputed constants
554 ###############################################################################
# HashKey_i_k holds the XOR of the low and high parts of HashKey_i
556 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
558 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
560 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
562 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
564 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
566 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
568 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
570 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
572 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
574 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
576 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
578 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
580 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
582 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
584 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
586 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
588 vmovdqa (arg1), \T_key
589 vpxor \T_key, \XMM1, \XMM1
590 vpxor \T_key, \XMM2, \XMM2
591 vpxor \T_key, \XMM3, \XMM3
592 vpxor \T_key, \XMM4, \XMM4
593 vpxor \T_key, \XMM5, \XMM5
594 vpxor \T_key, \XMM6, \XMM6
595 vpxor \T_key, \XMM7, \XMM7
596 vpxor \T_key, \XMM8, \XMM8
601 vmovdqa 16*i(arg1), \T_key
602 vaesenc \T_key, \XMM1, \XMM1
603 vaesenc \T_key, \XMM2, \XMM2
604 vaesenc \T_key, \XMM3, \XMM3
605 vaesenc \T_key, \XMM4, \XMM4
606 vaesenc \T_key, \XMM5, \XMM5
607 vaesenc \T_key, \XMM6, \XMM6
608 vaesenc \T_key, \XMM7, \XMM7
609 vaesenc \T_key, \XMM8, \XMM8
615 vmovdqa 16*i(arg1), \T_key
616 vaesenclast \T_key, \XMM1, \XMM1
617 vaesenclast \T_key, \XMM2, \XMM2
618 vaesenclast \T_key, \XMM3, \XMM3
619 vaesenclast \T_key, \XMM4, \XMM4
620 vaesenclast \T_key, \XMM5, \XMM5
621 vaesenclast \T_key, \XMM6, \XMM6
622 vaesenclast \T_key, \XMM7, \XMM7
623 vaesenclast \T_key, \XMM8, \XMM8
625 vmovdqu (arg3, %r11), \T1
626 vpxor \T1, \XMM1, \XMM1
627 vmovdqu \XMM1, (arg2 , %r11)
632 vmovdqu 16*1(arg3, %r11), \T1
633 vpxor \T1, \XMM2, \XMM2
634 vmovdqu \XMM2, 16*1(arg2 , %r11)
639 vmovdqu 16*2(arg3, %r11), \T1
640 vpxor \T1, \XMM3, \XMM3
641 vmovdqu \XMM3, 16*2(arg2 , %r11)
646 vmovdqu 16*3(arg3, %r11), \T1
647 vpxor \T1, \XMM4, \XMM4
648 vmovdqu \XMM4, 16*3(arg2 , %r11)
653 vmovdqu 16*4(arg3, %r11), \T1
654 vpxor \T1, \XMM5, \XMM5
655 vmovdqu \XMM5, 16*4(arg2 , %r11)
660 vmovdqu 16*5(arg3, %r11), \T1
661 vpxor \T1, \XMM6, \XMM6
662 vmovdqu \XMM6, 16*5(arg2 , %r11)
667 vmovdqu 16*6(arg3, %r11), \T1
668 vpxor \T1, \XMM7, \XMM7
669 vmovdqu \XMM7, 16*6(arg2 , %r11)
674 vmovdqu 16*7(arg3, %r11), \T1
675 vpxor \T1, \XMM8, \XMM8
676 vmovdqu \XMM8, 16*7(arg2 , %r11)
683 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
684 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
685 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
686 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
687 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
688 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
689 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
690 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
691 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
693 ###############################################################################
695 _initial_blocks_done\@:
699 # encrypt 8 blocks at a time
700 # ghash the 8 previously encrypted ciphertext blocks
701 # arg1, arg2, arg3 are used as pointers only, not modified
702 # r11 is the data offset value
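# The AES rounds for the 8 new counter blocks are interleaved with the GHASH
# (vpclmulqdq) of the 8 ciphertext blocks from the previous iteration, which
# are stashed in TMP2..TMP8 and a scratch register at the top of the macro.
# A rough pseudo-code view of one iteration:
#
#       save previous 8 ciphertext blocks; increment 8 counters
#       for each AES round key:
#               vaesenc it into XMM1..XMM8               # new blocks
#               vpclmulqdq a saved block with HashKey^i  # old blocks
#       vaesenclast, XOR with the input text, write output, reduce the GHASH sum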
703 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
706 vmovdqa \XMM2, TMP2(%rsp)
707 vmovdqa \XMM3, TMP3(%rsp)
708 vmovdqa \XMM4, TMP4(%rsp)
709 vmovdqa \XMM5, TMP5(%rsp)
710 vmovdqa \XMM6, TMP6(%rsp)
711 vmovdqa \XMM7, TMP7(%rsp)
712 vmovdqa \XMM8, TMP8(%rsp)
714 .if \loop_idx == in_order
715 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
716 vpaddd ONE(%rip), \XMM1, \XMM2
717 vpaddd ONE(%rip), \XMM2, \XMM3
718 vpaddd ONE(%rip), \XMM3, \XMM4
719 vpaddd ONE(%rip), \XMM4, \XMM5
720 vpaddd ONE(%rip), \XMM5, \XMM6
721 vpaddd ONE(%rip), \XMM6, \XMM7
722 vpaddd ONE(%rip), \XMM7, \XMM8
725 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
726 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
727 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
728 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
729 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
730 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
731 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
732 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
734 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
735 vpaddd ONEf(%rip), \XMM1, \XMM2
736 vpaddd ONEf(%rip), \XMM2, \XMM3
737 vpaddd ONEf(%rip), \XMM3, \XMM4
738 vpaddd ONEf(%rip), \XMM4, \XMM5
739 vpaddd ONEf(%rip), \XMM5, \XMM6
740 vpaddd ONEf(%rip), \XMM6, \XMM7
741 vpaddd ONEf(%rip), \XMM7, \XMM8
746 #######################################################################
749 vpxor \T1, \XMM1, \XMM1
750 vpxor \T1, \XMM2, \XMM2
751 vpxor \T1, \XMM3, \XMM3
752 vpxor \T1, \XMM4, \XMM4
753 vpxor \T1, \XMM5, \XMM5
754 vpxor \T1, \XMM6, \XMM6
755 vpxor \T1, \XMM7, \XMM7
756 vpxor \T1, \XMM8, \XMM8
758 #######################################################################
764 vmovdqu 16*1(arg1), \T1
765 vaesenc \T1, \XMM1, \XMM1
766 vaesenc \T1, \XMM2, \XMM2
767 vaesenc \T1, \XMM3, \XMM3
768 vaesenc \T1, \XMM4, \XMM4
769 vaesenc \T1, \XMM5, \XMM5
770 vaesenc \T1, \XMM6, \XMM6
771 vaesenc \T1, \XMM7, \XMM7
772 vaesenc \T1, \XMM8, \XMM8
774 vmovdqu 16*2(arg1), \T1
775 vaesenc \T1, \XMM1, \XMM1
776 vaesenc \T1, \XMM2, \XMM2
777 vaesenc \T1, \XMM3, \XMM3
778 vaesenc \T1, \XMM4, \XMM4
779 vaesenc \T1, \XMM5, \XMM5
780 vaesenc \T1, \XMM6, \XMM6
781 vaesenc \T1, \XMM7, \XMM7
782 vaesenc \T1, \XMM8, \XMM8
785 #######################################################################
787 vmovdqa HashKey_8(arg1), \T5
788 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
789 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
791 vpshufd $0b01001110, \T2, \T6
794 vmovdqa HashKey_8_k(arg1), \T5
795 vpclmulqdq $0x00, \T5, \T6, \T6
797 vmovdqu 16*3(arg1), \T1
798 vaesenc \T1, \XMM1, \XMM1
799 vaesenc \T1, \XMM2, \XMM2
800 vaesenc \T1, \XMM3, \XMM3
801 vaesenc \T1, \XMM4, \XMM4
802 vaesenc \T1, \XMM5, \XMM5
803 vaesenc \T1, \XMM6, \XMM6
804 vaesenc \T1, \XMM7, \XMM7
805 vaesenc \T1, \XMM8, \XMM8
807 vmovdqa TMP2(%rsp), \T1
808 vmovdqa HashKey_7(arg1), \T5
809 vpclmulqdq $0x11, \T5, \T1, \T3
811 vpclmulqdq $0x00, \T5, \T1, \T3
814 vpshufd $0b01001110, \T1, \T3
816 vmovdqa HashKey_7_k(arg1), \T5
817 vpclmulqdq $0x10, \T5, \T3, \T3
820 vmovdqu 16*4(arg1), \T1
821 vaesenc \T1, \XMM1, \XMM1
822 vaesenc \T1, \XMM2, \XMM2
823 vaesenc \T1, \XMM3, \XMM3
824 vaesenc \T1, \XMM4, \XMM4
825 vaesenc \T1, \XMM5, \XMM5
826 vaesenc \T1, \XMM6, \XMM6
827 vaesenc \T1, \XMM7, \XMM7
828 vaesenc \T1, \XMM8, \XMM8
830 #######################################################################
832 vmovdqa TMP3(%rsp), \T1
833 vmovdqa HashKey_6(arg1), \T5
834 vpclmulqdq $0x11, \T5, \T1, \T3
836 vpclmulqdq $0x00, \T5, \T1, \T3
839 vpshufd $0b01001110, \T1, \T3
841 vmovdqa HashKey_6_k(arg1), \T5
842 vpclmulqdq $0x10, \T5, \T3, \T3
845 vmovdqu 16*5(arg1), \T1
846 vaesenc \T1, \XMM1, \XMM1
847 vaesenc \T1, \XMM2, \XMM2
848 vaesenc \T1, \XMM3, \XMM3
849 vaesenc \T1, \XMM4, \XMM4
850 vaesenc \T1, \XMM5, \XMM5
851 vaesenc \T1, \XMM6, \XMM6
852 vaesenc \T1, \XMM7, \XMM7
853 vaesenc \T1, \XMM8, \XMM8
855 vmovdqa TMP4(%rsp), \T1
856 vmovdqa HashKey_5(arg1), \T5
857 vpclmulqdq $0x11, \T5, \T1, \T3
859 vpclmulqdq $0x00, \T5, \T1, \T3
862 vpshufd $0b01001110, \T1, \T3
864 vmovdqa HashKey_5_k(arg1), \T5
865 vpclmulqdq $0x10, \T5, \T3, \T3
868 vmovdqu 16*6(arg1), \T1
869 vaesenc \T1, \XMM1, \XMM1
870 vaesenc \T1, \XMM2, \XMM2
871 vaesenc \T1, \XMM3, \XMM3
872 vaesenc \T1, \XMM4, \XMM4
873 vaesenc \T1, \XMM5, \XMM5
874 vaesenc \T1, \XMM6, \XMM6
875 vaesenc \T1, \XMM7, \XMM7
876 vaesenc \T1, \XMM8, \XMM8
879 vmovdqa TMP5(%rsp), \T1
880 vmovdqa HashKey_4(arg1), \T5
881 vpclmulqdq $0x11, \T5, \T1, \T3
883 vpclmulqdq $0x00, \T5, \T1, \T3
886 vpshufd $0b01001110, \T1, \T3
888 vmovdqa HashKey_4_k(arg1), \T5
889 vpclmulqdq $0x10, \T5, \T3, \T3
892 vmovdqu 16*7(arg1), \T1
893 vaesenc \T1, \XMM1, \XMM1
894 vaesenc \T1, \XMM2, \XMM2
895 vaesenc \T1, \XMM3, \XMM3
896 vaesenc \T1, \XMM4, \XMM4
897 vaesenc \T1, \XMM5, \XMM5
898 vaesenc \T1, \XMM6, \XMM6
899 vaesenc \T1, \XMM7, \XMM7
900 vaesenc \T1, \XMM8, \XMM8
902 vmovdqa TMP6(%rsp), \T1
903 vmovdqa HashKey_3(arg1), \T5
904 vpclmulqdq $0x11, \T5, \T1, \T3
906 vpclmulqdq $0x00, \T5, \T1, \T3
909 vpshufd $0b01001110, \T1, \T3
911 vmovdqa HashKey_3_k(arg1), \T5
912 vpclmulqdq $0x10, \T5, \T3, \T3
916 vmovdqu 16*8(arg1), \T1
917 vaesenc \T1, \XMM1, \XMM1
918 vaesenc \T1, \XMM2, \XMM2
919 vaesenc \T1, \XMM3, \XMM3
920 vaesenc \T1, \XMM4, \XMM4
921 vaesenc \T1, \XMM5, \XMM5
922 vaesenc \T1, \XMM6, \XMM6
923 vaesenc \T1, \XMM7, \XMM7
924 vaesenc \T1, \XMM8, \XMM8
926 vmovdqa TMP7(%rsp), \T1
927 vmovdqa HashKey_2(arg1), \T5
928 vpclmulqdq $0x11, \T5, \T1, \T3
930 vpclmulqdq $0x00, \T5, \T1, \T3
933 vpshufd $0b01001110, \T1, \T3
935 vmovdqa HashKey_2_k(arg1), \T5
936 vpclmulqdq $0x10, \T5, \T3, \T3
939 #######################################################################
941 vmovdqu 16*9(arg1), \T5
942 vaesenc \T5, \XMM1, \XMM1
943 vaesenc \T5, \XMM2, \XMM2
944 vaesenc \T5, \XMM3, \XMM3
945 vaesenc \T5, \XMM4, \XMM4
946 vaesenc \T5, \XMM5, \XMM5
947 vaesenc \T5, \XMM6, \XMM6
948 vaesenc \T5, \XMM7, \XMM7
949 vaesenc \T5, \XMM8, \XMM8
951 vmovdqa TMP8(%rsp), \T1
952 vmovdqa HashKey(arg1), \T5
953 vpclmulqdq $0x11, \T5, \T1, \T3
955 vpclmulqdq $0x00, \T5, \T1, \T3
958 vpshufd $0b01001110, \T1, \T3
960 vmovdqa HashKey_k(arg1), \T5
961 vpclmulqdq $0x10, \T5, \T3, \T3
967 vmovdqu 16*10(arg1), \T5
973 vpxor 16*i(arg3, %r11), \T5, \T2
975 vaesenclast \T2, reg_j, reg_j
977 vaesenclast \T2, reg_j, \T3
978 vmovdqu 16*i(arg3, %r11), reg_j
979 vmovdqu \T3, 16*i(arg2, %r11)
985 #######################################################################
988 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
991 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
995 #######################################################################
996 #first phase of the reduction
997 #######################################################################
vpslld $31, \T7, \T2 # packed left shift << 31
vpslld $30, \T7, \T3 # packed left shift << 30
vpslld $25, \T7, \T4 # packed left shift << 25
1002 vpxor \T3, \T2, \T2 # xor the shifted versions
1005 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1007 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1008 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1009 #######################################################################
1011 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
1012 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
1013 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
1014 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
1015 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
1016 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
1017 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
1018 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
1021 #######################################################################
1022 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shift >> 1
vpsrld $2, \T7, \T3 # packed right shift >> 2
vpsrld $7, \T7, \T4 # packed right shift >> 7
1026 vpxor \T3, \T2, \T2 # xor the shifted versions
1031 vpxor \T7, \T6, \T6 # the result is in T6
1032 #######################################################################
1034 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1035 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1036 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1037 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1038 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1039 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1040 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1041 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1044 vpxor \T6, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
1052 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1057 vpshufd $0b01001110, \XMM1, \T2
1058 vpxor \XMM1, \T2, \T2
1059 vmovdqa HashKey_8(arg1), \T5
1060 vpclmulqdq $0x11, \T5, \XMM1, \T6
1061 vpclmulqdq $0x00, \T5, \XMM1, \T7
1063 vmovdqa HashKey_8_k(arg1), \T3
1064 vpclmulqdq $0x00, \T3, \T2, \XMM1
1066 ######################
1068 vpshufd $0b01001110, \XMM2, \T2
1069 vpxor \XMM2, \T2, \T2
1070 vmovdqa HashKey_7(arg1), \T5
1071 vpclmulqdq $0x11, \T5, \XMM2, \T4
1074 vpclmulqdq $0x00, \T5, \XMM2, \T4
1077 vmovdqa HashKey_7_k(arg1), \T3
1078 vpclmulqdq $0x00, \T3, \T2, \T2
1079 vpxor \T2, \XMM1, \XMM1
1081 ######################
1083 vpshufd $0b01001110, \XMM3, \T2
1084 vpxor \XMM3, \T2, \T2
1085 vmovdqa HashKey_6(arg1), \T5
1086 vpclmulqdq $0x11, \T5, \XMM3, \T4
1089 vpclmulqdq $0x00, \T5, \XMM3, \T4
1092 vmovdqa HashKey_6_k(arg1), \T3
1093 vpclmulqdq $0x00, \T3, \T2, \T2
1094 vpxor \T2, \XMM1, \XMM1
1096 ######################
1098 vpshufd $0b01001110, \XMM4, \T2
1099 vpxor \XMM4, \T2, \T2
1100 vmovdqa HashKey_5(arg1), \T5
1101 vpclmulqdq $0x11, \T5, \XMM4, \T4
1104 vpclmulqdq $0x00, \T5, \XMM4, \T4
1107 vmovdqa HashKey_5_k(arg1), \T3
1108 vpclmulqdq $0x00, \T3, \T2, \T2
1109 vpxor \T2, \XMM1, \XMM1
1111 ######################
1113 vpshufd $0b01001110, \XMM5, \T2
1114 vpxor \XMM5, \T2, \T2
1115 vmovdqa HashKey_4(arg1), \T5
1116 vpclmulqdq $0x11, \T5, \XMM5, \T4
1119 vpclmulqdq $0x00, \T5, \XMM5, \T4
1122 vmovdqa HashKey_4_k(arg1), \T3
1123 vpclmulqdq $0x00, \T3, \T2, \T2
1124 vpxor \T2, \XMM1, \XMM1
1126 ######################
1128 vpshufd $0b01001110, \XMM6, \T2
1129 vpxor \XMM6, \T2, \T2
1130 vmovdqa HashKey_3(arg1), \T5
1131 vpclmulqdq $0x11, \T5, \XMM6, \T4
1134 vpclmulqdq $0x00, \T5, \XMM6, \T4
1137 vmovdqa HashKey_3_k(arg1), \T3
1138 vpclmulqdq $0x00, \T3, \T2, \T2
1139 vpxor \T2, \XMM1, \XMM1
1141 ######################
1143 vpshufd $0b01001110, \XMM7, \T2
1144 vpxor \XMM7, \T2, \T2
1145 vmovdqa HashKey_2(arg1), \T5
1146 vpclmulqdq $0x11, \T5, \XMM7, \T4
1149 vpclmulqdq $0x00, \T5, \XMM7, \T4
1152 vmovdqa HashKey_2_k(arg1), \T3
1153 vpclmulqdq $0x00, \T3, \T2, \T2
1154 vpxor \T2, \XMM1, \XMM1
1156 ######################
1158 vpshufd $0b01001110, \XMM8, \T2
1159 vpxor \XMM8, \T2, \T2
1160 vmovdqa HashKey(arg1), \T5
1161 vpclmulqdq $0x11, \T5, \XMM8, \T4
1164 vpclmulqdq $0x00, \T5, \XMM8, \T4
1167 vmovdqa HashKey_k(arg1), \T3
1168 vpclmulqdq $0x00, \T3, \T2, \T2
1170 vpxor \T2, \XMM1, \XMM1
1171 vpxor \T6, \XMM1, \XMM1
1172 vpxor \T7, \XMM1, \T2
1177 vpslldq $8, \T2, \T4
1178 vpsrldq $8, \T2, \T2
1181 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1182 # the accumulated carry-less multiplications
1184 #######################################################################
1185 #first phase of the reduction
vpslld $31, \T7, \T2 # packed left shift << 31
vpslld $30, \T7, \T3 # packed left shift << 30
vpslld $25, \T7, \T4 # packed left shift << 25
1190 vpxor \T3, \T2, \T2 # xor the shifted versions
1193 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1195 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1196 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1197 #######################################################################
1200 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shift >> 1
vpsrld $2, \T7, \T3 # packed right shift >> 2
vpsrld $7, \T7, \T4 # packed right shift >> 7
1204 vpxor \T3, \T2, \T2 # xor the shifted versions
1209 vpxor \T7, \T6, \T6 # the result is in T6
1214 # combined for GCM encrypt and decrypt functions
1215 # clobbering all xmm registers
1216 # clobbering r10, r11, r12, r13, r14, r15
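# Rough flow of the macro (see the labels below):
#       1. GHASH the AAD and encrypt the initial 0-7 blocks
#          (_initial_num_blocks_is_*), leaving a multiple of 8 whole blocks
#       2. main loop, 8 blocks at a time (GHASH_8_ENCRYPT_8_PARALLEL_AVX)
#       3. GHASH the last 8 ciphertext blocks (GHASH_LAST_8_AVX)
#       4. handle a final partial block of <16 bytes (_zero_cipher_left)
#       5. GHASH len(A)||len(C), encrypt Y0 and emit the tag (_multiple_of_16_bytes)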
1217 .macro GCM_ENC_DEC_AVX ENC_DEC
# the number of pushes must match STACK_OFFSET (8 bytes per push)
1230 sub $VARIABLE_OFFSET, %rsp
1231 and $~63, %rsp # align rsp to 64 bytes
1234 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
1236 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
1237 and $-16, %r13 # r13 = r13 - (r13 mod 16)
1242 jz _initial_num_blocks_is_0\@
1245 je _initial_num_blocks_is_7\@
1247 je _initial_num_blocks_is_6\@
1249 je _initial_num_blocks_is_5\@
1251 je _initial_num_blocks_is_4\@
1253 je _initial_num_blocks_is_3\@
1255 je _initial_num_blocks_is_2\@
1257 jmp _initial_num_blocks_is_1\@
1259 _initial_num_blocks_is_7\@:
1260 INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1262 jmp _initial_blocks_encrypted\@
1264 _initial_num_blocks_is_6\@:
1265 INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1267 jmp _initial_blocks_encrypted\@
1269 _initial_num_blocks_is_5\@:
1270 INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1272 jmp _initial_blocks_encrypted\@
1274 _initial_num_blocks_is_4\@:
1275 INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1277 jmp _initial_blocks_encrypted\@
1279 _initial_num_blocks_is_3\@:
1280 INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1282 jmp _initial_blocks_encrypted\@
1284 _initial_num_blocks_is_2\@:
1285 INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1287 jmp _initial_blocks_encrypted\@
1289 _initial_num_blocks_is_1\@:
1290 INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1292 jmp _initial_blocks_encrypted\@
1294 _initial_num_blocks_is_0\@:
1295 INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
1298 _initial_blocks_encrypted\@:
1300 je _zero_cipher_left\@
1303 je _eight_cipher_left\@
1310 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1313 _encrypt_by_8_new\@:
1320 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
1323 jne _encrypt_by_8_new\@
1325 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1326 jmp _eight_cipher_left\@
1329 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1331 GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
1332 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1335 jne _encrypt_by_8_new\@
1337 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1342 _eight_cipher_left\@:
1343 GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
1346 _zero_cipher_left\@:
1348 jl _only_less_than_16\@
1351 and $15, %r13 # r13 = (arg4 mod 16)
1353 je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
1358 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1359 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1360 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1364 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
1366 lea SHIFT_MASK+16(%rip), %r12
1367 sub %r13, %r12 # adjust the shuffle mask pointer to be
1368 # able to shift 16-r13 bytes (r13 is the
1369 # number of bytes in plaintext mod 16)
1370 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
1371 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
1372 jmp _final_ghash_mul\@
1374 _only_less_than_16\@:
1375 # check for 0 length
1377 and $15, %r13 # r13 = (arg4 mod 16)
1379 je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
1384 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
1385 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1386 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
1389 lea SHIFT_MASK+16(%rip), %r12
1390 sub %r13, %r12 # adjust the shuffle mask pointer to be
1391 # able to shift 16-r13 bytes (r13 is the
1392 # number of bytes in plaintext mod 16)
1394 _get_last_16_byte_loop\@:
1395 movb (arg3, %r11), %al
1396 movb %al, TMP1 (%rsp , %r11)
1399 jne _get_last_16_byte_loop\@
1401 vmovdqu TMP1(%rsp), %xmm1
1407 vmovdqa %xmm1, %xmm2
1408 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1409 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1410 # mask out top 16-r13 bytes of xmm9
1411 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1412 vpand %xmm1, %xmm2, %xmm2
1413 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
1414 vpxor %xmm2, %xmm14, %xmm14
1415 #GHASH computation for the last <16 Byte block
1416 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1420 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
1421 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
1422 # mask out top 16-r13 bytes of xmm9
1423 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
1424 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
1425 vpxor %xmm9, %xmm14, %xmm14
1426 #GHASH computation for the last <16 Byte block
1427 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
1430 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
1434 #############################
1438 jle _less_than_8_bytes_left\@
1440 mov %rax, (arg2 , %r11)
1442 vpsrldq $8, %xmm9, %xmm9
1446 _less_than_8_bytes_left\@:
1447 movb %al, (arg2 , %r11)
1451 jne _less_than_8_bytes_left\@
1452 #############################
1454 _multiple_of_16_bytes\@:
1455 mov arg7, %r12 # r12 = aadLen (number of bytes)
1456 shl $3, %r12 # convert into number of bits
1457 vmovd %r12d, %xmm15 # len(A) in xmm15
shl $3, arg4 # len(C) in bits (*8)
1461 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
1462 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
1464 vpxor %xmm15, %xmm14, %xmm14
1465 GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
1466 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
1468 mov arg5, %rax # rax = *Y0
1469 vmovdqu (%rax), %xmm9 # xmm9 = Y0
1471 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
1473 vpxor %xmm14, %xmm9, %xmm9
1478 mov arg8, %r10 # r10 = authTag
1479 mov arg9, %r11 # r11 = auth_tag_len
1492 vpsrldq $8, %xmm9, %xmm9
1500 vpsrldq $4, %xmm9, %xmm9
1514 jmp _return_T_done\@
1517 vmovdqu %xmm9, (%r10)
1529 #############################################################
1530 #void aesni_gcm_precomp_avx_gen2
1531 # (gcm_data *my_ctx_data,
# u8 *hash_subkey); /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1533 #############################################################
1534 ENTRY(aesni_gcm_precomp_avx_gen2)
# the number of pushes must match STACK_OFFSET (8 bytes per push)
1545 sub $VARIABLE_OFFSET, %rsp
1546 and $~63, %rsp # align rsp to 64 bytes
1548 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
1550 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
1551 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
1552 vmovdqa %xmm6, %xmm2
1553 vpsllq $1, %xmm6, %xmm6
1554 vpsrlq $63, %xmm2, %xmm2
1555 vmovdqa %xmm2, %xmm1
1556 vpslldq $8, %xmm2, %xmm2
1557 vpsrldq $8, %xmm1, %xmm1
1558 vpor %xmm2, %xmm6, %xmm6
1560 vpshufd $0b00100100, %xmm1, %xmm2
1561 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
1562 vpand POLY(%rip), %xmm2, %xmm2
1563 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
1564 #######################################################################
1565 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
1568 PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
1577 ENDPROC(aesni_gcm_precomp_avx_gen2)
1579 ###############################################################################
1580 #void aesni_gcm_enc_avx_gen2(
1581 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1582 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1583 # const u8 *in, /* Plaintext input */
1584 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
1585 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1586 # (from Security Association) concatenated with 8 byte
1587 # Initialisation Vector (from IPSec ESP Payload)
1588 # concatenated with 0x00000001. 16-byte aligned pointer. */
1589 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1590 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1591 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1593 # Valid values are 16 (most likely), 12 or 8. */
1594 ###############################################################################
1595 ENTRY(aesni_gcm_enc_avx_gen2)
1598 ENDPROC(aesni_gcm_enc_avx_gen2)
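###############################################################################
# A hedged usage sketch in C (illustration only; in the kernel these entry
# points are reached through the aesni-intel glue code, and the expanded AES
# round keys are assumed to already be in the context):
#
#       struct gcm_data ctx __attribute__((aligned(16)));
#       /* ... expanded_keys already populated ... */
#       aesni_gcm_precomp_avx_gen2(&ctx, hash_subkey);
#       aesni_gcm_enc_avx_gen2(&ctx, out, in, plaintext_len,
#                              iv,              /* salt||IV||0x00000001     */
#                              aad, aad_len,    /* 8, 12 or 16 bytes        */
#                              auth_tag, auth_tag_len /* 8, 12 or 16 bytes */);
###############################################################################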
1600 ###############################################################################
1601 #void aesni_gcm_dec_avx_gen2(
1602 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1603 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1604 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len, /* Length of data in Bytes for decryption. */
1606 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1607 # (from Security Association) concatenated with 8 byte
1608 # Initialisation Vector (from IPSec ESP Payload)
1609 # concatenated with 0x00000001. 16-byte aligned pointer. */
1610 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1611 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1612 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len); /* Authenticated Tag Length in bytes.
1614 # Valid values are 16 (most likely), 12 or 8. */
1615 ###############################################################################
1616 ENTRY(aesni_gcm_dec_avx_gen2)
1619 ENDPROC(aesni_gcm_dec_avx_gen2)
1620 #endif /* CONFIG_AS_AVX */
1622 #ifdef CONFIG_AS_AVX2
1623 ###############################################################################
1624 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1625 # Input: A and B (128-bits each, bit-reflected)
1626 # Output: C = A*B*x mod poly, (i.e. >>1 )
1627 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1628 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1629 ###############################################################################
1630 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1632 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1633 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1634 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1635 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1639 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1640 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1645 #######################################################################
1646 #first phase of the reduction
1647 vmovdqa POLY2(%rip), \T3
1649 vpclmulqdq $0x01, \GH, \T3, \T2
1650 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1652 vpxor \T2, \GH, \GH # first phase of the reduction complete
1653 #######################################################################
1654 #second phase of the reduction
1655 vpclmulqdq $0x00, \GH, \T3, \T2
1656 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1658 vpclmulqdq $0x10, \GH, \T3, \GH
1659 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1661 vpxor \T2, \GH, \GH # second phase of the reduction complete
1662 #######################################################################
1663 vpxor \T1, \GH, \GH # the result is in GH
1668 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds the XOR of the low and high parts of HashKey_i
1672 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1673 vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
1675 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1676 vmovdqa \T5, HashKey_3(arg1)
1678 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1679 vmovdqa \T5, HashKey_4(arg1)
1681 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1682 vmovdqa \T5, HashKey_5(arg1)
1684 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1685 vmovdqa \T5, HashKey_6(arg1)
1687 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1688 vmovdqa \T5, HashKey_7(arg1)
1690 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1691 vmovdqa \T5, HashKey_8(arg1)
1696 ## if a = number of total plaintext bytes
## num_initial_blocks = b mod 4
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1700 ## r10, r11, r12, rax are clobbered
1701 ## arg1, arg2, arg3, r14 are used as a pointer only, not modified
1703 .macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1704 i = (8-\num_initial_blocks)
1708 mov arg6, %r10 # r10 = AAD
1709 mov arg7, %r12 # r12 = aadLen
1714 vpxor reg_j, reg_j, reg_j
1715 vpxor reg_i, reg_i, reg_i
1720 vmovdqu (%r10), reg_i
1721 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1722 vpxor reg_i, reg_j, reg_j
1723 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
1728 jge _get_AAD_blocks\@
1729 vmovdqu reg_j, reg_i
1733 vpxor reg_i, reg_i, reg_i
1735 /* read the last <16B of AAD. since we have at least 4B of
1736 data right after the AAD (the ICV, and maybe some CT), we can
1737 read 4B/8B blocks safely, and then get rid of the extra stuff */
1740 jle _get_AAD_rest4\@
1744 vpslldq $8, \T1, \T1
1745 vpsrldq $8, reg_i, reg_i
1746 vpxor \T1, reg_i, reg_i
1747 jmp _get_AAD_rest8\@
1750 jle _get_AAD_rest0\@
1755 vpslldq $12, \T1, \T1
1756 vpsrldq $4, reg_i, reg_i
1757 vpxor \T1, reg_i, reg_i
1759 /* finalize: shift out the extra bytes we read, and align
1760 left. since pslldq can only shift by an immediate, we use
1761 vpshufb and an array of shuffle masks */
vmovdqu aad_shift_arr(%r11), \T1
1765 vpshufb \T1, reg_i, reg_i
1766 _get_AAD_rest_final\@:
1767 vpshufb SHUF_MASK(%rip), reg_i, reg_i
1768 vpxor reg_j, reg_i, reg_i
1769 GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
# initialize the data pointer offset to zero
1775 # start AES for num_initial_blocks blocks
1776 mov arg5, %rax # rax = *Y0
1777 vmovdqu (%rax), \CTR # CTR = Y0
1778 vpshufb SHUF_MASK(%rip), \CTR, \CTR
1781 i = (9-\num_initial_blocks)
1783 .rep \num_initial_blocks
1784 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1786 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1791 vmovdqa (arg1), \T_key
1792 i = (9-\num_initial_blocks)
1794 .rep \num_initial_blocks
1795 vpxor \T_key, reg_i, reg_i
1803 vmovdqa 16*j(arg1), \T_key
1804 i = (9-\num_initial_blocks)
1806 .rep \num_initial_blocks
1807 vaesenc \T_key, reg_i, reg_i
1817 vmovdqa 16*10(arg1), \T_key
1818 i = (9-\num_initial_blocks)
1820 .rep \num_initial_blocks
1821 vaesenclast \T_key, reg_i, reg_i
1826 i = (9-\num_initial_blocks)
1828 .rep \num_initial_blocks
1829 vmovdqu (arg3, %r11), \T1
1830 vpxor \T1, reg_i, reg_i
1831 vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
1832 # num_initial_blocks blocks
1837 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1843 i = (8-\num_initial_blocks)
1844 j = (9-\num_initial_blocks)
1847 .rep \num_initial_blocks
1848 vpxor reg_i, reg_j, reg_j
1849 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1854 # XMM8 has the combined result here
1856 vmovdqa \XMM8, TMP1(%rsp)
1860 jl _initial_blocks_done\@ # no need for precomputed constants
1862 ###############################################################################
# HashKey_i_k holds the XOR of the low and high parts of HashKey_i
1864 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1866 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1868 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1870 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1872 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1874 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1876 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1878 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1880 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1882 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1884 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1886 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1888 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1890 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1892 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1894 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1896 vmovdqa (arg1), \T_key
1897 vpxor \T_key, \XMM1, \XMM1
1898 vpxor \T_key, \XMM2, \XMM2
1899 vpxor \T_key, \XMM3, \XMM3
1900 vpxor \T_key, \XMM4, \XMM4
1901 vpxor \T_key, \XMM5, \XMM5
1902 vpxor \T_key, \XMM6, \XMM6
1903 vpxor \T_key, \XMM7, \XMM7
1904 vpxor \T_key, \XMM8, \XMM8
1908 .rep 9 # do 9 rounds
1909 vmovdqa 16*i(arg1), \T_key
1910 vaesenc \T_key, \XMM1, \XMM1
1911 vaesenc \T_key, \XMM2, \XMM2
1912 vaesenc \T_key, \XMM3, \XMM3
1913 vaesenc \T_key, \XMM4, \XMM4
1914 vaesenc \T_key, \XMM5, \XMM5
1915 vaesenc \T_key, \XMM6, \XMM6
1916 vaesenc \T_key, \XMM7, \XMM7
1917 vaesenc \T_key, \XMM8, \XMM8
1923 vmovdqa 16*i(arg1), \T_key
1924 vaesenclast \T_key, \XMM1, \XMM1
1925 vaesenclast \T_key, \XMM2, \XMM2
1926 vaesenclast \T_key, \XMM3, \XMM3
1927 vaesenclast \T_key, \XMM4, \XMM4
1928 vaesenclast \T_key, \XMM5, \XMM5
1929 vaesenclast \T_key, \XMM6, \XMM6
1930 vaesenclast \T_key, \XMM7, \XMM7
1931 vaesenclast \T_key, \XMM8, \XMM8
1933 vmovdqu (arg3, %r11), \T1
1934 vpxor \T1, \XMM1, \XMM1
1935 vmovdqu \XMM1, (arg2 , %r11)
1940 vmovdqu 16*1(arg3, %r11), \T1
1941 vpxor \T1, \XMM2, \XMM2
1942 vmovdqu \XMM2, 16*1(arg2 , %r11)
1947 vmovdqu 16*2(arg3, %r11), \T1
1948 vpxor \T1, \XMM3, \XMM3
1949 vmovdqu \XMM3, 16*2(arg2 , %r11)
1954 vmovdqu 16*3(arg3, %r11), \T1
1955 vpxor \T1, \XMM4, \XMM4
1956 vmovdqu \XMM4, 16*3(arg2 , %r11)
1961 vmovdqu 16*4(arg3, %r11), \T1
1962 vpxor \T1, \XMM5, \XMM5
1963 vmovdqu \XMM5, 16*4(arg2 , %r11)
1968 vmovdqu 16*5(arg3, %r11), \T1
1969 vpxor \T1, \XMM6, \XMM6
1970 vmovdqu \XMM6, 16*5(arg2 , %r11)
1975 vmovdqu 16*6(arg3, %r11), \T1
1976 vpxor \T1, \XMM7, \XMM7
1977 vmovdqu \XMM7, 16*6(arg2 , %r11)
1982 vmovdqu 16*7(arg3, %r11), \T1
1983 vpxor \T1, \XMM8, \XMM8
1984 vmovdqu \XMM8, 16*7(arg2 , %r11)
1991 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1992 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
1993 # the corresponding ciphertext
1994 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1995 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1996 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1997 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1998 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1999 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2000 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2002 ###############################################################################
2004 _initial_blocks_done\@:
2011 # encrypt 8 blocks at a time
2012 # ghash the 8 previously encrypted ciphertext blocks
2013 # arg1, arg2, arg3 are used as pointers only, not modified
2014 # r11 is the data offset value
2015 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2018 vmovdqa \XMM2, TMP2(%rsp)
2019 vmovdqa \XMM3, TMP3(%rsp)
2020 vmovdqa \XMM4, TMP4(%rsp)
2021 vmovdqa \XMM5, TMP5(%rsp)
2022 vmovdqa \XMM6, TMP6(%rsp)
2023 vmovdqa \XMM7, TMP7(%rsp)
2024 vmovdqa \XMM8, TMP8(%rsp)
2026 .if \loop_idx == in_order
2027 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2028 vpaddd ONE(%rip), \XMM1, \XMM2
2029 vpaddd ONE(%rip), \XMM2, \XMM3
2030 vpaddd ONE(%rip), \XMM3, \XMM4
2031 vpaddd ONE(%rip), \XMM4, \XMM5
2032 vpaddd ONE(%rip), \XMM5, \XMM6
2033 vpaddd ONE(%rip), \XMM6, \XMM7
2034 vpaddd ONE(%rip), \XMM7, \XMM8
2037 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2038 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2039 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2040 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2041 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2042 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2043 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2044 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2046 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2047 vpaddd ONEf(%rip), \XMM1, \XMM2
2048 vpaddd ONEf(%rip), \XMM2, \XMM3
2049 vpaddd ONEf(%rip), \XMM3, \XMM4
2050 vpaddd ONEf(%rip), \XMM4, \XMM5
2051 vpaddd ONEf(%rip), \XMM5, \XMM6
2052 vpaddd ONEf(%rip), \XMM6, \XMM7
2053 vpaddd ONEf(%rip), \XMM7, \XMM8
2058 #######################################################################
2061 vpxor \T1, \XMM1, \XMM1
2062 vpxor \T1, \XMM2, \XMM2
2063 vpxor \T1, \XMM3, \XMM3
2064 vpxor \T1, \XMM4, \XMM4
2065 vpxor \T1, \XMM5, \XMM5
2066 vpxor \T1, \XMM6, \XMM6
2067 vpxor \T1, \XMM7, \XMM7
2068 vpxor \T1, \XMM8, \XMM8
2070 #######################################################################
2076 vmovdqu 16*1(arg1), \T1
2077 vaesenc \T1, \XMM1, \XMM1
2078 vaesenc \T1, \XMM2, \XMM2
2079 vaesenc \T1, \XMM3, \XMM3
2080 vaesenc \T1, \XMM4, \XMM4
2081 vaesenc \T1, \XMM5, \XMM5
2082 vaesenc \T1, \XMM6, \XMM6
2083 vaesenc \T1, \XMM7, \XMM7
2084 vaesenc \T1, \XMM8, \XMM8
2086 vmovdqu 16*2(arg1), \T1
2087 vaesenc \T1, \XMM1, \XMM1
2088 vaesenc \T1, \XMM2, \XMM2
2089 vaesenc \T1, \XMM3, \XMM3
2090 vaesenc \T1, \XMM4, \XMM4
2091 vaesenc \T1, \XMM5, \XMM5
2092 vaesenc \T1, \XMM6, \XMM6
2093 vaesenc \T1, \XMM7, \XMM7
2094 vaesenc \T1, \XMM8, \XMM8
2097 #######################################################################
2099 vmovdqa HashKey_8(arg1), \T5
2100 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2101 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2102 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2103 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2106 vmovdqu 16*3(arg1), \T1
2107 vaesenc \T1, \XMM1, \XMM1
2108 vaesenc \T1, \XMM2, \XMM2
2109 vaesenc \T1, \XMM3, \XMM3
2110 vaesenc \T1, \XMM4, \XMM4
2111 vaesenc \T1, \XMM5, \XMM5
2112 vaesenc \T1, \XMM6, \XMM6
2113 vaesenc \T1, \XMM7, \XMM7
2114 vaesenc \T1, \XMM8, \XMM8
2116 vmovdqa TMP2(%rsp), \T1
2117 vmovdqa HashKey_7(arg1), \T5
2118 vpclmulqdq $0x11, \T5, \T1, \T3
2121 vpclmulqdq $0x00, \T5, \T1, \T3
2124 vpclmulqdq $0x01, \T5, \T1, \T3
2127 vpclmulqdq $0x10, \T5, \T1, \T3
2130 vmovdqu 16*4(arg1), \T1
2131 vaesenc \T1, \XMM1, \XMM1
2132 vaesenc \T1, \XMM2, \XMM2
2133 vaesenc \T1, \XMM3, \XMM3
2134 vaesenc \T1, \XMM4, \XMM4
2135 vaesenc \T1, \XMM5, \XMM5
2136 vaesenc \T1, \XMM6, \XMM6
2137 vaesenc \T1, \XMM7, \XMM7
2138 vaesenc \T1, \XMM8, \XMM8
2140 #######################################################################
2142 vmovdqa TMP3(%rsp), \T1
2143 vmovdqa HashKey_6(arg1), \T5
2144 vpclmulqdq $0x11, \T5, \T1, \T3
2147 vpclmulqdq $0x00, \T5, \T1, \T3
2150 vpclmulqdq $0x01, \T5, \T1, \T3
2153 vpclmulqdq $0x10, \T5, \T1, \T3
2156 vmovdqu 16*5(arg1), \T1
2157 vaesenc \T1, \XMM1, \XMM1
2158 vaesenc \T1, \XMM2, \XMM2
2159 vaesenc \T1, \XMM3, \XMM3
2160 vaesenc \T1, \XMM4, \XMM4
2161 vaesenc \T1, \XMM5, \XMM5
2162 vaesenc \T1, \XMM6, \XMM6
2163 vaesenc \T1, \XMM7, \XMM7
2164 vaesenc \T1, \XMM8, \XMM8
2166 vmovdqa TMP4(%rsp), \T1
2167 vmovdqa HashKey_5(arg1), \T5
2168 vpclmulqdq $0x11, \T5, \T1, \T3
2171 vpclmulqdq $0x00, \T5, \T1, \T3
2174 vpclmulqdq $0x01, \T5, \T1, \T3
2177 vpclmulqdq $0x10, \T5, \T1, \T3
2180 vmovdqu 16*6(arg1), \T1
2181 vaesenc \T1, \XMM1, \XMM1
2182 vaesenc \T1, \XMM2, \XMM2
2183 vaesenc \T1, \XMM3, \XMM3
2184 vaesenc \T1, \XMM4, \XMM4
2185 vaesenc \T1, \XMM5, \XMM5
2186 vaesenc \T1, \XMM6, \XMM6
2187 vaesenc \T1, \XMM7, \XMM7
2188 vaesenc \T1, \XMM8, \XMM8
2191 vmovdqa TMP5(%rsp), \T1
2192 vmovdqa HashKey_4(arg1), \T5
2193 vpclmulqdq $0x11, \T5, \T1, \T3
2196 vpclmulqdq $0x00, \T5, \T1, \T3
2199 vpclmulqdq $0x01, \T5, \T1, \T3
2202 vpclmulqdq $0x10, \T5, \T1, \T3
2205 vmovdqu 16*7(arg1), \T1
2206 vaesenc \T1, \XMM1, \XMM1
2207 vaesenc \T1, \XMM2, \XMM2
2208 vaesenc \T1, \XMM3, \XMM3
2209 vaesenc \T1, \XMM4, \XMM4
2210 vaesenc \T1, \XMM5, \XMM5
2211 vaesenc \T1, \XMM6, \XMM6
2212 vaesenc \T1, \XMM7, \XMM7
2213 vaesenc \T1, \XMM8, \XMM8
2215 vmovdqa TMP6(%rsp), \T1
2216 vmovdqa HashKey_3(arg1), \T5
2217 vpclmulqdq $0x11, \T5, \T1, \T3
2220 vpclmulqdq $0x00, \T5, \T1, \T3
2223 vpclmulqdq $0x01, \T5, \T1, \T3
2226 vpclmulqdq $0x10, \T5, \T1, \T3
2229 vmovdqu 16*8(arg1), \T1
2230 vaesenc \T1, \XMM1, \XMM1
2231 vaesenc \T1, \XMM2, \XMM2
2232 vaesenc \T1, \XMM3, \XMM3
2233 vaesenc \T1, \XMM4, \XMM4
2234 vaesenc \T1, \XMM5, \XMM5
2235 vaesenc \T1, \XMM6, \XMM6
2236 vaesenc \T1, \XMM7, \XMM7
2237 vaesenc \T1, \XMM8, \XMM8
2239 vmovdqa TMP7(%rsp), \T1
2240 vmovdqa HashKey_2(arg1), \T5
2241 vpclmulqdq $0x11, \T5, \T1, \T3
2244 vpclmulqdq $0x00, \T5, \T1, \T3
2247 vpclmulqdq $0x01, \T5, \T1, \T3
2250 vpclmulqdq $0x10, \T5, \T1, \T3
2254 #######################################################################
2256 vmovdqu 16*9(arg1), \T5
2257 vaesenc \T5, \XMM1, \XMM1
2258 vaesenc \T5, \XMM2, \XMM2
2259 vaesenc \T5, \XMM3, \XMM3
2260 vaesenc \T5, \XMM4, \XMM4
2261 vaesenc \T5, \XMM5, \XMM5
2262 vaesenc \T5, \XMM6, \XMM6
2263 vaesenc \T5, \XMM7, \XMM7
2264 vaesenc \T5, \XMM8, \XMM8
2266 vmovdqa TMP8(%rsp), \T1
2267 vmovdqa HashKey(arg1), \T5
2269 vpclmulqdq $0x00, \T5, \T1, \T3
2272 vpclmulqdq $0x01, \T5, \T1, \T3
2275 vpclmulqdq $0x10, \T5, \T1, \T3
2278 vpclmulqdq $0x11, \T5, \T1, \T3
2282 vmovdqu 16*10(arg1), \T5
2288 vpxor 16*i(arg3, %r11), \T5, \T2
2290 vaesenclast \T2, reg_j, reg_j
2292 vaesenclast \T2, reg_j, \T3
2293 vmovdqu 16*i(arg3, %r11), reg_j
2294 vmovdqu \T3, 16*i(arg2, %r11)
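# These last-round lines run once per block, with i counting 0..7 and
# reg_j naming XMM1..XMM8 in turn. T2 is the last round key XORed with
# the input block, so vaesenclast yields E(K,CTR) XOR input directly:
# for encryption the ciphertext stays in reg_j (it is also the next GHASH
# input), for decryption the plaintext leaves through T3 and the
# ciphertext block is reloaded into reg_j for GHASH.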
2300 #######################################################################
2303 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
2304 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2306 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2310 #######################################################################
2311 #first phase of the reduction
2312 vmovdqa POLY2(%rip), \T3
2314 vpclmulqdq $0x01, \T7, \T3, \T2
2315 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2317 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2318 #######################################################################
2320 vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
2321 vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
2322 vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
2323 vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
2324 vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
2325 vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
2326 vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
2327 vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
2330 #######################################################################
2331 #second phase of the reduction
2332 vpclmulqdq $0x00, \T7, \T3, \T2
2333 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2335 vpclmulqdq $0x10, \T7, \T3, \T4
2336 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2338 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2339 #######################################################################
2340 vpxor \T4, \T1, \T1 # the result is in T1
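# Two-phase reduction: POLY2 encodes (in the bit-reflected form used by
# this code) the GCM field polynomial x^128 + x^7 + x^2 + x + 1, and the
# pclmul/shift pairs above fold the low half T7 of the 256-bit product
# back into the high half, so the reduced GHASH accumulator ends up in T1
# and is folded into XMM1 below.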
2342 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2343 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2344 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2345 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2346 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2347 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2348 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2349 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2352 vpxor \T1, \XMM1, \XMM1
2359 # GHASH the last 8 ciphertext blocks.
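# Each block is multiplied by the matching power of the hash key (XMM1 by
# HashKey^8 down to XMM8 by HashKey) using a Karatsuba split: vpshufd and
# vpxor form (a1 xor a0) for the block and for the key, then three
# vpclmulqdq produce a1*b1, a0*b0 and (a1^a0)*(b1^b0). The high products
# accumulate in T6, the low products in T7 and the middle terms in XMM1;
# the two-phase reduction at the end brings the sum back to 128 bits in T6.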
2360 .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2364 vmovdqa HashKey_8(arg1), \T5
2366 vpshufd $0b01001110, \XMM1, \T2
2367 vpshufd $0b01001110, \T5, \T3
2368 vpxor \XMM1, \T2, \T2
2371 vpclmulqdq $0x11, \T5, \XMM1, \T6
2372 vpclmulqdq $0x00, \T5, \XMM1, \T7
2374 vpclmulqdq $0x00, \T3, \T2, \XMM1
2376 ######################
2378 vmovdqa HashKey_7(arg1), \T5
2379 vpshufd $0b01001110, \XMM2, \T2
2380 vpshufd $0b01001110, \T5, \T3
2381 vpxor \XMM2, \T2, \T2
2384 vpclmulqdq $0x11, \T5, \XMM2, \T4
2387 vpclmulqdq $0x00, \T5, \XMM2, \T4
2390 vpclmulqdq $0x00, \T3, \T2, \T2
2392 vpxor \T2, \XMM1, \XMM1
2394 ######################
2396 vmovdqa HashKey_6(arg1), \T5
2397 vpshufd $0b01001110, \XMM3, \T2
2398 vpshufd $0b01001110, \T5, \T3
2399 vpxor \XMM3, \T2, \T2
2402 vpclmulqdq $0x11, \T5, \XMM3, \T4
2405 vpclmulqdq $0x00, \T5, \XMM3, \T4
2408 vpclmulqdq $0x00, \T3, \T2, \T2
2410 vpxor \T2, \XMM1, \XMM1
2412 ######################
2414 vmovdqa HashKey_5(arg1), \T5
2415 vpshufd $0b01001110, \XMM4, \T2
2416 vpshufd $0b01001110, \T5, \T3
2417 vpxor \XMM4, \T2, \T2
2420 vpclmulqdq $0x11, \T5, \XMM4, \T4
2423 vpclmulqdq $0x00, \T5, \XMM4, \T4
2426 vpclmulqdq $0x00, \T3, \T2, \T2
2428 vpxor \T2, \XMM1, \XMM1
2430 ######################
2432 vmovdqa HashKey_4(arg1), \T5
2433 vpshufd $0b01001110, \XMM5, \T2
2434 vpshufd $0b01001110, \T5, \T3
2435 vpxor \XMM5, \T2, \T2
2438 vpclmulqdq $0x11, \T5, \XMM5, \T4
2441 vpclmulqdq $0x00, \T5, \XMM5, \T4
2444 vpclmulqdq $0x00, \T3, \T2, \T2
2446 vpxor \T2, \XMM1, \XMM1
2448 ######################
2450 vmovdqa HashKey_3(arg1), \T5
2451 vpshufd $0b01001110, \XMM6, \T2
2452 vpshufd $0b01001110, \T5, \T3
2453 vpxor \XMM6, \T2, \T2
2456 vpclmulqdq $0x11, \T5, \XMM6, \T4
2459 vpclmulqdq $0x00, \T5, \XMM6, \T4
2462 vpclmulqdq $0x00, \T3, \T2, \T2
2464 vpxor \T2, \XMM1, \XMM1
2466 ######################
2468 vmovdqa HashKey_2(arg1), \T5
2469 vpshufd $0b01001110, \XMM7, \T2
2470 vpshufd $0b01001110, \T5, \T3
2471 vpxor \XMM7, \T2, \T2
2474 vpclmulqdq $0x11, \T5, \XMM7, \T4
2477 vpclmulqdq $0x00, \T5, \XMM7, \T4
2480 vpclmulqdq $0x00, \T3, \T2, \T2
2482 vpxor \T2, \XMM1, \XMM1
2484 ######################
2486 vmovdqa HashKey(arg1), \T5
2487 vpshufd $0b01001110, \XMM8, \T2
2488 vpshufd $0b01001110, \T5, \T3
2489 vpxor \XMM8, \T2, \T2
2492 vpclmulqdq $0x11, \T5, \XMM8, \T4
2495 vpclmulqdq $0x00, \T5, \XMM8, \T4
2498 vpclmulqdq $0x00, \T3, \T2, \T2
2500 vpxor \T2, \XMM1, \XMM1
2501 vpxor \T6, \XMM1, \XMM1
2502 vpxor \T7, \XMM1, \T2
2507 vpslldq $8, \T2, \T4
2508 vpsrldq $8, \T2, \T2
2511 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2512 # accumulated carry-less multiplications
2514 #######################################################################
2515 #first phase of the reduction
2516 vmovdqa POLY2(%rip), \T3
2518 vpclmulqdq $0x01, \T7, \T3, \T2
2519 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2521 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2522 #######################################################################
2525 #second phase of the reduction
2526 vpclmulqdq $0x00, \T7, \T3, \T2
2527 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2529 vpclmulqdq $0x10, \T7, \T3, \T4
2530 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2532 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2533 #######################################################################
2534 vpxor \T4, \T6, \T6 # the result is in T6
2539 # combined routine for GCM encrypt and decrypt
2540 # clobbers all xmm registers
2541 # clobbers r10, r11, r12, r13, r14, r15
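#
# Overall flow:
#   1. align the stack and load HashKey<<1 mod poly into xmm13
#   2. handle the first (blocks mod 8) blocks with INITIAL_BLOCKS_AVX2 so
#      the main loop always works on full groups of 8
#   3. main loop: GHASH_8_ENCRYPT_8_PARALLEL_AVX2 interleaves AES-CTR on
#      the next 8 blocks with GHASH of the previous 8 ciphertext blocks
#   4. GHASH_LAST_8_AVX2 folds the final 8 ciphertext blocks into the hash
#   5. a trailing partial block (<16 bytes) is masked and handled on its own
#   6. the length block len(A)||len(C) is hashed and the tag
#      E(K,Y0) XOR GHASH is written out (8, 12 or 16 bytes per auth_tag_len)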
2542 .macro GCM_ENC_DEC_AVX2 ENC_DEC
2544 #the number of pushes must equal STACK_OFFSET
2555 sub $VARIABLE_OFFSET, %rsp
2556 and $~63, %rsp # align rsp to 64 bytes
2559 vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
2561 mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
2562 and $-16, %r13 # r13 = r13 - (r13 mod 16)
2567 jz _initial_num_blocks_is_0\@
2570 je _initial_num_blocks_is_7\@
2572 je _initial_num_blocks_is_6\@
2574 je _initial_num_blocks_is_5\@
2576 je _initial_num_blocks_is_4\@
2578 je _initial_num_blocks_is_3\@
2580 je _initial_num_blocks_is_2\@
2582 jmp _initial_num_blocks_is_1\@
2584 _initial_num_blocks_is_7\@:
2585 INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2587 jmp _initial_blocks_encrypted\@
2589 _initial_num_blocks_is_6\@:
2590 INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2592 jmp _initial_blocks_encrypted\@
2594 _initial_num_blocks_is_5\@:
2595 INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2597 jmp _initial_blocks_encrypted\@
2599 _initial_num_blocks_is_4\@:
2600 INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2602 jmp _initial_blocks_encrypted\@
2604 _initial_num_blocks_is_3\@:
2605 INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2607 jmp _initial_blocks_encrypted\@
2609 _initial_num_blocks_is_2\@:
2610 INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2612 jmp _initial_blocks_encrypted\@
2614 _initial_num_blocks_is_1\@:
2615 INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2617 jmp _initial_blocks_encrypted\@
2619 _initial_num_blocks_is_0\@:
2620 INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
2623 _initial_blocks_encrypted\@:
2625 je _zero_cipher_left\@
2628 je _eight_cipher_left\@
2635 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2638 _encrypt_by_8_new\@:
2645 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
2648 jne _encrypt_by_8_new\@
2650 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2651 jmp _eight_cipher_left\@
2654 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2656 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
2657 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2660 jne _encrypt_by_8_new\@
2662 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
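# The vpshufb of xmm9 around the loop are there because the parallel macro
# keeps its counters byte-swapped (out_order) and bumps them with ONEf;
# when eight increments could carry out of the counter's low byte, the
# in_order variant is used instead, which increments in normal order and
# re-swaps.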
2667 _eight_cipher_left\@:
2668 GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
2671 _zero_cipher_left\@:
2673 jl _only_less_than_16\@
2676 and $15, %r13 # r13 = (arg4 mod 16)
2678 je _multiple_of_16_bytes\@
2680 # handle the last <16 Byte block separately
2683 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2684 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2685 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2689 vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
2691 lea SHIFT_MASK+16(%rip), %r12
2692 sub %r13, %r12 # adjust the shuffle mask pointer
2693 # to be able to shift 16-r13 bytes
2694 # (r13 is the number of bytes in plaintext mod 16)
2695 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
2696 vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
2697 jmp _final_ghash_mul\@
2699 _only_less_than_16\@:
2700 # check for 0 length
2702 and $15, %r13 # r13 = (arg4 mod 16)
2704 je _multiple_of_16_bytes\@
2706 # handle the last <16 Byte block separately
2709 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
2710 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2711 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
2714 lea SHIFT_MASK+16(%rip), %r12
2715 sub %r13, %r12 # adjust the shuffle mask pointer to be
2716 # able to shift 16-r13 bytes (r13 is the
2717 # number of bytes in plaintext mod 16)
2719 _get_last_16_byte_loop\@:
2720 movb (arg3, %r11), %al
2721 movb %al, TMP1 (%rsp , %r11)
2724 jne _get_last_16_byte_loop\@
2726 vmovdqu TMP1(%rsp), %xmm1
2732 vmovdqa %xmm1, %xmm2
2733 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2734 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2735 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2736 vpand %xmm1, %xmm2, %xmm2
2737 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
2738 vpxor %xmm2, %xmm14, %xmm14
2739 #GHASH computation for the last <16 Byte block
2740 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2744 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
2745 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
2746 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
2747 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
2748 vpxor %xmm9, %xmm14, %xmm14
2749 #GHASH computation for the last <16 Byte block
2750 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
2753 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
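# Partial-block handling: the last r13 (<16) input bytes are brought into
# xmm1 (via the byte-copy loop through TMP1 when the whole message is
# shorter than 16 bytes, so nothing is read past the buffer), shifted into
# place with the SHIFT_MASK table and XORed with E(K,Yn). The ALL_F-based
# mask clears the unused top 16-r13 bytes so only real data is written out
# and GHASHed: for decryption the masked ciphertext (xmm2) is hashed, for
# encryption the masked output (xmm9) is.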
2757 #############################
2761 jle _less_than_8_bytes_left\@
2763 mov %rax, (arg2 , %r11)
2765 vpsrldq $8, %xmm9, %xmm9
2769 _less_than_8_bytes_left\@:
2770 movb %al, (arg2 , %r11)
2774 jne _less_than_8_bytes_left\@
2775 #############################
2777 _multiple_of_16_bytes\@:
2778 mov arg7, %r12 # r12 = aadLen (number of bytes)
2779 shl $3, %r12 # convert into number of bits
2780 vmovd %r12d, %xmm15 # len(A) in xmm15
2782 shl $3, arg4 # len(C) in bits (bytes * 8)
2784 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
2785 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
2787 vpxor %xmm15, %xmm14, %xmm14
2788 GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
2789 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
2791 mov arg5, %rax # rax = *Y0
2792 vmovdqu (%rax), %xmm9 # xmm9 = Y0
2794 ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
2796 vpxor %xmm14, %xmm9, %xmm9
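# xmm14 now holds GHASH(H, AAD || C || len(A)||len(C)) and xmm9 holds
# E(K, Y0); their XOR is the GCM authentication tag.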
2801 mov arg8, %r10 # r10 = authTag
2802 mov arg9, %r11 # r11 = auth_tag_len
2815 vpsrldq $8, %xmm9, %xmm9
2823 vpsrldq $4, %xmm9, %xmm9
2837 jmp _return_T_done\@
2840 vmovdqu %xmm9, (%r10)
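# Depending on auth_tag_len in r11, the branches above store 8, 12 or the
# full 16 bytes of the tag in xmm9 to the auth_tag buffer at r10.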
2852 #############################################################
2853 #void aesni_gcm_precomp_avx_gen4
2854 # (gcm_data *my_ctx_data,
2855 # u8 *hash_subkey)# /* H, the Hash sub key input.
2856 # Data starts on a 16-byte boundary. */
2857 #############################################################
2858 ENTRY(aesni_gcm_precomp_avx_gen4)
2859 #the number of pushes must equal STACK_OFFSET
2869 sub $VARIABLE_OFFSET, %rsp
2870 and $~63, %rsp # align rsp to 64 bytes
2872 vmovdqu (arg2), %xmm6 # xmm6 = HashKey
2874 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
2875 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
2876 vmovdqa %xmm6, %xmm2
2877 vpsllq $1, %xmm6, %xmm6
2878 vpsrlq $63, %xmm2, %xmm2
2879 vmovdqa %xmm2, %xmm1
2880 vpslldq $8, %xmm2, %xmm2
2881 vpsrldq $8, %xmm1, %xmm1
2882 vpor %xmm2, %xmm6, %xmm6
2884 vpshufd $0b00100100, %xmm1, %xmm2
2885 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
2886 vpand POLY(%rip), %xmm2, %xmm2
2887 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
2888 #######################################################################
2889 vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
2892 PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
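# The block above computes HashKey<<1 mod poly: the byte-swapped H is
# shifted left one bit and, when the bit shifted out was set, reduced by
# XORing in POLY, matching the reduction convention of the pclmul-based
# GHASH code. PRECOMPUTE_AVX2 then derives HashKey^2 .. HashKey^8 so the
# 8-block macros can use a different key power per block and reduce once.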
2901 ENDPROC(aesni_gcm_precomp_avx_gen4)
2904 ###############################################################################
2905 #void aesni_gcm_enc_avx_gen4(
2906 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2907 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2908 # const u8 *in, /* Plaintext input */
2909 # u64 plaintext_len, /* Length of data in Bytes for encryption. */
2910 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2911 # (from Security Association) concatenated with 8 byte
2912 # Initialisation Vector (from IPSec ESP Payload)
2913 # concatenated with 0x00000001. 16-byte aligned pointer. */
2914 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2915 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2916 # u8 *auth_tag, /* Authenticated Tag output. */
2917 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2918 # Valid values are 16 (most likely), 12 or 8. */
2919 ###############################################################################
2920 ENTRY(aesni_gcm_enc_avx_gen4)
2921 GCM_ENC_DEC_AVX2 ENC
2923 ENDPROC(aesni_gcm_enc_avx_gen4)
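###############################################################################
# Illustrative call sequence (a sketch only; parameter names follow the
# prototype comments above, and the AES key schedule plus the hash subkey
# H = E_K(0^128) are assumed to have been prepared by the caller):
#
#     u8 auth_tag[16];
#     aesni_gcm_precomp_avx_gen4(my_ctx_data, hash_subkey);   /* one-time */
#     aesni_gcm_enc_avx_gen4(my_ctx_data, out, in, plaintext_len,
#                            iv, aad, aad_len, auth_tag, 16);
###############################################################################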
2925 ###############################################################################
2926 #void aesni_gcm_dec_avx_gen4(
2927 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2928 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2929 # const u8 *in, /* Ciphertext input */
2930 # u64 plaintext_len, /* Length of data in Bytes for decryption. */
2931 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2932 # (from Security Association) concatenated with 8 byte
2933 # Initialisation Vector (from IPSec ESP Payload)
2934 # concatenated with 0x00000001. 16-byte aligned pointer. */
2935 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2936 # u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2937 # u8 *auth_tag, /* Authenticated Tag output. */
2938 # u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
2939 # Valid values are 16 (most likely), 12 or 8. */
2940 ###############################################################################
2941 ENTRY(aesni_gcm_dec_avx_gen4)
2942 GCM_ENC_DEC_AVX2 DEC
2944 ENDPROC(aesni_gcm_dec_avx_gen4)
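# Decryption is the same call shape with in pointing at the ciphertext and
# out at the plaintext buffer; the routine writes the computed tag to
# auth_tag and the caller is expected to compare it against the received
# tag.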
2946 #endif /* CONFIG_AS_AVX2 */