1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
27 # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
47 ## This code was derived and highly optimized from the code described in the paper:
48 ## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation are explained in:
51 ## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector; if AAD is 8 bytes, AAD = {A0, A1} and the
78 ## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2}
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
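##
## The padding described above amounts to the following minimal C sketch
## (illustration only, not part of the build; the helper name pad_aad_block
## is hypothetical):
##
##      #include <stdint.h>
##      #include <string.h>
##
##      /* copy aadLen (8, 12 or 16) bytes of AAD into a zeroed 16-byte block */
##      static void pad_aad_block(uint8_t block[16],
##                                const uint8_t *aad, uint64_t aad_len)
##      {
##              memset(block, 0, 16);           /* pad to 128 bits with 0   */
##              memcpy(block, aad, aad_len);    /* A0..A(n-1), rest stays 0 */
##      }
##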
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
118 ## throughout the code, one-tab and two-tab indentations are used. one tab is
119 ## for the GHASH part, two tabs are for the AES part.
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
187 #define InLen (16*1)+8
188 #define PBlockEncKey 16*2
190 #define CurCount 16*4
191 #define PBlockLen 16*5
193 HashKey = 16*6 # store HashKey <<1 mod poly here
194 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
195 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
196 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
197 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
198 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
199 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
200 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
201 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
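#
# For orientation only: the offsets above index into the gcm_context_data
# block passed in arg2. A hedged C sketch of a layout consistent with these
# offsets (field names are illustrative; the AadHash, AadLen and OrigIV
# offsets are defined elsewhere in this file):
#
#       struct gcm_context_data_sketch {
#               u8  aad_hash[16];               /* AadHash                  */
#               u64 aad_length;                 /* AadLen                   */
#               u64 in_length;                  /* InLen        = 16*1 + 8  */
#               u8  partial_block_enc_key[16];  /* PBlockEncKey = 16*2      */
#               u8  orig_IV[16];                /* OrigIV                   */
#               u8  current_counter[16];        /* CurCount     = 16*4      */
#               u64 partial_block_length;       /* PBlockLen    = 16*5      */
#               u64 unused;
#               u8  hash_keys[16 * 16];         /* HashKey..HashKey_8_k
#                                                  = 16*6 .. 16*21          */
#       };
#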
216 #define arg7 STACK_OFFSET+8*1(%r14)
217 #define arg8 STACK_OFFSET+8*2(%r14)
218 #define arg9 STACK_OFFSET+8*3(%r14)
219 #define arg10 STACK_OFFSET+8*4(%r14)
220 #define keysize 2*15*16(arg1)
230 .macro define_reg r n
241 # need to push 4 registers onto the stack to maintain STACK_OFFSET
244 TMP1 = 16*0 # Temporary storage for AAD
245 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246 TMP3 = 16*2 # Temporary storage for AES State 3
247 TMP4 = 16*3 # Temporary storage for AES State 4
248 TMP5 = 16*4 # Temporary storage for AES State 5
249 TMP6 = 16*5 # Temporary storage for AES State 6
250 TMP7 = 16*6 # Temporary storage for AES State 7
251 TMP8 = 16*7 # Temporary storage for AES State 8
253 VARIABLE_OFFSET = 16*8
255 ################################
257 ################################
260 #the number of pushes must equal STACK_OFFSET
270 sub $VARIABLE_OFFSET, %rsp
271 and $~63, %rsp # align rsp to 64 bytes
283 # Encryption of a single block
284 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
285 vpxor (arg1), \XMM0, \XMM0
289 vaesenc 16*i(arg1), \XMM0, \XMM0
293 vaesenclast 16*i(arg1), \XMM0, \XMM0
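#
# In C intrinsics terms, ENCRYPT_SINGLE_BLOCK is roughly the following
# (a sketch for illustration only; round_keys points at the expanded AES key
# in arg1 and nrounds corresponds to \REP, i.e. 9/11/13):
#
#       #include <immintrin.h>
#
#       static __m128i encrypt_single_block(const __m128i *round_keys,
#                                           int nrounds, __m128i block)
#       {
#               block = _mm_xor_si128(block, round_keys[0]);    /* whitening */
#               for (int i = 1; i <= nrounds; i++)
#                       block = _mm_aesenc_si128(block, round_keys[i]);
#               return _mm_aesenclast_si128(block, round_keys[nrounds + 1]);
#       }
#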
296 # combined for GCM encrypt and decrypt functions
297 # clobbering all xmm registers
298 # clobbering r10, r11, r12, r13, r14, r15
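# High-level flow: finish any partial block left over from a previous update
# call (PARTIAL_BLOCK), encrypt/decrypt the first (len/16) mod 8 blocks
# (INITIAL_BLOCKS), then process 8 blocks per iteration while GHASHing the 8
# ciphertext blocks of the previous iteration (GHASH_8_ENCRYPT_8_PARALLEL),
# fold in the last 8 GHASH values (GHASH_LAST_8), and finally handle a
# trailing block of fewer than 16 bytes.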
299 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300 vmovdqu AadHash(arg2), %xmm8
301 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
302 add arg5, InLen(arg2)
304 # initialize the data pointer offset as zero
307 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
310 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
311 and $-16, %r13 # r13 = r13 - (r13 mod 16)
316 jz _initial_num_blocks_is_0\@
319 je _initial_num_blocks_is_7\@
321 je _initial_num_blocks_is_6\@
323 je _initial_num_blocks_is_5\@
325 je _initial_num_blocks_is_4\@
327 je _initial_num_blocks_is_3\@
329 je _initial_num_blocks_is_2\@
331 jmp _initial_num_blocks_is_1\@
333 _initial_num_blocks_is_7\@:
334 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
336 jmp _initial_blocks_encrypted\@
338 _initial_num_blocks_is_6\@:
339 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
341 jmp _initial_blocks_encrypted\@
343 _initial_num_blocks_is_5\@:
344 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
346 jmp _initial_blocks_encrypted\@
348 _initial_num_blocks_is_4\@:
349 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
351 jmp _initial_blocks_encrypted\@
353 _initial_num_blocks_is_3\@:
354 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
356 jmp _initial_blocks_encrypted\@
358 _initial_num_blocks_is_2\@:
359 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
361 jmp _initial_blocks_encrypted\@
363 _initial_num_blocks_is_1\@:
364 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
366 jmp _initial_blocks_encrypted\@
368 _initial_num_blocks_is_0\@:
369 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
372 _initial_blocks_encrypted\@:
374 je _zero_cipher_left\@
377 je _eight_cipher_left\@
384 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
397 jne _encrypt_by_8_new\@
399 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400 jmp _eight_cipher_left\@
403 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
405 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
406 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
409 jne _encrypt_by_8_new\@
411 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
416 _eight_cipher_left\@:
417 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
421 vmovdqu %xmm14, AadHash(arg2)
422 vmovdqu %xmm9, CurCount(arg2)
426 and $15, %r13 # r13 = (arg5 mod 16)
428 je _multiple_of_16_bytes\@
430 # handle the last <16 Byte block separately
432 mov %r13, PBlockLen(arg2)
434 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
435 vmovdqu %xmm9, CurCount(arg2)
436 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
438 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
439 vmovdqu %xmm9, PBlockEncKey(arg2)
442 jge _large_enough_update\@
444 lea (arg4,%r11,1), %r10
447 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
449 lea SHIFT_MASK+16(%rip), %r12
450 sub %r13, %r12 # adjust the shuffle mask pointer to be
451 # able to shift 16-r13 bytes (r13 is the
452 # number of bytes in plaintext mod 16)
454 jmp _final_ghash_mul\@
456 _large_enough_update\@:
460 # receive the last <16 Byte block
461 vmovdqu (arg4, %r11, 1), %xmm1
466 lea SHIFT_MASK+16(%rip), %r12
467 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
468 # (r13 is the number of bytes in plaintext mod 16)
470 # get the appropriate shuffle mask
471 vmovdqu (%r12), %xmm2
472 # shift right 16-r13 bytes
473 vpshufb %xmm2, %xmm1, %xmm1
478 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
479 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
480 # mask out top 16-r13 bytes of xmm9
481 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
482 vpand %xmm1, %xmm2, %xmm2
483 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
484 vpxor %xmm2, %xmm14, %xmm14
486 vmovdqu %xmm14, AadHash(arg2)
488 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
489 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
490 # mask out top 16-r13 bytes of xmm9
491 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
492 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
493 vpxor %xmm9, %xmm14, %xmm14
495 vmovdqu %xmm14, AadHash(arg2)
496 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
500 #############################
504 jle _less_than_8_bytes_left\@
506 mov %rax, (arg3 , %r11)
508 vpsrldq $8, %xmm9, %xmm9
512 _less_than_8_bytes_left\@:
513 movb %al, (arg3 , %r11)
517 jne _less_than_8_bytes_left\@
518 #############################
520 _multiple_of_16_bytes\@:
524 # GCM_COMPLETE finishes the tag update for the last partial block
525 # Output: Authentication Tag (AUTH_TAG)
526 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
527 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
528 vmovdqu AadHash(arg2), %xmm14
529 vmovdqu HashKey(arg2), %xmm13
531 mov PBlockLen(arg2), %r12
535 #GHASH computation for the last <16 Byte block
536 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
539 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
540 shl $3, %r12 # convert into number of bits
541 vmovd %r12d, %xmm15 # len(A) in xmm15
543 mov InLen(arg2), %r12
544 shl $3, %r12 # len(C) in bits (*8)
546 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
547 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
549 vpxor %xmm15, %xmm14, %xmm14
550 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
551 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
553 vmovdqu OrigIV(arg2), %xmm9
555 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
557 vpxor %xmm14, %xmm9, %xmm9
562 mov \AUTH_TAG, %r10 # r10 = authTag
563 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
576 vpsrldq $8, %xmm9, %xmm9
584 vpsrldq $4, %xmm9, %xmm9
601 vmovdqu %xmm9, (%r10)
606 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
608 mov \AAD, %r10 # r10 = AAD
609 mov \AADLEN, %r12 # r12 = aadLen
620 vpshufb SHUF_MASK(%rip), \T7, \T7
622 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
627 jge _get_AAD_blocks\@
634 /* read the last <16B of AAD. since we have at least 4B of
635 data right after the AAD (the ICV, and maybe some CT), we can
636 read 4B/8B blocks safely, and then get rid of the extra stuff */
654 vpslldq $12, \T1, \T1
658 /* finalize: shift out the extra bytes we read, and align
659 left. since pslldq can only shift by an immediate, we use
660 vpshufb and an array of shuffle masks */
663 vmovdqu aad_shift_arr(%r11), \T1
664 vpshufb \T1, \T7, \T7
665 _get_AAD_rest_final\@:
666 vpshufb SHUF_MASK(%rip), \T7, \T7
668 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
671 vmovdqu \T7, AadHash(arg2)
674 .macro INIT GHASH_MUL PRECOMPUTE
676 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
678 mov %r11, InLen(arg2) # ctx_data.in_length = 0
680 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
681 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
684 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
686 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
687 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
689 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
691 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
692 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
694 vpsllq $1, %xmm6, %xmm6
695 vpsrlq $63, %xmm2, %xmm2
697 vpslldq $8, %xmm2, %xmm2
698 vpsrldq $8, %xmm1, %xmm1
699 vpor %xmm2, %xmm6, %xmm6
701 vpshufd $0b00100100, %xmm1, %xmm2
702 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
703 vpand POLY(%rip), %xmm2, %xmm2
704 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
705 #######################################################################
706 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
708 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
710 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
714 # Reads DLEN bytes starting at DPTR and stores in XMMDst
715 # where 0 < DLEN < 16
716 # Clobbers %rax, DLEN
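#
# A minimal C sketch of the effect of READ_PARTIAL_BLOCK (illustration only;
# the helper name read_partial_block is hypothetical). The assembly below gets
# the same result without ever touching memory past DPTR[DLEN-1]:
#
#       static void read_partial_block(uint8_t dst[16],
#                                      const uint8_t *dptr, unsigned int dlen)
#       {
#               memset(dst, 0, 16);        /* upper bytes end up zero        */
#               memcpy(dst, dptr, dlen);   /* 0 < dlen < 16 bytes from dptr  */
#       }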
717 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
718 vpxor \XMMDst, \XMMDst, \XMMDst
723 vpinsrq $0, %rax, \XMMDst, \XMMDst
725 jz _done_read_partial_block_\@
729 mov 7(\DPTR, \DLEN, 1), %al
731 jnz _read_next_byte_\@
732 vpinsrq $1, %rax, \XMMDst, \XMMDst
733 jmp _done_read_partial_block_\@
736 _read_next_byte_lt8_\@:
738 mov -1(\DPTR, \DLEN, 1), %al
740 jnz _read_next_byte_lt8_\@
741 vpinsrq $0, %rax, \XMMDst, \XMMDst
742 _done_read_partial_block_\@:
745 # PARTIAL_BLOCK: Handles encryption/decryption and the tag for partial blocks
746 # between update calls.
747 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
748 # Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
749 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
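#
# A rough C sketch of the length bookkeeping done below (illustration only;
# names are hypothetical, the real state lives in gcm_context_data):
#
#       #include <stdint.h>
#
#       static uint64_t partial_block_consume(uint64_t *pblock_len,
#                                             uint64_t data_len)
#       {
#               uint64_t used = *pblock_len;        /* bytes already buffered */
#               uint64_t take = 16 - used;          /* room left in the block */
#               if (take > data_len)
#                       take = data_len;
#               if (used + take == 16)
#                       *pblock_len = 0;            /* block complete: GHASH  */
#               else
#                       *pblock_len = used + take;  /* still a partial block  */
#               return take;                        /* bytes consumed now     */
#       }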
750 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
752 mov PBlockLen(arg2), %r13
754 je _partial_block_done_\@ # Leave Macro if no partial blocks
755 # Read in input data without over reading
756 cmp $16, \PLAIN_CYPH_LEN
757 jl _fewer_than_16_bytes_\@
758 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
761 _fewer_than_16_bytes_\@:
762 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
763 mov \PLAIN_CYPH_LEN, %r12
764 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
766 mov PBlockLen(arg2), %r13
768 _data_read_\@: # Finished reading in data
770 vmovdqu PBlockEncKey(arg2), %xmm9
771 vmovdqu HashKey(arg2), %xmm13
773 lea SHIFT_MASK(%rip), %r12
775 # adjust the shuffle mask pointer to be able to shift r13 bytes
776 # (r13 is the number of bytes already buffered in the partial block)
778 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
779 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
783 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
785 mov \PLAIN_CYPH_LEN, %r10
787 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
789 # Determine if the partial block is not being filled and
790 # shift the mask accordingly
791 jge _no_extra_mask_1_\@
795 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
796 # get the appropriate mask to mask out bottom r13 bytes of xmm9
797 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
799 vpand %xmm1, %xmm3, %xmm3
800 vmovdqa SHUF_MASK(%rip), %xmm10
801 vpshufb %xmm10, %xmm3, %xmm3
802 vpshufb %xmm2, %xmm3, %xmm3
803 vpxor %xmm3, \AAD_HASH, \AAD_HASH
806 jl _partial_incomplete_1_\@
808 # GHASH computation for the last <16 Byte block
809 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
812 mov %rax, PBlockLen(arg2)
814 _partial_incomplete_1_\@:
815 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
817 vmovdqu \AAD_HASH, AadHash(arg2)
819 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
821 mov \PLAIN_CYPH_LEN, %r10
823 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
825 # Determine if the partial block is not being filled and
826 # shift the mask accordingly
827 jge _no_extra_mask_2_\@
831 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
832 # get the appropriate mask to mask out bottom r13 bytes of xmm9
833 vpand %xmm1, %xmm9, %xmm9
835 vmovdqa SHUF_MASK(%rip), %xmm1
836 vpshufb %xmm1, %xmm9, %xmm9
837 vpshufb %xmm2, %xmm9, %xmm9
838 vpxor %xmm9, \AAD_HASH, \AAD_HASH
841 jl _partial_incomplete_2_\@
843 # GHASH computation for the last <16 Byte block
844 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
847 mov %rax, PBlockLen(arg2)
849 _partial_incomplete_2_\@:
850 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
852 vmovdqu \AAD_HASH, AadHash(arg2)
854 vmovdqa SHUF_MASK(%rip), %xmm10
855 # shuffle xmm9 back to output as ciphertext
856 vpshufb %xmm10, %xmm9, %xmm9
857 vpshufb %xmm2, %xmm9, %xmm9
859 # output encrypted Bytes
864 # Set r13 to be the number of bytes to write out
868 mov \PLAIN_CYPH_LEN, %r13
873 jle _less_than_8_bytes_left_\@
875 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
880 _less_than_8_bytes_left_\@:
881 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
885 jne _less_than_8_bytes_left_\@
886 _partial_block_done_\@:
887 .endm # PARTIAL_BLOCK
889 ###############################################################################
890 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
891 # Input: A and B (128-bits each, bit-reflected)
892 # Output: C = A*B*x mod poly, (i.e. >>1 )
893 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
894 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
895 ###############################################################################
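# The multiplication below uses the Karatsuba trick on the 64-bit halves: with
# A = a1*x^64 + a0 and B = b1*x^64 + b0 (carry-less, so + is XOR),
#       A*B = a1*b1*x^128 + ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 + a0*b0
# which needs three VPCLMULQDQ multiplications instead of four; the 256-bit
# product is then reduced modulo the GHASH polynomial in the two phases below.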
896 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
898 vpshufd $0b01001110, \GH, \T2
899 vpshufd $0b01001110, \HK, \T3
900 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
901 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
903 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
904 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
905 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
907 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
909 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
910 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
912 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
914 #first phase of the reduction
915 vpslld $31, \GH, \T2 # packed right shifting << 31
916 vpslld $30, \GH, \T3 # packed right shifting << 30
917 vpslld $25, \GH, \T4 # packed right shifting << 25
919 vpxor \T3, \T2, \T2 # xor the shifted versions
922 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
924 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
925 vpxor \T2, \GH, \GH # first phase of the reduction complete
927 #second phase of the reduction
929 vpsrld $1,\GH, \T2 # packed left shifting >> 1
930 vpsrld $2,\GH, \T3 # packed left shifting >> 2
931 vpsrld $7,\GH, \T4 # packed left shifting >> 7
932 vpxor \T3, \T2, \T2 # xor the shifted versions
937 vpxor \T1, \GH, \GH # the result is in GH
942 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
944 # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
947 vpshufd $0b01001110, \T5, \T1
949 vmovdqu \T1, HashKey_k(arg2)
951 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
952 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
953 vpshufd $0b01001110, \T5, \T1
955 vmovdqu \T1, HashKey_2_k(arg2)
957 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
958 vmovdqu \T5, HashKey_3(arg2)
959 vpshufd $0b01001110, \T5, \T1
961 vmovdqu \T1, HashKey_3_k(arg2)
963 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
964 vmovdqu \T5, HashKey_4(arg2)
965 vpshufd $0b01001110, \T5, \T1
967 vmovdqu \T1, HashKey_4_k(arg2)
969 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
970 vmovdqu \T5, HashKey_5(arg2)
971 vpshufd $0b01001110, \T5, \T1
973 vmovdqu \T1, HashKey_5_k(arg2)
975 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
976 vmovdqu \T5, HashKey_6(arg2)
977 vpshufd $0b01001110, \T5, \T1
979 vmovdqu \T1, HashKey_6_k(arg2)
981 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
982 vmovdqu \T5, HashKey_7(arg2)
983 vpshufd $0b01001110, \T5, \T1
985 vmovdqu \T1, HashKey_7_k(arg2)
987 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
988 vmovdqu \T5, HashKey_8(arg2)
989 vpshufd $0b01001110, \T5, \T1
991 vmovdqu \T1, HashKey_8_k(arg2)
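#
# In C terms, the precomputation above amounts to the following sketch
# (illustration only; gf128_mul stands for the GHASH_MUL_AVX operation and is
# a hypothetical name here):
#
#       /* hk[1] = HashKey<<1 mod poly is already stored in the context */
#       for (int i = 2; i <= 8; i++)
#               hk[i] = gf128_mul(hk[i - 1], hk[1]);  /* HashKey^i<<1 mod poly */
#       /* hk_k[i] = (high 64 bits of hk[i]) XOR (low 64 bits of hk[i]),
#          the Karatsuba helper built by the vpshufd/vpxor pairs above */
#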
995 ## if a = number of total plaintext bytes and b = floor(a/16), then
997 ## num_initial_blocks = b mod 8
998 ## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
999 ## r10, r11, r12, rax are clobbered
1000 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1002 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1003 i = (8-\num_initial_blocks)
1005 vmovdqu AadHash(arg2), reg_i
1007 # start AES for num_initial_blocks blocks
1008 vmovdqu CurCount(arg2), \CTR
1010 i = (9-\num_initial_blocks)
1012 .rep \num_initial_blocks
1013 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1015 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1020 vmovdqa (arg1), \T_key
1021 i = (9-\num_initial_blocks)
1023 .rep \num_initial_blocks
1024 vpxor \T_key, reg_i, reg_i
1032 vmovdqa 16*j(arg1), \T_key
1033 i = (9-\num_initial_blocks)
1035 .rep \num_initial_blocks
1036 vaesenc \T_key, reg_i, reg_i
1045 vmovdqa 16*j(arg1), \T_key
1046 i = (9-\num_initial_blocks)
1048 .rep \num_initial_blocks
1049 vaesenclast \T_key, reg_i, reg_i
1054 i = (9-\num_initial_blocks)
1056 .rep \num_initial_blocks
1057 vmovdqu (arg4, %r11), \T1
1058 vpxor \T1, reg_i, reg_i
1059 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1064 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1070 i = (8-\num_initial_blocks)
1071 j = (9-\num_initial_blocks)
1074 .rep \num_initial_blocks
1075 vpxor reg_i, reg_j, reg_j
1076 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1081 # XMM8 has the combined result here
1083 vmovdqa \XMM8, TMP1(%rsp)
1087 jl _initial_blocks_done\@ # no need for precomputed constants
1089 ###############################################################################
1090 # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
1091 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1093 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1095 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1097 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1099 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1101 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1103 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1105 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1107 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1109 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1111 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1113 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1115 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1117 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1119 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1121 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1123 vmovdqa (arg1), \T_key
1124 vpxor \T_key, \XMM1, \XMM1
1125 vpxor \T_key, \XMM2, \XMM2
1126 vpxor \T_key, \XMM3, \XMM3
1127 vpxor \T_key, \XMM4, \XMM4
1128 vpxor \T_key, \XMM5, \XMM5
1129 vpxor \T_key, \XMM6, \XMM6
1130 vpxor \T_key, \XMM7, \XMM7
1131 vpxor \T_key, \XMM8, \XMM8
1135 .rep \REP # do REP rounds
1136 vmovdqa 16*i(arg1), \T_key
1137 vaesenc \T_key, \XMM1, \XMM1
1138 vaesenc \T_key, \XMM2, \XMM2
1139 vaesenc \T_key, \XMM3, \XMM3
1140 vaesenc \T_key, \XMM4, \XMM4
1141 vaesenc \T_key, \XMM5, \XMM5
1142 vaesenc \T_key, \XMM6, \XMM6
1143 vaesenc \T_key, \XMM7, \XMM7
1144 vaesenc \T_key, \XMM8, \XMM8
1149 vmovdqa 16*i(arg1), \T_key
1150 vaesenclast \T_key, \XMM1, \XMM1
1151 vaesenclast \T_key, \XMM2, \XMM2
1152 vaesenclast \T_key, \XMM3, \XMM3
1153 vaesenclast \T_key, \XMM4, \XMM4
1154 vaesenclast \T_key, \XMM5, \XMM5
1155 vaesenclast \T_key, \XMM6, \XMM6
1156 vaesenclast \T_key, \XMM7, \XMM7
1157 vaesenclast \T_key, \XMM8, \XMM8
1159 vmovdqu (arg4, %r11), \T1
1160 vpxor \T1, \XMM1, \XMM1
1161 vmovdqu \XMM1, (arg3 , %r11)
1166 vmovdqu 16*1(arg4, %r11), \T1
1167 vpxor \T1, \XMM2, \XMM2
1168 vmovdqu \XMM2, 16*1(arg3 , %r11)
1173 vmovdqu 16*2(arg4, %r11), \T1
1174 vpxor \T1, \XMM3, \XMM3
1175 vmovdqu \XMM3, 16*2(arg3 , %r11)
1180 vmovdqu 16*3(arg4, %r11), \T1
1181 vpxor \T1, \XMM4, \XMM4
1182 vmovdqu \XMM4, 16*3(arg3 , %r11)
1187 vmovdqu 16*4(arg4, %r11), \T1
1188 vpxor \T1, \XMM5, \XMM5
1189 vmovdqu \XMM5, 16*4(arg3 , %r11)
1194 vmovdqu 16*5(arg4, %r11), \T1
1195 vpxor \T1, \XMM6, \XMM6
1196 vmovdqu \XMM6, 16*5(arg3 , %r11)
1201 vmovdqu 16*6(arg4, %r11), \T1
1202 vpxor \T1, \XMM7, \XMM7
1203 vmovdqu \XMM7, 16*6(arg3 , %r11)
1208 vmovdqu 16*7(arg4, %r11), \T1
1209 vpxor \T1, \XMM8, \XMM8
1210 vmovdqu \XMM8, 16*7(arg3 , %r11)
1217 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1218 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1219 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1220 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1221 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1222 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1223 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1224 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1225 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1227 ###############################################################################
1229 _initial_blocks_done\@:
1233 # encrypt 8 blocks at a time
1234 # ghash the 8 previously encrypted ciphertext blocks
1235 # arg1, arg3, arg4 are used as pointers only, not modified
1236 # r11 is the data offset value
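# The macro below software-pipelines the work: while the eight counter blocks
# of the current iteration run through the AES rounds, the eight ciphertext
# blocks saved from the previous iteration (TMP1..TMP8) are multiplied by
# HashKey^8..HashKey^1 and accumulated, so a single reduction covers all eight
# blocks. loop_idx selects whether the counters are incremented in native form
# and then byte-swapped (in_order, via ONE) or incremented directly in
# byte-swapped form (out_order, via ONEf).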
1237 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1240 vmovdqa \XMM2, TMP2(%rsp)
1241 vmovdqa \XMM3, TMP3(%rsp)
1242 vmovdqa \XMM4, TMP4(%rsp)
1243 vmovdqa \XMM5, TMP5(%rsp)
1244 vmovdqa \XMM6, TMP6(%rsp)
1245 vmovdqa \XMM7, TMP7(%rsp)
1246 vmovdqa \XMM8, TMP8(%rsp)
1248 .if \loop_idx == in_order
1249 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1250 vpaddd ONE(%rip), \XMM1, \XMM2
1251 vpaddd ONE(%rip), \XMM2, \XMM3
1252 vpaddd ONE(%rip), \XMM3, \XMM4
1253 vpaddd ONE(%rip), \XMM4, \XMM5
1254 vpaddd ONE(%rip), \XMM5, \XMM6
1255 vpaddd ONE(%rip), \XMM6, \XMM7
1256 vpaddd ONE(%rip), \XMM7, \XMM8
1259 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1260 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1261 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1262 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1263 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1264 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1265 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1266 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1268 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1269 vpaddd ONEf(%rip), \XMM1, \XMM2
1270 vpaddd ONEf(%rip), \XMM2, \XMM3
1271 vpaddd ONEf(%rip), \XMM3, \XMM4
1272 vpaddd ONEf(%rip), \XMM4, \XMM5
1273 vpaddd ONEf(%rip), \XMM5, \XMM6
1274 vpaddd ONEf(%rip), \XMM6, \XMM7
1275 vpaddd ONEf(%rip), \XMM7, \XMM8
1280 #######################################################################
1283 vpxor \T1, \XMM1, \XMM1
1284 vpxor \T1, \XMM2, \XMM2
1285 vpxor \T1, \XMM3, \XMM3
1286 vpxor \T1, \XMM4, \XMM4
1287 vpxor \T1, \XMM5, \XMM5
1288 vpxor \T1, \XMM6, \XMM6
1289 vpxor \T1, \XMM7, \XMM7
1290 vpxor \T1, \XMM8, \XMM8
1292 #######################################################################
1298 vmovdqu 16*1(arg1), \T1
1299 vaesenc \T1, \XMM1, \XMM1
1300 vaesenc \T1, \XMM2, \XMM2
1301 vaesenc \T1, \XMM3, \XMM3
1302 vaesenc \T1, \XMM4, \XMM4
1303 vaesenc \T1, \XMM5, \XMM5
1304 vaesenc \T1, \XMM6, \XMM6
1305 vaesenc \T1, \XMM7, \XMM7
1306 vaesenc \T1, \XMM8, \XMM8
1308 vmovdqu 16*2(arg1), \T1
1309 vaesenc \T1, \XMM1, \XMM1
1310 vaesenc \T1, \XMM2, \XMM2
1311 vaesenc \T1, \XMM3, \XMM3
1312 vaesenc \T1, \XMM4, \XMM4
1313 vaesenc \T1, \XMM5, \XMM5
1314 vaesenc \T1, \XMM6, \XMM6
1315 vaesenc \T1, \XMM7, \XMM7
1316 vaesenc \T1, \XMM8, \XMM8
1319 #######################################################################
1321 vmovdqu HashKey_8(arg2), \T5
1322 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1323 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1325 vpshufd $0b01001110, \T2, \T6
1328 vmovdqu HashKey_8_k(arg2), \T5
1329 vpclmulqdq $0x00, \T5, \T6, \T6
1331 vmovdqu 16*3(arg1), \T1
1332 vaesenc \T1, \XMM1, \XMM1
1333 vaesenc \T1, \XMM2, \XMM2
1334 vaesenc \T1, \XMM3, \XMM3
1335 vaesenc \T1, \XMM4, \XMM4
1336 vaesenc \T1, \XMM5, \XMM5
1337 vaesenc \T1, \XMM6, \XMM6
1338 vaesenc \T1, \XMM7, \XMM7
1339 vaesenc \T1, \XMM8, \XMM8
1341 vmovdqa TMP2(%rsp), \T1
1342 vmovdqu HashKey_7(arg2), \T5
1343 vpclmulqdq $0x11, \T5, \T1, \T3
1345 vpclmulqdq $0x00, \T5, \T1, \T3
1348 vpshufd $0b01001110, \T1, \T3
1350 vmovdqu HashKey_7_k(arg2), \T5
1351 vpclmulqdq $0x10, \T5, \T3, \T3
1354 vmovdqu 16*4(arg1), \T1
1355 vaesenc \T1, \XMM1, \XMM1
1356 vaesenc \T1, \XMM2, \XMM2
1357 vaesenc \T1, \XMM3, \XMM3
1358 vaesenc \T1, \XMM4, \XMM4
1359 vaesenc \T1, \XMM5, \XMM5
1360 vaesenc \T1, \XMM6, \XMM6
1361 vaesenc \T1, \XMM7, \XMM7
1362 vaesenc \T1, \XMM8, \XMM8
1364 #######################################################################
1366 vmovdqa TMP3(%rsp), \T1
1367 vmovdqu HashKey_6(arg2), \T5
1368 vpclmulqdq $0x11, \T5, \T1, \T3
1370 vpclmulqdq $0x00, \T5, \T1, \T3
1373 vpshufd $0b01001110, \T1, \T3
1375 vmovdqu HashKey_6_k(arg2), \T5
1376 vpclmulqdq $0x10, \T5, \T3, \T3
1379 vmovdqu 16*5(arg1), \T1
1380 vaesenc \T1, \XMM1, \XMM1
1381 vaesenc \T1, \XMM2, \XMM2
1382 vaesenc \T1, \XMM3, \XMM3
1383 vaesenc \T1, \XMM4, \XMM4
1384 vaesenc \T1, \XMM5, \XMM5
1385 vaesenc \T1, \XMM6, \XMM6
1386 vaesenc \T1, \XMM7, \XMM7
1387 vaesenc \T1, \XMM8, \XMM8
1389 vmovdqa TMP4(%rsp), \T1
1390 vmovdqu HashKey_5(arg2), \T5
1391 vpclmulqdq $0x11, \T5, \T1, \T3
1393 vpclmulqdq $0x00, \T5, \T1, \T3
1396 vpshufd $0b01001110, \T1, \T3
1398 vmovdqu HashKey_5_k(arg2), \T5
1399 vpclmulqdq $0x10, \T5, \T3, \T3
1402 vmovdqu 16*6(arg1), \T1
1403 vaesenc \T1, \XMM1, \XMM1
1404 vaesenc \T1, \XMM2, \XMM2
1405 vaesenc \T1, \XMM3, \XMM3
1406 vaesenc \T1, \XMM4, \XMM4
1407 vaesenc \T1, \XMM5, \XMM5
1408 vaesenc \T1, \XMM6, \XMM6
1409 vaesenc \T1, \XMM7, \XMM7
1410 vaesenc \T1, \XMM8, \XMM8
1413 vmovdqa TMP5(%rsp), \T1
1414 vmovdqu HashKey_4(arg2), \T5
1415 vpclmulqdq $0x11, \T5, \T1, \T3
1417 vpclmulqdq $0x00, \T5, \T1, \T3
1420 vpshufd $0b01001110, \T1, \T3
1422 vmovdqu HashKey_4_k(arg2), \T5
1423 vpclmulqdq $0x10, \T5, \T3, \T3
1426 vmovdqu 16*7(arg1), \T1
1427 vaesenc \T1, \XMM1, \XMM1
1428 vaesenc \T1, \XMM2, \XMM2
1429 vaesenc \T1, \XMM3, \XMM3
1430 vaesenc \T1, \XMM4, \XMM4
1431 vaesenc \T1, \XMM5, \XMM5
1432 vaesenc \T1, \XMM6, \XMM6
1433 vaesenc \T1, \XMM7, \XMM7
1434 vaesenc \T1, \XMM8, \XMM8
1436 vmovdqa TMP6(%rsp), \T1
1437 vmovdqu HashKey_3(arg2), \T5
1438 vpclmulqdq $0x11, \T5, \T1, \T3
1440 vpclmulqdq $0x00, \T5, \T1, \T3
1443 vpshufd $0b01001110, \T1, \T3
1445 vmovdqu HashKey_3_k(arg2), \T5
1446 vpclmulqdq $0x10, \T5, \T3, \T3
1450 vmovdqu 16*8(arg1), \T1
1451 vaesenc \T1, \XMM1, \XMM1
1452 vaesenc \T1, \XMM2, \XMM2
1453 vaesenc \T1, \XMM3, \XMM3
1454 vaesenc \T1, \XMM4, \XMM4
1455 vaesenc \T1, \XMM5, \XMM5
1456 vaesenc \T1, \XMM6, \XMM6
1457 vaesenc \T1, \XMM7, \XMM7
1458 vaesenc \T1, \XMM8, \XMM8
1460 vmovdqa TMP7(%rsp), \T1
1461 vmovdqu HashKey_2(arg2), \T5
1462 vpclmulqdq $0x11, \T5, \T1, \T3
1464 vpclmulqdq $0x00, \T5, \T1, \T3
1467 vpshufd $0b01001110, \T1, \T3
1469 vmovdqu HashKey_2_k(arg2), \T5
1470 vpclmulqdq $0x10, \T5, \T3, \T3
1473 #######################################################################
1475 vmovdqu 16*9(arg1), \T5
1476 vaesenc \T5, \XMM1, \XMM1
1477 vaesenc \T5, \XMM2, \XMM2
1478 vaesenc \T5, \XMM3, \XMM3
1479 vaesenc \T5, \XMM4, \XMM4
1480 vaesenc \T5, \XMM5, \XMM5
1481 vaesenc \T5, \XMM6, \XMM6
1482 vaesenc \T5, \XMM7, \XMM7
1483 vaesenc \T5, \XMM8, \XMM8
1485 vmovdqa TMP8(%rsp), \T1
1486 vmovdqu HashKey(arg2), \T5
1487 vpclmulqdq $0x11, \T5, \T1, \T3
1489 vpclmulqdq $0x00, \T5, \T1, \T3
1492 vpshufd $0b01001110, \T1, \T3
1494 vmovdqu HashKey_k(arg2), \T5
1495 vpclmulqdq $0x10, \T5, \T3, \T3
1501 vmovdqu 16*10(arg1), \T5
1507 vaesenc \T5, \XMM1, \XMM1
1508 vaesenc \T5, \XMM2, \XMM2
1509 vaesenc \T5, \XMM3, \XMM3
1510 vaesenc \T5, \XMM4, \XMM4
1511 vaesenc \T5, \XMM5, \XMM5
1512 vaesenc \T5, \XMM6, \XMM6
1513 vaesenc \T5, \XMM7, \XMM7
1514 vaesenc \T5, \XMM8, \XMM8
1516 vmovdqu 16*i(arg1), \T5
1525 vpxor 16*i(arg4, %r11), \T5, \T2
1527 vaesenclast \T2, reg_j, reg_j
1529 vaesenclast \T2, reg_j, \T3
1530 vmovdqu 16*i(arg4, %r11), reg_j
1531 vmovdqu \T3, 16*i(arg3, %r11)
1537 #######################################################################
1540 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
1541 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1543 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1547 #######################################################################
1548 #first phase of the reduction
1549 #######################################################################
1550 vpslld $31, \T7, \T2 # packed right shifting << 31
1551 vpslld $30, \T7, \T3 # packed right shifting << 30
1552 vpslld $25, \T7, \T4 # packed right shifting << 25
1554 vpxor \T3, \T2, \T2 # xor the shifted versions
1557 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1559 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1560 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1561 #######################################################################
1563 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1564 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1565 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1566 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1567 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1568 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1569 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1570 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1573 #######################################################################
1574 #second phase of the reduction
1575 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1576 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1577 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1578 vpxor \T3, \T2, \T2 # xor the shifted versions
1583 vpxor \T7, \T6, \T6 # the result is in T6
1584 #######################################################################
1586 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1587 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1588 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1589 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1590 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1591 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1592 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1593 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1596 vpxor \T6, \XMM1, \XMM1
1603 # GHASH the last 8 ciphertext blocks.
1604 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1609 vpshufd $0b01001110, \XMM1, \T2
1610 vpxor \XMM1, \T2, \T2
1611 vmovdqu HashKey_8(arg2), \T5
1612 vpclmulqdq $0x11, \T5, \XMM1, \T6
1613 vpclmulqdq $0x00, \T5, \XMM1, \T7
1615 vmovdqu HashKey_8_k(arg2), \T3
1616 vpclmulqdq $0x00, \T3, \T2, \XMM1
1618 ######################
1620 vpshufd $0b01001110, \XMM2, \T2
1621 vpxor \XMM2, \T2, \T2
1622 vmovdqu HashKey_7(arg2), \T5
1623 vpclmulqdq $0x11, \T5, \XMM2, \T4
1626 vpclmulqdq $0x00, \T5, \XMM2, \T4
1629 vmovdqu HashKey_7_k(arg2), \T3
1630 vpclmulqdq $0x00, \T3, \T2, \T2
1631 vpxor \T2, \XMM1, \XMM1
1633 ######################
1635 vpshufd $0b01001110, \XMM3, \T2
1636 vpxor \XMM3, \T2, \T2
1637 vmovdqu HashKey_6(arg2), \T5
1638 vpclmulqdq $0x11, \T5, \XMM3, \T4
1641 vpclmulqdq $0x00, \T5, \XMM3, \T4
1644 vmovdqu HashKey_6_k(arg2), \T3
1645 vpclmulqdq $0x00, \T3, \T2, \T2
1646 vpxor \T2, \XMM1, \XMM1
1648 ######################
1650 vpshufd $0b01001110, \XMM4, \T2
1651 vpxor \XMM4, \T2, \T2
1652 vmovdqu HashKey_5(arg2), \T5
1653 vpclmulqdq $0x11, \T5, \XMM4, \T4
1656 vpclmulqdq $0x00, \T5, \XMM4, \T4
1659 vmovdqu HashKey_5_k(arg2), \T3
1660 vpclmulqdq $0x00, \T3, \T2, \T2
1661 vpxor \T2, \XMM1, \XMM1
1663 ######################
1665 vpshufd $0b01001110, \XMM5, \T2
1666 vpxor \XMM5, \T2, \T2
1667 vmovdqu HashKey_4(arg2), \T5
1668 vpclmulqdq $0x11, \T5, \XMM5, \T4
1671 vpclmulqdq $0x00, \T5, \XMM5, \T4
1674 vmovdqu HashKey_4_k(arg2), \T3
1675 vpclmulqdq $0x00, \T3, \T2, \T2
1676 vpxor \T2, \XMM1, \XMM1
1678 ######################
1680 vpshufd $0b01001110, \XMM6, \T2
1681 vpxor \XMM6, \T2, \T2
1682 vmovdqu HashKey_3(arg2), \T5
1683 vpclmulqdq $0x11, \T5, \XMM6, \T4
1686 vpclmulqdq $0x00, \T5, \XMM6, \T4
1689 vmovdqu HashKey_3_k(arg2), \T3
1690 vpclmulqdq $0x00, \T3, \T2, \T2
1691 vpxor \T2, \XMM1, \XMM1
1693 ######################
1695 vpshufd $0b01001110, \XMM7, \T2
1696 vpxor \XMM7, \T2, \T2
1697 vmovdqu HashKey_2(arg2), \T5
1698 vpclmulqdq $0x11, \T5, \XMM7, \T4
1701 vpclmulqdq $0x00, \T5, \XMM7, \T4
1704 vmovdqu HashKey_2_k(arg2), \T3
1705 vpclmulqdq $0x00, \T3, \T2, \T2
1706 vpxor \T2, \XMM1, \XMM1
1708 ######################
1710 vpshufd $0b01001110, \XMM8, \T2
1711 vpxor \XMM8, \T2, \T2
1712 vmovdqu HashKey(arg2), \T5
1713 vpclmulqdq $0x11, \T5, \XMM8, \T4
1716 vpclmulqdq $0x00, \T5, \XMM8, \T4
1719 vmovdqu HashKey_k(arg2), \T3
1720 vpclmulqdq $0x00, \T3, \T2, \T2
1722 vpxor \T2, \XMM1, \XMM1
1723 vpxor \T6, \XMM1, \XMM1
1724 vpxor \T7, \XMM1, \T2
1729 vpslldq $8, \T2, \T4
1730 vpsrldq $8, \T2, \T2
1733 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1734 # the accumulated carry-less multiplications
1736 #######################################################################
1737 #first phase of the reduction
1738 vpslld $31, \T7, \T2 # packed right shifting << 31
1739 vpslld $30, \T7, \T3 # packed right shifting << 30
1740 vpslld $25, \T7, \T4 # packed right shifting << 25
1742 vpxor \T3, \T2, \T2 # xor the shifted versions
1745 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1747 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1748 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1749 #######################################################################
1752 #second phase of the reduction
1753 vpsrld $1, \T7, \T2 # packed left shifting >> 1
1754 vpsrld $2, \T7, \T3 # packed left shifting >> 2
1755 vpsrld $7, \T7, \T4 # packed left shifting >> 7
1756 vpxor \T3, \T2, \T2 # xor the shifted versions
1761 vpxor \T7, \T6, \T6 # the result is in T6
1765 #############################################################
1766 #void aesni_gcm_init_avx_gen2
1767 # (gcm_data *my_ctx_data,
1768 # gcm_context_data *data,
1769 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1770 # (from Security Association) concatenated with 8 byte
1771 # Initialisation Vector (from IPSec ESP Payload)
1772 # concatenated with 0x00000001. 16-byte aligned pointer. */
1773 # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1774 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1775 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1776 #############################################################
1777 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1779 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1782 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1784 ###############################################################################
1785 #void aesni_gcm_enc_update_avx_gen2(
1786 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1787 # gcm_context_data *data,
1788 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1789 # const u8 *in, /* Plaintext input */
1790 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
1791 ###############################################################################
1792 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1796 je key_256_enc_update
1798 je key_128_enc_update
1800 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1804 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1808 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1811 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1813 ###############################################################################
1814 #void aesni_gcm_dec_update_avx_gen2(
1815 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1816 # gcm_context_data *data,
1817 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1818 # const u8 *in, /* Ciphertext input */
1819 # u64 plaintext_len) /* Length of data in Bytes for decryption. */
1820 ###############################################################################
1821 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1825 je key_256_dec_update
1827 je key_128_dec_update
1829 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1833 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1837 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1840 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1842 ###############################################################################
1843 #void aesni_gcm_finalize_avx_gen2(
1844 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1845 # gcm_context_data *data,
1846 # u8 *auth_tag, /* Authenticated Tag output. */
1847 # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1848 # Valid values are 16 (most likely), 12 or 8. */
1849 ###############################################################################
1850 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1858 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1862 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1866 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1869 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
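###############################################################################
# A hedged sketch of how the avx_gen2 entry points above fit together
# (illustration only; in the kernel they are reached through the aesni glue
# code, which brackets them with kernel_fpu_begin()/kernel_fpu_end()):
#
#       aesni_gcm_init_avx_gen2(aes_ctx, &data, iv, hash_subkey, aad, aad_len);
#       aesni_gcm_enc_update_avx_gen2(aes_ctx, &data, out, in, plaintext_len);
#       /* ... further enc_update (or dec_update) calls ... */
#       aesni_gcm_finalize_avx_gen2(aes_ctx, &data, auth_tag, auth_tag_len);
###############################################################################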
1871 ###############################################################################
1872 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1873 # Input: A and B (128-bits each, bit-reflected)
1874 # Output: C = A*B*x mod poly, (i.e. >>1 )
1875 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1876 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1877 ###############################################################################
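# Unlike GHASH_MUL_AVX above, this variant computes all four 64-bit partial
# products directly and performs both reduction phases with VPCLMULQDQ against
# the precomputed POLY2 constant instead of the shift/xor sequence, which is
# also why the AVX2 path needs no HashKey_i_k Karatsuba values.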
1878 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1880 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1881 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1882 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1883 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1887 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1888 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1893 #######################################################################
1894 #first phase of the reduction
1895 vmovdqa POLY2(%rip), \T3
1897 vpclmulqdq $0x01, \GH, \T3, \T2
1898 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1900 vpxor \T2, \GH, \GH # first phase of the reduction complete
1901 #######################################################################
1902 #second phase of the reduction
1903 vpclmulqdq $0x00, \GH, \T3, \T2
1904 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1906 vpclmulqdq $0x10, \GH, \T3, \GH
1907 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1909 vpxor \T2, \GH, \GH # second phase of the reduction complete
1910 #######################################################################
1911 vpxor \T1, \GH, \GH # the result is in GH
1916 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1918 # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
1920 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1921 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1923 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1924 vmovdqu \T5, HashKey_3(arg2)
1926 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1927 vmovdqu \T5, HashKey_4(arg2)
1929 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1930 vmovdqu \T5, HashKey_5(arg2)
1932 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1933 vmovdqu \T5, HashKey_6(arg2)
1935 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1936 vmovdqu \T5, HashKey_7(arg2)
1938 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1939 vmovdqu \T5, HashKey_8(arg2)
1943 ## if a = number of total plaintext bytes and b = floor(a/16), then
1945 ## num_initial_blocks = b mod 8
1946 ## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
1947 ## r10, r11, r12, rax are clobbered
1948 ## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1950 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1951 i = (8-\num_initial_blocks)
1953 vmovdqu AadHash(arg2), reg_i
1955 # start AES for num_initial_blocks blocks
1956 vmovdqu CurCount(arg2), \CTR
1958 i = (9-\num_initial_blocks)
1960 .rep \num_initial_blocks
1961 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1963 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1968 vmovdqa (arg1), \T_key
1969 i = (9-\num_initial_blocks)
1971 .rep \num_initial_blocks
1972 vpxor \T_key, reg_i, reg_i
1980 vmovdqa 16*j(arg1), \T_key
1981 i = (9-\num_initial_blocks)
1983 .rep \num_initial_blocks
1984 vaesenc \T_key, reg_i, reg_i
1994 vmovdqa 16*j(arg1), \T_key
1995 i = (9-\num_initial_blocks)
1997 .rep \num_initial_blocks
1998 vaesenclast \T_key, reg_i, reg_i
2003 i = (9-\num_initial_blocks)
2005 .rep \num_initial_blocks
2006 vmovdqu (arg4, %r11), \T1
2007 vpxor \T1, reg_i, reg_i
2008 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
2009 # num_initial_blocks blocks
2014 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2020 i = (8-\num_initial_blocks)
2021 j = (9-\num_initial_blocks)
2024 .rep \num_initial_blocks
2025 vpxor reg_i, reg_j, reg_j
2026 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2031 # XMM8 has the combined result here
2033 vmovdqa \XMM8, TMP1(%rsp)
2037 jl _initial_blocks_done\@ # no need for precomputed constants
2039 ###############################################################################
2040 # HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
2041 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2043 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2045 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2047 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2049 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2051 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2053 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2055 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2057 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2059 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2061 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2063 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2065 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2067 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2069 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2071 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2073 vmovdqa (arg1), \T_key
2074 vpxor \T_key, \XMM1, \XMM1
2075 vpxor \T_key, \XMM2, \XMM2
2076 vpxor \T_key, \XMM3, \XMM3
2077 vpxor \T_key, \XMM4, \XMM4
2078 vpxor \T_key, \XMM5, \XMM5
2079 vpxor \T_key, \XMM6, \XMM6
2080 vpxor \T_key, \XMM7, \XMM7
2081 vpxor \T_key, \XMM8, \XMM8
2085 .rep \REP # do REP rounds
2086 vmovdqa 16*i(arg1), \T_key
2087 vaesenc \T_key, \XMM1, \XMM1
2088 vaesenc \T_key, \XMM2, \XMM2
2089 vaesenc \T_key, \XMM3, \XMM3
2090 vaesenc \T_key, \XMM4, \XMM4
2091 vaesenc \T_key, \XMM5, \XMM5
2092 vaesenc \T_key, \XMM6, \XMM6
2093 vaesenc \T_key, \XMM7, \XMM7
2094 vaesenc \T_key, \XMM8, \XMM8
2100 vmovdqa 16*i(arg1), \T_key
2101 vaesenclast \T_key, \XMM1, \XMM1
2102 vaesenclast \T_key, \XMM2, \XMM2
2103 vaesenclast \T_key, \XMM3, \XMM3
2104 vaesenclast \T_key, \XMM4, \XMM4
2105 vaesenclast \T_key, \XMM5, \XMM5
2106 vaesenclast \T_key, \XMM6, \XMM6
2107 vaesenclast \T_key, \XMM7, \XMM7
2108 vaesenclast \T_key, \XMM8, \XMM8
2110 vmovdqu (arg4, %r11), \T1
2111 vpxor \T1, \XMM1, \XMM1
2112 vmovdqu \XMM1, (arg3 , %r11)
2117 vmovdqu 16*1(arg4, %r11), \T1
2118 vpxor \T1, \XMM2, \XMM2
2119 vmovdqu \XMM2, 16*1(arg3 , %r11)
2124 vmovdqu 16*2(arg4, %r11), \T1
2125 vpxor \T1, \XMM3, \XMM3
2126 vmovdqu \XMM3, 16*2(arg3 , %r11)
2131 vmovdqu 16*3(arg4, %r11), \T1
2132 vpxor \T1, \XMM4, \XMM4
2133 vmovdqu \XMM4, 16*3(arg3 , %r11)
2138 vmovdqu 16*4(arg4, %r11), \T1
2139 vpxor \T1, \XMM5, \XMM5
2140 vmovdqu \XMM5, 16*4(arg3 , %r11)
2145 vmovdqu 16*5(arg4, %r11), \T1
2146 vpxor \T1, \XMM6, \XMM6
2147 vmovdqu \XMM6, 16*5(arg3 , %r11)
2152 vmovdqu 16*6(arg4, %r11), \T1
2153 vpxor \T1, \XMM7, \XMM7
2154 vmovdqu \XMM7, 16*6(arg3 , %r11)
2159 vmovdqu 16*7(arg4, %r11), \T1
2160 vpxor \T1, \XMM8, \XMM8
2161 vmovdqu \XMM8, 16*7(arg3 , %r11)
2168 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2169 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2170 # the corresponding ciphertext
2171 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2172 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2173 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2174 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2175 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2176 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2177 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2179 ###############################################################################
2181 _initial_blocks_done\@:
2188 # encrypt 8 blocks at a time
2189 # ghash the 8 previously encrypted ciphertext blocks
2190 # arg1, arg3, arg4 are used as pointers only, not modified
2191 # r11 is the data offset value
2192 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
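        # The in_order path above adds ONE to the counter while the block is
        # still in its native little-endian layout and then byte-swaps it;
        # the second path uses ONEf, which appears to be the same increment
        # pre-shuffled into the swapped layout, so no per-block vpshufb is
        # needed there.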
        #######################################################################

        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
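        # The 128x128-bit carry-less multiply of a saved ciphertext block by
        # HashKey_8 is built from four 64x64-bit products: writing
        # a = a1*x^64 + a0 and b = b1*x^64 + b0,
        #       a*b = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0.
        # T4/T7 collect the high and low parts and T6/T5 the two middle
        # terms; the remaining seven blocks below are multiplied the same way
        # and folded into these running sums.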
        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vpclmulqdq      $0x00, \T5, \T1, \T3

        vpclmulqdq      $0x01, \T5, \T1, \T3

        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################
        vmovdqa TMP3(%rsp), \T1
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vpclmulqdq      $0x00, \T5, \T1, \T3

        vpclmulqdq      $0x01, \T5, \T1, \T3

        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vpclmulqdq      $0x00, \T5, \T1, \T3

        vpclmulqdq      $0x01, \T5, \T1, \T3

        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP5(%rsp), \T1
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vpclmulqdq      $0x00, \T5, \T1, \T3

        vpclmulqdq      $0x01, \T5, \T1, \T3

        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vpclmulqdq      $0x00, \T5, \T1, \T3

        vpclmulqdq      $0x01, \T5, \T1, \T3

        vpclmulqdq      $0x10, \T5, \T1, \T3

        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3

        vpclmulqdq      $0x00, \T5, \T1, \T3

        vpclmulqdq      $0x01, \T5, \T1, \T3

        vpclmulqdq      $0x10, \T5, \T1, \T3

        #######################################################################
        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3

        vpclmulqdq      $0x01, \T5, \T1, \T3

        vpclmulqdq      $0x10, \T5, \T1, \T3

        vpclmulqdq      $0x11, \T5, \T1, \T3

        vmovdqu 16*10(arg1), \T5

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5

        vpxor   16*i(arg4, %r11), \T5, \T2

        vaesenclast     \T2, reg_j, reg_j

        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j
        vmovdqu \T3, 16*i(arg3, %r11)
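        # The two vaesenclast forms above are assumed to be the ENC and DEC
        # sides of an elided .if \ENC_DEC block: for encryption the last
        # round key is pre-xored with the input block, so reg_j ends up
        # holding the ciphertext that is both stored and GHASHed next; for
        # decryption the recovered plaintext lands in \T3 and is stored,
        # while reg_j is reloaded with the ciphertext from the input buffer
        # so GHASH still runs over ciphertext.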
        #######################################################################

        vpslldq $8, \T6, \T3                    # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                    # shift-R T6 2 DWs
        vpxor   \T6, \T1, \T1                   # accumulate the results in T1:T7

        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################

        vmovdqu \XMM1, 16*0(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)          # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)          # Write to the Ciphertext buffer

        #######################################################################
        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T1, \T1                   # the result is in T1
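        # The two pclmulqdq-by-POLY2 steps above fold the 256-bit carry-less
        # product <T1:T7> back down to 128 bits; in the bit-reflected
        # representation used by this code that is equivalent to reduction
        # modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1. The eight
        # ciphertext stores are placed between the two phases, presumably to
        # hide the multiply latency behind the memory traffic.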
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vpxor   \T1, \XMM1, \XMM1

.endm


# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        vmovdqu HashKey_8(arg2), \T5

        vpshufd $0b01001110, \XMM1, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM1, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1
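        # Karatsuba: each block costs three pclmulqdq instead of four. With
        # a = a1*x^64 + a0 and b = b1*x^64 + b0, the middle term
        # a1*b0 + a0*b1 equals (a1 ^ a0)*(b1 ^ b0) ^ a1*b1 ^ a0*b0, so only
        # the high product (\T6), the low product (\T7) and the product of
        # the xored halves (\XMM1) are kept; the high/low corrections are
        # applied after all eight blocks have been accumulated.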
        ######################

        vmovdqu HashKey_7(arg2), \T5
        vpshufd $0b01001110, \XMM2, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM2, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM2, \T4

        vpclmulqdq      $0x00, \T5, \XMM2, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_6(arg2), \T5
        vpshufd $0b01001110, \XMM3, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM3, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM3, \T4

        vpclmulqdq      $0x00, \T5, \XMM3, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_5(arg2), \T5
        vpshufd $0b01001110, \XMM4, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM4, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM4, \T4

        vpclmulqdq      $0x00, \T5, \XMM4, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_4(arg2), \T5
        vpshufd $0b01001110, \XMM5, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM5, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM5, \T4

        vpclmulqdq      $0x00, \T5, \XMM5, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_3(arg2), \T5
        vpshufd $0b01001110, \XMM6, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM6, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM6, \T4

        vpclmulqdq      $0x00, \T5, \XMM6, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey_2(arg2), \T5
        vpshufd $0b01001110, \XMM7, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM7, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM7, \T4

        vpclmulqdq      $0x00, \T5, \XMM7, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1

        ######################

        vmovdqu HashKey(arg2), \T5
        vpshufd $0b01001110, \XMM8, \T2
        vpshufd $0b01001110, \T5, \T3
        vpxor   \XMM8, \T2, \T2

        vpclmulqdq      $0x11, \T5, \XMM8, \T4

        vpclmulqdq      $0x00, \T5, \XMM8, \T4

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1
        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2

        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                # accumulated carry-less multiplications

        #######################################################################
        # first phase of the reduction
        vmovdqa POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2            # shift-L T2 2 DWs

        vpxor   \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################

        # second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2            # shift-R T2 1 DW (shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4            # shift-L T4 1 DW (shift-L 1-DW to obtain result with no shifts)

        vpxor   \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor   \T4, \T6, \T6                   # the result is in T6
.endm
#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data        *my_ctx_data,
#         gcm_context_data *data,
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (from IPSec ESP Payload)
#                        concatenated with 0x00000001. 16-byte aligned pointer. */
#        u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen4)

        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2

SYM_FUNC_END(aesni_gcm_init_avx_gen4)
###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)

        je      key_256_enc_update4

        je      key_128_enc_update4

        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11

key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9

key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13

SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
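# The trailing 9/11/13 passed to GCM_ENC_DEC appears to be the number of full
# vaesenc rounds, i.e. AES-128, AES-192 and AES-256 respectively (the final
# round is the separately issued vaesenclast); the unlabeled fall-through
# path is therefore the AES-192 case.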
###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)

        je      key_256_dec_update4

        je      key_128_dec_update4

        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11

key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9

key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13

SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)

        je      key_256_finalize4

        je      key_128_finalize4

        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4

key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4

key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4

SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
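###############################################################################
# Typical call sequence, as suggested by the prototypes above:
# aesni_gcm_init_avx_gen4() once per request to consume the IV/salt, hash
# subkey and AAD, then one or more aesni_gcm_enc_update_avx_gen4() /
# aesni_gcm_dec_update_avx_gen4() calls over the bulk data, and finally
# aesni_gcm_finalize_avx_gen4() to produce the authentication tag.
###############################################################################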