1 ########################################################################
2 # Copyright (c) 2013, Intel Corporation
4 # This software is available to you under a choice of one of two
5 # licenses. You may choose to be licensed under the terms of the GNU
6 # General Public License (GPL) Version 2, available from the file
7 # COPYING in the main directory of this source tree, or the
8 # OpenIB.org BSD license below:
10 # Redistribution and use in source and binary forms, with or without
11 # modification, are permitted provided that the following conditions are
14 # * Redistributions of source code must retain the above copyright
15 # notice, this list of conditions and the following disclaimer.
17 # * Redistributions in binary form must reproduce the above copyright
18 # notice, this list of conditions and the following disclaimer in the
19 # documentation and/or other materials provided with the
22 # * Neither the name of the Intel Corporation nor the names of its
23 # contributors may be used to endorse or promote products derived from
24 # this software without specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31 # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
47 ## This code was derived and highly optimized from the code described in paper:
48 ## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation
49 ## on Intel Architecture Processors. August, 2010
50 ## The details of the implementation is explained in:
51 ## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode
52 ## on Intel Architecture Processors. October, 2012.
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
73 ## AAD padded to 128 bits with 0
## for example, assume AAD is a u32 vector; if AAD is 8 bytes:
## padded AAD in xmm register = {A1 A0 0 0}
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
90 ## AAD Format with 32-bit Sequence Number
92 ## if AAD is 12 bytes:
## AAD[3] = {A0, A1, A2};
94 ## padded AAD in xmm register = {A2 A1 A0 0}
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
107 ## AAD Format with 64-bit Extended Sequence Number
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentations are used: one tab is
## for the GHASH part, two tabs are for the AES part.
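##
## Quick recap of the math (informal): GHASH works in GF(2^128) with the
## bit-reflected polynomial above. With hash key H = E(K, 0^128) and 128-bit
## blocks X1..Xn, the running hash is
##      Y0 = 0,   Yi = (Y(i-1) xor Xi) * H     (carry-less, reduced mod poly)
## The code below evaluates the carry-less products with vpclmulqdq and folds
## the 256-bit result back to 128 bits with a two-phase reduction, following
## the papers cited above.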
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
125 # constants in mergeable sections, linker can reorder and merge
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
128 POLY: .octa 0xC2000000000000000000000000000001
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
136 TWOONE: .octa 0x00000001000000000000000000000001
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
144 ONE: .octa 0x00000000000000000000000000000001
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
148 ONEf: .octa 0x01000000000000000000000000000000
150 # order of these constants should not change.
151 # more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
152 .section .rodata, "a", @progbits
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
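# Why this ordering matters (r13 below is the data length mod 16): the code
# sets %r12 = SHIFT_MASK + 16 - r13, so that
#   (%r12)                  is a vpshufb mask that shifts a block right by
#                           16-r13 bytes, and
#   ALL_F-SHIFT_MASK(%r12)  straddles ALL_F and the zero block, yielding a
#                           byte mask whose low r13 bytes are 0xff and whose
#                           top 16-r13 bytes are 0x00, used to drop the junk
#                           bytes of a partial block before GHASH.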
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
187 #define InLen (16*1)+8
188 #define PBlockEncKey 16*2
190 #define CurCount 16*4
191 #define PBlockLen 16*5
193 HashKey = 16*6 # store HashKey <<1 mod poly here
194 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
195 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
196 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
197 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
198 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
199 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
200 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
201 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
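# All of the offsets above (together with AadHash, AadLen, OrigIV, etc. used
# below) index into the gcm_context_data area passed in arg2: the low slots
# carry per-request state (lengths, counter, partial-block data) across the
# init/update/finalize calls, and the HashKey_* slots cache the precomputed
# powers of the hash key.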
216 #define arg7 STACK_OFFSET+8*1(%r14)
217 #define arg8 STACK_OFFSET+8*2(%r14)
218 #define arg9 STACK_OFFSET+8*3(%r14)
219 #define arg10 STACK_OFFSET+8*4(%r14)
220 #define keysize 2*15*16(arg1)
230 .macro define_reg r n
# need to push 4 registers onto the stack; STACK_OFFSET below accounts for them
244 TMP1 = 16*0 # Temporary storage for AAD
245 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246 TMP3 = 16*2 # Temporary storage for AES State 3
247 TMP4 = 16*3 # Temporary storage for AES State 4
248 TMP5 = 16*4 # Temporary storage for AES State 5
249 TMP6 = 16*5 # Temporary storage for AES State 6
250 TMP7 = 16*6 # Temporary storage for AES State 7
251 TMP8 = 16*7 # Temporary storage for AES State 8
253 VARIABLE_OFFSET = 16*8
255 ################################
257 ################################
# the number of pushes must equal STACK_OFFSET
270 sub $VARIABLE_OFFSET, %rsp
271 and $~63, %rsp # align rsp to 64 bytes
283 # Encryption of a single block
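# \REP is the number of middle AES rounds (9 for AES-128, 11 for AES-192,
# 13 for AES-256, selected by the callers below), so a block is processed as
# one key-whitening xor, \REP vaesenc rounds and a final vaesenclast.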
284 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
285 vpxor (arg1), \XMM0, \XMM0
289 vaesenc 16*i(arg1), \XMM0, \XMM0
293 vaesenclast 16*i(arg1), \XMM0, \XMM0
# combined GCM encrypt and decrypt macro
297 # clobbering all xmm registers
298 # clobbering r10, r11, r12, r13, r14, r15
299 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300 vmovdqu AadHash(arg2), %xmm8
301 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
302 add arg5, InLen(arg2)
304 # initialize the data pointer offset as zero
307 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
310 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
311 and $-16, %r13 # r13 = r13 - (r13 mod 16)
316 jz _initial_num_blocks_is_0\@
319 je _initial_num_blocks_is_7\@
321 je _initial_num_blocks_is_6\@
323 je _initial_num_blocks_is_5\@
325 je _initial_num_blocks_is_4\@
327 je _initial_num_blocks_is_3\@
329 je _initial_num_blocks_is_2\@
331 jmp _initial_num_blocks_is_1\@
333 _initial_num_blocks_is_7\@:
334 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
336 jmp _initial_blocks_encrypted\@
338 _initial_num_blocks_is_6\@:
339 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
341 jmp _initial_blocks_encrypted\@
343 _initial_num_blocks_is_5\@:
344 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
346 jmp _initial_blocks_encrypted\@
348 _initial_num_blocks_is_4\@:
349 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
351 jmp _initial_blocks_encrypted\@
353 _initial_num_blocks_is_3\@:
354 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
356 jmp _initial_blocks_encrypted\@
358 _initial_num_blocks_is_2\@:
359 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
361 jmp _initial_blocks_encrypted\@
363 _initial_num_blocks_is_1\@:
364 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
366 jmp _initial_blocks_encrypted\@
368 _initial_num_blocks_is_0\@:
369 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
372 _initial_blocks_encrypted\@:
374 je _zero_cipher_left\@
377 je _eight_cipher_left\@
384 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
397 jne _encrypt_by_8_new\@
399 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400 jmp _eight_cipher_left\@
403 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
405 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
406 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
409 jne _encrypt_by_8_new\@
411 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
416 _eight_cipher_left\@:
417 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
421 vmovdqu %xmm14, AadHash(arg2)
422 vmovdqu %xmm9, CurCount(arg2)
426 and $15, %r13 # r13 = (arg5 mod 16)
428 je _multiple_of_16_bytes\@
430 # handle the last <16 Byte block separately
432 mov %r13, PBlockLen(arg2)
434 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
435 vmovdqu %xmm9, CurCount(arg2)
436 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
438 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
439 vmovdqu %xmm9, PBlockEncKey(arg2)
442 jge _large_enough_update\@
444 lea (arg4,%r11,1), %r10
447 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
449 lea SHIFT_MASK+16(%rip), %r12
450 sub %r13, %r12 # adjust the shuffle mask pointer to be
451 # able to shift 16-r13 bytes (r13 is the
452 # number of bytes in plaintext mod 16)
454 jmp _final_ghash_mul\@
456 _large_enough_update\@:
460 # receive the last <16 Byte block
461 vmovdqu (arg4, %r11, 1), %xmm1
466 lea SHIFT_MASK+16(%rip), %r12
467 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
468 # (r13 is the number of bytes in plaintext mod 16)
470 # get the appropriate shuffle mask
471 vmovdqu (%r12), %xmm2
472 # shift right 16-r13 bytes
473 vpshufb %xmm2, %xmm1, %xmm1
478 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
479 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
480 # mask out top 16-r13 bytes of xmm9
481 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
482 vpand %xmm1, %xmm2, %xmm2
483 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
484 vpxor %xmm2, %xmm14, %xmm14
486 vmovdqu %xmm14, AadHash(arg2)
488 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
489 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
490 # mask out top 16-r13 bytes of xmm9
491 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
492 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
493 vpxor %xmm9, %xmm14, %xmm14
495 vmovdqu %xmm14, AadHash(arg2)
496 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
500 #############################
504 jle _less_than_8_bytes_left\@
506 mov %rax, (arg3 , %r11)
508 vpsrldq $8, %xmm9, %xmm9
512 _less_than_8_bytes_left\@:
513 movb %al, (arg3 , %r11)
517 jne _less_than_8_bytes_left\@
518 #############################
520 _multiple_of_16_bytes\@:
# GCM_COMPLETE: finishes the tag computation, folding in any last partial block
# Output: Authentication Tag (AUTH_TAG)
526 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
527 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
528 vmovdqu AadHash(arg2), %xmm14
529 vmovdqu HashKey(arg2), %xmm13
531 mov PBlockLen(arg2), %r12
535 #GHASH computation for the last <16 Byte block
536 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
539 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
540 shl $3, %r12 # convert into number of bits
541 vmovd %r12d, %xmm15 # len(A) in xmm15
543 mov InLen(arg2), %r12
shl $3, %r12 # len(C) in bits (*8)
546 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
547 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
549 vpxor %xmm15, %xmm14, %xmm14
550 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
551 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
553 vmovdqu OrigIV(arg2), %xmm9
555 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
557 vpxor %xmm14, %xmm9, %xmm9
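	# Per the GCM spec the tag is T = GHASH(H, A, C) xor E(K, Y0), where
	# Y0 is the saved OrigIV (IV || 0x00000001); xmm9 now holds T and is
	# truncated to \AUTH_TAG_LEN bytes below.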
562 mov \AUTH_TAG, %r10 # r10 = authTag
563 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
576 vpsrldq $8, %xmm9, %xmm9
584 vpsrldq $4, %xmm9, %xmm9
601 vmovdqu %xmm9, (%r10)
606 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
608 mov \AAD, %r10 # r10 = AAD
609 mov \AADLEN, %r12 # r12 = aadLen
620 vpshufb SHUF_MASK(%rip), \T7, \T7
622 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
627 jge _get_AAD_blocks\@
634 /* read the last <16B of AAD. since we have at least 4B of
635 data right after the AAD (the ICV, and maybe some CT), we can
636 read 4B/8B blocks safely, and then get rid of the extra stuff */
654 vpslldq $12, \T1, \T1
658 /* finalize: shift out the extra bytes we read, and align
659 left. since pslldq can only shift by an immediate, we use
660 vpshufb and an array of shuffle masks */
663 vmovdqu aad_shift_arr(%r11), \T1
664 vpshufb \T1, \T7, \T7
665 _get_AAD_rest_final\@:
666 vpshufb SHUF_MASK(%rip), \T7, \T7
668 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
671 vmovdqu \T7, AadHash(arg2)
674 .macro INIT GHASH_MUL PRECOMPUTE
676 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
678 mov %r11, InLen(arg2) # ctx_data.in_length = 0
680 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
681 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
684 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
686 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
687 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
689 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
691 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
692 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
694 vpsllq $1, %xmm6, %xmm6
695 vpsrlq $63, %xmm2, %xmm2
697 vpslldq $8, %xmm2, %xmm2
698 vpsrldq $8, %xmm1, %xmm1
699 vpor %xmm2, %xmm6, %xmm6
701 vpshufd $0b00100100, %xmm1, %xmm2
702 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
703 vpand POLY(%rip), %xmm2, %xmm2
704 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
705 #######################################################################
706 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
708 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
710 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
714 # Reads DLEN bytes starting at DPTR and stores in XMMDst
715 # where 0 < DLEN < 16
716 # Clobbers %rax, DLEN
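# Roughly equivalent C, as a sketch only (the helper name is illustrative;
# it needs <string.h> and <immintrin.h>):
#
#	static __m128i read_partial_block(const unsigned char *dptr,
#					  unsigned long dlen /* 1..15 */)
#	{
#		unsigned char tmp[16] = { 0 };	/* zero-fill the unused tail  */
#		memcpy(tmp, dptr, dlen);	/* never reads past dptr+dlen */
#		return _mm_loadu_si128((const __m128i *)tmp);
#	}
#
# The assembly below builds the same value in %rax and vpinsrq instead of
# going through a stack buffer.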
717 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
718 vpxor \XMMDst, \XMMDst, \XMMDst
723 vpinsrq $0, %rax, \XMMDst, \XMMDst
725 jz _done_read_partial_block_\@
729 mov 7(\DPTR, \DLEN, 1), %al
731 jnz _read_next_byte_\@
732 vpinsrq $1, %rax, \XMMDst, \XMMDst
733 jmp _done_read_partial_block_\@
736 _read_next_byte_lt8_\@:
738 mov -1(\DPTR, \DLEN, 1), %al
740 jnz _read_next_byte_lt8_\@
741 vpinsrq $0, %rax, \XMMDst, \XMMDst
742 _done_read_partial_block_\@:
745 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
746 # between update calls.
# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
749 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
750 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
752 mov PBlockLen(arg2), %r13
754 je _partial_block_done_\@ # Leave Macro if no partial blocks
755 # Read in input data without over reading
756 cmp $16, \PLAIN_CYPH_LEN
757 jl _fewer_than_16_bytes_\@
758 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
761 _fewer_than_16_bytes_\@:
762 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
763 mov \PLAIN_CYPH_LEN, %r12
764 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
766 mov PBlockLen(arg2), %r13
768 _data_read_\@: # Finished reading in data
770 vmovdqu PBlockEncKey(arg2), %xmm9
771 vmovdqu HashKey(arg2), %xmm13
773 lea SHIFT_MASK(%rip), %r12
775 # adjust the shuffle mask pointer to be able to shift r13 bytes
# (16-r13 is the number of bytes in plaintext mod 16)
778 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
779 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
783 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
785 mov \PLAIN_CYPH_LEN, %r10
787 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
# Determine if the partial block is not being filled and
# shift the mask accordingly
791 jge _no_extra_mask_1_\@
795 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
796 # get the appropriate mask to mask out bottom r13 bytes of xmm9
797 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
799 vpand %xmm1, %xmm3, %xmm3
800 vmovdqa SHUF_MASK(%rip), %xmm10
801 vpshufb %xmm10, %xmm3, %xmm3
802 vpshufb %xmm2, %xmm3, %xmm3
803 vpxor %xmm3, \AAD_HASH, \AAD_HASH
806 jl _partial_incomplete_1_\@
808 # GHASH computation for the last <16 Byte block
809 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
812 mov %rax, PBlockLen(arg2)
814 _partial_incomplete_1_\@:
815 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
817 vmovdqu \AAD_HASH, AadHash(arg2)
819 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
821 mov \PLAIN_CYPH_LEN, %r10
823 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
# Determine if the partial block is not being filled and
# shift the mask accordingly
827 jge _no_extra_mask_2_\@
831 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
832 # get the appropriate mask to mask out bottom r13 bytes of xmm9
833 vpand %xmm1, %xmm9, %xmm9
835 vmovdqa SHUF_MASK(%rip), %xmm1
836 vpshufb %xmm1, %xmm9, %xmm9
837 vpshufb %xmm2, %xmm9, %xmm9
838 vpxor %xmm9, \AAD_HASH, \AAD_HASH
841 jl _partial_incomplete_2_\@
843 # GHASH computation for the last <16 Byte block
844 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
847 mov %rax, PBlockLen(arg2)
849 _partial_incomplete_2_\@:
850 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
852 vmovdqu \AAD_HASH, AadHash(arg2)
854 vmovdqa SHUF_MASK(%rip), %xmm10
855 # shuffle xmm9 back to output as ciphertext
856 vpshufb %xmm10, %xmm9, %xmm9
857 vpshufb %xmm2, %xmm9, %xmm9
859 # output encrypted Bytes
864 # Set r13 to be the number of bytes to write out
868 mov \PLAIN_CYPH_LEN, %r13
873 jle _less_than_8_bytes_left_\@
875 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
880 _less_than_8_bytes_left_\@:
881 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
885 jne _less_than_8_bytes_left_\@
886 _partial_block_done_\@:
887 .endm # PARTIAL_BLOCK
890 ###############################################################################
891 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
892 # Input: A and B (128-bits each, bit-reflected)
893 # Output: C = A*B*x mod poly, (i.e. >>1 )
894 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
895 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
896 ###############################################################################
897 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
899 vpshufd $0b01001110, \GH, \T2
900 vpshufd $0b01001110, \HK, \T3
901 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
902 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
904 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
905 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
906 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
908 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
910 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
911 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
913 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
915 #first phase of the reduction
vpslld $31, \GH, \T2 # packed left shifting << 31
vpslld $30, \GH, \T3 # packed left shifting << 30
vpslld $25, \GH, \T4 # packed left shifting << 25
920 vpxor \T3, \T2, \T2 # xor the shifted versions
923 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
925 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
926 vpxor \T2, \GH, \GH # first phase of the reduction complete
928 #second phase of the reduction
vpsrld $1, \GH, \T2 # packed right shifting >> 1
vpsrld $2, \GH, \T3 # packed right shifting >> 2
vpsrld $7, \GH, \T4 # packed right shifting >> 7
933 vpxor \T3, \T2, \T2 # xor the shifted versions
938 vpxor \T1, \GH, \GH # the result is in GH
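	#######################################################################
	# Karatsuba recap (informal): with GH = a1:a0 and HK = b1:b0, the three
	# carry-less products are a1*b1, a0*b0 and (a1^a0)*(b1^b0), and the
	# middle term is recovered as
	#       (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0 = a1*b0 ^ a0*b1
	# The two reduction phases above then fold the 256-bit product modulo
	# x^128 + x^127 + x^126 + x^121 + 1 using only shifts and xors.
	#######################################################################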
943 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
948 vpshufd $0b01001110, \T5, \T1
950 vmovdqu \T1, HashKey_k(arg2)
952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
953 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
954 vpshufd $0b01001110, \T5, \T1
956 vmovdqu \T1, HashKey_2_k(arg2)
958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
959 vmovdqu \T5, HashKey_3(arg2)
960 vpshufd $0b01001110, \T5, \T1
962 vmovdqu \T1, HashKey_3_k(arg2)
964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
965 vmovdqu \T5, HashKey_4(arg2)
966 vpshufd $0b01001110, \T5, \T1
968 vmovdqu \T1, HashKey_4_k(arg2)
970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
971 vmovdqu \T5, HashKey_5(arg2)
972 vpshufd $0b01001110, \T5, \T1
974 vmovdqu \T1, HashKey_5_k(arg2)
976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
977 vmovdqu \T5, HashKey_6(arg2)
978 vpshufd $0b01001110, \T5, \T1
980 vmovdqu \T1, HashKey_6_k(arg2)
982 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
983 vmovdqu \T5, HashKey_7(arg2)
984 vpshufd $0b01001110, \T5, \T1
986 vmovdqu \T1, HashKey_7_k(arg2)
988 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
989 vmovdqu \T5, HashKey_8(arg2)
990 vpshufd $0b01001110, \T5, \T1
992 vmovdqu \T1, HashKey_8_k(arg2)
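	#######################################################################
	# With HashKey^1..HashKey^8 (and their Karatsuba halves) cached, the
	# main loop can fold eight blocks per iteration using the aggregated
	# form
	#       Y_new = (Y_old xor C1)*H^8 xor C2*H^7 xor ... xor C8*H
	# so only one reduction is needed for every eight ciphertext blocks.
	#######################################################################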
## if a = number of total plaintext bytes, then
## num_initial_blocks = floor(a/16) mod 8
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
1003 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1004 i = (8-\num_initial_blocks)
1006 vmovdqu AadHash(arg2), reg_i
1008 # start AES for num_initial_blocks blocks
1009 vmovdqu CurCount(arg2), \CTR
1011 i = (9-\num_initial_blocks)
1013 .rep \num_initial_blocks
1014 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1016 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1021 vmovdqa (arg1), \T_key
1022 i = (9-\num_initial_blocks)
1024 .rep \num_initial_blocks
1025 vpxor \T_key, reg_i, reg_i
1033 vmovdqa 16*j(arg1), \T_key
1034 i = (9-\num_initial_blocks)
1036 .rep \num_initial_blocks
1037 vaesenc \T_key, reg_i, reg_i
1046 vmovdqa 16*j(arg1), \T_key
1047 i = (9-\num_initial_blocks)
1049 .rep \num_initial_blocks
1050 vaesenclast \T_key, reg_i, reg_i
1055 i = (9-\num_initial_blocks)
1057 .rep \num_initial_blocks
1058 vmovdqu (arg4, %r11), \T1
1059 vpxor \T1, reg_i, reg_i
1060 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1065 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1071 i = (8-\num_initial_blocks)
1072 j = (9-\num_initial_blocks)
1075 .rep \num_initial_blocks
1076 vpxor reg_i, reg_j, reg_j
1077 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1082 # XMM8 has the combined result here
1084 vmovdqa \XMM8, TMP1(%rsp)
1088 jl _initial_blocks_done\@ # no need for precomputed constants
1090 ###############################################################################
# at least 8 full blocks remain: encrypt the first stretch of 8 blocks now;
# their GHASH is deferred and folded in by the main loop
1092 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1094 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1096 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1098 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1100 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1102 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1104 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1106 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1108 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1110 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1112 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1114 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1116 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1118 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1120 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1122 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1124 vmovdqa (arg1), \T_key
1125 vpxor \T_key, \XMM1, \XMM1
1126 vpxor \T_key, \XMM2, \XMM2
1127 vpxor \T_key, \XMM3, \XMM3
1128 vpxor \T_key, \XMM4, \XMM4
1129 vpxor \T_key, \XMM5, \XMM5
1130 vpxor \T_key, \XMM6, \XMM6
1131 vpxor \T_key, \XMM7, \XMM7
1132 vpxor \T_key, \XMM8, \XMM8
1136 .rep \REP # do REP rounds
1137 vmovdqa 16*i(arg1), \T_key
1138 vaesenc \T_key, \XMM1, \XMM1
1139 vaesenc \T_key, \XMM2, \XMM2
1140 vaesenc \T_key, \XMM3, \XMM3
1141 vaesenc \T_key, \XMM4, \XMM4
1142 vaesenc \T_key, \XMM5, \XMM5
1143 vaesenc \T_key, \XMM6, \XMM6
1144 vaesenc \T_key, \XMM7, \XMM7
1145 vaesenc \T_key, \XMM8, \XMM8
1150 vmovdqa 16*i(arg1), \T_key
1151 vaesenclast \T_key, \XMM1, \XMM1
1152 vaesenclast \T_key, \XMM2, \XMM2
1153 vaesenclast \T_key, \XMM3, \XMM3
1154 vaesenclast \T_key, \XMM4, \XMM4
1155 vaesenclast \T_key, \XMM5, \XMM5
1156 vaesenclast \T_key, \XMM6, \XMM6
1157 vaesenclast \T_key, \XMM7, \XMM7
1158 vaesenclast \T_key, \XMM8, \XMM8
1160 vmovdqu (arg4, %r11), \T1
1161 vpxor \T1, \XMM1, \XMM1
1162 vmovdqu \XMM1, (arg3 , %r11)
1167 vmovdqu 16*1(arg4, %r11), \T1
1168 vpxor \T1, \XMM2, \XMM2
1169 vmovdqu \XMM2, 16*1(arg3 , %r11)
1174 vmovdqu 16*2(arg4, %r11), \T1
1175 vpxor \T1, \XMM3, \XMM3
1176 vmovdqu \XMM3, 16*2(arg3 , %r11)
1181 vmovdqu 16*3(arg4, %r11), \T1
1182 vpxor \T1, \XMM4, \XMM4
1183 vmovdqu \XMM4, 16*3(arg3 , %r11)
1188 vmovdqu 16*4(arg4, %r11), \T1
1189 vpxor \T1, \XMM5, \XMM5
1190 vmovdqu \XMM5, 16*4(arg3 , %r11)
1195 vmovdqu 16*5(arg4, %r11), \T1
1196 vpxor \T1, \XMM6, \XMM6
1197 vmovdqu \XMM6, 16*5(arg3 , %r11)
1202 vmovdqu 16*6(arg4, %r11), \T1
1203 vpxor \T1, \XMM7, \XMM7
1204 vmovdqu \XMM7, 16*6(arg3 , %r11)
1209 vmovdqu 16*7(arg4, %r11), \T1
1210 vpxor \T1, \XMM8, \XMM8
1211 vmovdqu \XMM8, 16*7(arg3 , %r11)
1218 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1219 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1220 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1221 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1222 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1223 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1224 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1225 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1226 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1228 ###############################################################################
1230 _initial_blocks_done\@:
1234 # encrypt 8 blocks at a time
1235 # ghash the 8 previously encrypted ciphertext blocks
1236 # arg1, arg3, arg4 are used as pointers only, not modified
1237 # r11 is the data offset value
1238 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1241 vmovdqa \XMM2, TMP2(%rsp)
1242 vmovdqa \XMM3, TMP3(%rsp)
1243 vmovdqa \XMM4, TMP4(%rsp)
1244 vmovdqa \XMM5, TMP5(%rsp)
1245 vmovdqa \XMM6, TMP6(%rsp)
1246 vmovdqa \XMM7, TMP7(%rsp)
1247 vmovdqa \XMM8, TMP8(%rsp)
1249 .if \loop_idx == in_order
1250 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1251 vpaddd ONE(%rip), \XMM1, \XMM2
1252 vpaddd ONE(%rip), \XMM2, \XMM3
1253 vpaddd ONE(%rip), \XMM3, \XMM4
1254 vpaddd ONE(%rip), \XMM4, \XMM5
1255 vpaddd ONE(%rip), \XMM5, \XMM6
1256 vpaddd ONE(%rip), \XMM6, \XMM7
1257 vpaddd ONE(%rip), \XMM7, \XMM8
1260 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1261 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1262 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1263 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1264 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1265 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1266 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1267 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1269 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1270 vpaddd ONEf(%rip), \XMM1, \XMM2
1271 vpaddd ONEf(%rip), \XMM2, \XMM3
1272 vpaddd ONEf(%rip), \XMM3, \XMM4
1273 vpaddd ONEf(%rip), \XMM4, \XMM5
1274 vpaddd ONEf(%rip), \XMM5, \XMM6
1275 vpaddd ONEf(%rip), \XMM6, \XMM7
1276 vpaddd ONEf(%rip), \XMM7, \XMM8
1281 #######################################################################
1284 vpxor \T1, \XMM1, \XMM1
1285 vpxor \T1, \XMM2, \XMM2
1286 vpxor \T1, \XMM3, \XMM3
1287 vpxor \T1, \XMM4, \XMM4
1288 vpxor \T1, \XMM5, \XMM5
1289 vpxor \T1, \XMM6, \XMM6
1290 vpxor \T1, \XMM7, \XMM7
1291 vpxor \T1, \XMM8, \XMM8
1293 #######################################################################
1299 vmovdqu 16*1(arg1), \T1
1300 vaesenc \T1, \XMM1, \XMM1
1301 vaesenc \T1, \XMM2, \XMM2
1302 vaesenc \T1, \XMM3, \XMM3
1303 vaesenc \T1, \XMM4, \XMM4
1304 vaesenc \T1, \XMM5, \XMM5
1305 vaesenc \T1, \XMM6, \XMM6
1306 vaesenc \T1, \XMM7, \XMM7
1307 vaesenc \T1, \XMM8, \XMM8
1309 vmovdqu 16*2(arg1), \T1
1310 vaesenc \T1, \XMM1, \XMM1
1311 vaesenc \T1, \XMM2, \XMM2
1312 vaesenc \T1, \XMM3, \XMM3
1313 vaesenc \T1, \XMM4, \XMM4
1314 vaesenc \T1, \XMM5, \XMM5
1315 vaesenc \T1, \XMM6, \XMM6
1316 vaesenc \T1, \XMM7, \XMM7
1317 vaesenc \T1, \XMM8, \XMM8
1320 #######################################################################
1322 vmovdqu HashKey_8(arg2), \T5
1323 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1324 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1326 vpshufd $0b01001110, \T2, \T6
1329 vmovdqu HashKey_8_k(arg2), \T5
1330 vpclmulqdq $0x00, \T5, \T6, \T6
1332 vmovdqu 16*3(arg1), \T1
1333 vaesenc \T1, \XMM1, \XMM1
1334 vaesenc \T1, \XMM2, \XMM2
1335 vaesenc \T1, \XMM3, \XMM3
1336 vaesenc \T1, \XMM4, \XMM4
1337 vaesenc \T1, \XMM5, \XMM5
1338 vaesenc \T1, \XMM6, \XMM6
1339 vaesenc \T1, \XMM7, \XMM7
1340 vaesenc \T1, \XMM8, \XMM8
1342 vmovdqa TMP2(%rsp), \T1
1343 vmovdqu HashKey_7(arg2), \T5
1344 vpclmulqdq $0x11, \T5, \T1, \T3
1346 vpclmulqdq $0x00, \T5, \T1, \T3
1349 vpshufd $0b01001110, \T1, \T3
1351 vmovdqu HashKey_7_k(arg2), \T5
1352 vpclmulqdq $0x10, \T5, \T3, \T3
1355 vmovdqu 16*4(arg1), \T1
1356 vaesenc \T1, \XMM1, \XMM1
1357 vaesenc \T1, \XMM2, \XMM2
1358 vaesenc \T1, \XMM3, \XMM3
1359 vaesenc \T1, \XMM4, \XMM4
1360 vaesenc \T1, \XMM5, \XMM5
1361 vaesenc \T1, \XMM6, \XMM6
1362 vaesenc \T1, \XMM7, \XMM7
1363 vaesenc \T1, \XMM8, \XMM8
1365 #######################################################################
1367 vmovdqa TMP3(%rsp), \T1
1368 vmovdqu HashKey_6(arg2), \T5
1369 vpclmulqdq $0x11, \T5, \T1, \T3
1371 vpclmulqdq $0x00, \T5, \T1, \T3
1374 vpshufd $0b01001110, \T1, \T3
1376 vmovdqu HashKey_6_k(arg2), \T5
1377 vpclmulqdq $0x10, \T5, \T3, \T3
1380 vmovdqu 16*5(arg1), \T1
1381 vaesenc \T1, \XMM1, \XMM1
1382 vaesenc \T1, \XMM2, \XMM2
1383 vaesenc \T1, \XMM3, \XMM3
1384 vaesenc \T1, \XMM4, \XMM4
1385 vaesenc \T1, \XMM5, \XMM5
1386 vaesenc \T1, \XMM6, \XMM6
1387 vaesenc \T1, \XMM7, \XMM7
1388 vaesenc \T1, \XMM8, \XMM8
1390 vmovdqa TMP4(%rsp), \T1
1391 vmovdqu HashKey_5(arg2), \T5
1392 vpclmulqdq $0x11, \T5, \T1, \T3
1394 vpclmulqdq $0x00, \T5, \T1, \T3
1397 vpshufd $0b01001110, \T1, \T3
1399 vmovdqu HashKey_5_k(arg2), \T5
1400 vpclmulqdq $0x10, \T5, \T3, \T3
1403 vmovdqu 16*6(arg1), \T1
1404 vaesenc \T1, \XMM1, \XMM1
1405 vaesenc \T1, \XMM2, \XMM2
1406 vaesenc \T1, \XMM3, \XMM3
1407 vaesenc \T1, \XMM4, \XMM4
1408 vaesenc \T1, \XMM5, \XMM5
1409 vaesenc \T1, \XMM6, \XMM6
1410 vaesenc \T1, \XMM7, \XMM7
1411 vaesenc \T1, \XMM8, \XMM8
1414 vmovdqa TMP5(%rsp), \T1
1415 vmovdqu HashKey_4(arg2), \T5
1416 vpclmulqdq $0x11, \T5, \T1, \T3
1418 vpclmulqdq $0x00, \T5, \T1, \T3
1421 vpshufd $0b01001110, \T1, \T3
1423 vmovdqu HashKey_4_k(arg2), \T5
1424 vpclmulqdq $0x10, \T5, \T3, \T3
1427 vmovdqu 16*7(arg1), \T1
1428 vaesenc \T1, \XMM1, \XMM1
1429 vaesenc \T1, \XMM2, \XMM2
1430 vaesenc \T1, \XMM3, \XMM3
1431 vaesenc \T1, \XMM4, \XMM4
1432 vaesenc \T1, \XMM5, \XMM5
1433 vaesenc \T1, \XMM6, \XMM6
1434 vaesenc \T1, \XMM7, \XMM7
1435 vaesenc \T1, \XMM8, \XMM8
1437 vmovdqa TMP6(%rsp), \T1
1438 vmovdqu HashKey_3(arg2), \T5
1439 vpclmulqdq $0x11, \T5, \T1, \T3
1441 vpclmulqdq $0x00, \T5, \T1, \T3
1444 vpshufd $0b01001110, \T1, \T3
1446 vmovdqu HashKey_3_k(arg2), \T5
1447 vpclmulqdq $0x10, \T5, \T3, \T3
1451 vmovdqu 16*8(arg1), \T1
1452 vaesenc \T1, \XMM1, \XMM1
1453 vaesenc \T1, \XMM2, \XMM2
1454 vaesenc \T1, \XMM3, \XMM3
1455 vaesenc \T1, \XMM4, \XMM4
1456 vaesenc \T1, \XMM5, \XMM5
1457 vaesenc \T1, \XMM6, \XMM6
1458 vaesenc \T1, \XMM7, \XMM7
1459 vaesenc \T1, \XMM8, \XMM8
1461 vmovdqa TMP7(%rsp), \T1
1462 vmovdqu HashKey_2(arg2), \T5
1463 vpclmulqdq $0x11, \T5, \T1, \T3
1465 vpclmulqdq $0x00, \T5, \T1, \T3
1468 vpshufd $0b01001110, \T1, \T3
1470 vmovdqu HashKey_2_k(arg2), \T5
1471 vpclmulqdq $0x10, \T5, \T3, \T3
1474 #######################################################################
1476 vmovdqu 16*9(arg1), \T5
1477 vaesenc \T5, \XMM1, \XMM1
1478 vaesenc \T5, \XMM2, \XMM2
1479 vaesenc \T5, \XMM3, \XMM3
1480 vaesenc \T5, \XMM4, \XMM4
1481 vaesenc \T5, \XMM5, \XMM5
1482 vaesenc \T5, \XMM6, \XMM6
1483 vaesenc \T5, \XMM7, \XMM7
1484 vaesenc \T5, \XMM8, \XMM8
1486 vmovdqa TMP8(%rsp), \T1
1487 vmovdqu HashKey(arg2), \T5
1488 vpclmulqdq $0x11, \T5, \T1, \T3
1490 vpclmulqdq $0x00, \T5, \T1, \T3
1493 vpshufd $0b01001110, \T1, \T3
1495 vmovdqu HashKey_k(arg2), \T5
1496 vpclmulqdq $0x10, \T5, \T3, \T3
1502 vmovdqu 16*10(arg1), \T5
1508 vaesenc \T5, \XMM1, \XMM1
1509 vaesenc \T5, \XMM2, \XMM2
1510 vaesenc \T5, \XMM3, \XMM3
1511 vaesenc \T5, \XMM4, \XMM4
1512 vaesenc \T5, \XMM5, \XMM5
1513 vaesenc \T5, \XMM6, \XMM6
1514 vaesenc \T5, \XMM7, \XMM7
1515 vaesenc \T5, \XMM8, \XMM8
1517 vmovdqu 16*i(arg1), \T5
1526 vpxor 16*i(arg4, %r11), \T5, \T2
1528 vaesenclast \T2, reg_j, reg_j
1530 vaesenclast \T2, reg_j, \T3
1531 vmovdqu 16*i(arg4, %r11), reg_j
1532 vmovdqu \T3, 16*i(arg3, %r11)
1538 #######################################################################
1541 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1544 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1548 #######################################################################
1549 #first phase of the reduction
1550 #######################################################################
vpslld $31, \T7, \T2 # packed left shifting << 31
vpslld $30, \T7, \T3 # packed left shifting << 30
vpslld $25, \T7, \T4 # packed left shifting << 25
1555 vpxor \T3, \T2, \T2 # xor the shifted versions
1558 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1560 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1561 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1562 #######################################################################
1564 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1565 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1566 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1567 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1568 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1569 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1570 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1571 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1574 #######################################################################
1575 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shifting >> 1
vpsrld $2, \T7, \T3 # packed right shifting >> 2
vpsrld $7, \T7, \T4 # packed right shifting >> 7
1579 vpxor \T3, \T2, \T2 # xor the shifted versions
1584 vpxor \T7, \T6, \T6 # the result is in T6
1585 #######################################################################
1587 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1588 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1589 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1590 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1591 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1592 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1593 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1594 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1597 vpxor \T6, \XMM1, \XMM1
# GHASH the last 8 ciphertext blocks.
1605 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1610 vpshufd $0b01001110, \XMM1, \T2
1611 vpxor \XMM1, \T2, \T2
1612 vmovdqu HashKey_8(arg2), \T5
1613 vpclmulqdq $0x11, \T5, \XMM1, \T6
1614 vpclmulqdq $0x00, \T5, \XMM1, \T7
1616 vmovdqu HashKey_8_k(arg2), \T3
1617 vpclmulqdq $0x00, \T3, \T2, \XMM1
1619 ######################
1621 vpshufd $0b01001110, \XMM2, \T2
1622 vpxor \XMM2, \T2, \T2
1623 vmovdqu HashKey_7(arg2), \T5
1624 vpclmulqdq $0x11, \T5, \XMM2, \T4
1627 vpclmulqdq $0x00, \T5, \XMM2, \T4
1630 vmovdqu HashKey_7_k(arg2), \T3
1631 vpclmulqdq $0x00, \T3, \T2, \T2
1632 vpxor \T2, \XMM1, \XMM1
1634 ######################
1636 vpshufd $0b01001110, \XMM3, \T2
1637 vpxor \XMM3, \T2, \T2
1638 vmovdqu HashKey_6(arg2), \T5
1639 vpclmulqdq $0x11, \T5, \XMM3, \T4
1642 vpclmulqdq $0x00, \T5, \XMM3, \T4
1645 vmovdqu HashKey_6_k(arg2), \T3
1646 vpclmulqdq $0x00, \T3, \T2, \T2
1647 vpxor \T2, \XMM1, \XMM1
1649 ######################
1651 vpshufd $0b01001110, \XMM4, \T2
1652 vpxor \XMM4, \T2, \T2
1653 vmovdqu HashKey_5(arg2), \T5
1654 vpclmulqdq $0x11, \T5, \XMM4, \T4
1657 vpclmulqdq $0x00, \T5, \XMM4, \T4
1660 vmovdqu HashKey_5_k(arg2), \T3
1661 vpclmulqdq $0x00, \T3, \T2, \T2
1662 vpxor \T2, \XMM1, \XMM1
1664 ######################
1666 vpshufd $0b01001110, \XMM5, \T2
1667 vpxor \XMM5, \T2, \T2
1668 vmovdqu HashKey_4(arg2), \T5
1669 vpclmulqdq $0x11, \T5, \XMM5, \T4
1672 vpclmulqdq $0x00, \T5, \XMM5, \T4
1675 vmovdqu HashKey_4_k(arg2), \T3
1676 vpclmulqdq $0x00, \T3, \T2, \T2
1677 vpxor \T2, \XMM1, \XMM1
1679 ######################
1681 vpshufd $0b01001110, \XMM6, \T2
1682 vpxor \XMM6, \T2, \T2
1683 vmovdqu HashKey_3(arg2), \T5
1684 vpclmulqdq $0x11, \T5, \XMM6, \T4
1687 vpclmulqdq $0x00, \T5, \XMM6, \T4
1690 vmovdqu HashKey_3_k(arg2), \T3
1691 vpclmulqdq $0x00, \T3, \T2, \T2
1692 vpxor \T2, \XMM1, \XMM1
1694 ######################
1696 vpshufd $0b01001110, \XMM7, \T2
1697 vpxor \XMM7, \T2, \T2
1698 vmovdqu HashKey_2(arg2), \T5
1699 vpclmulqdq $0x11, \T5, \XMM7, \T4
1702 vpclmulqdq $0x00, \T5, \XMM7, \T4
1705 vmovdqu HashKey_2_k(arg2), \T3
1706 vpclmulqdq $0x00, \T3, \T2, \T2
1707 vpxor \T2, \XMM1, \XMM1
1709 ######################
1711 vpshufd $0b01001110, \XMM8, \T2
1712 vpxor \XMM8, \T2, \T2
1713 vmovdqu HashKey(arg2), \T5
1714 vpclmulqdq $0x11, \T5, \XMM8, \T4
1717 vpclmulqdq $0x00, \T5, \XMM8, \T4
1720 vmovdqu HashKey_k(arg2), \T3
1721 vpclmulqdq $0x00, \T3, \T2, \T2
1723 vpxor \T2, \XMM1, \XMM1
1724 vpxor \T6, \XMM1, \XMM1
1725 vpxor \T7, \XMM1, \T2
1730 vpslldq $8, \T2, \T4
1731 vpsrldq $8, \T2, \T2
1734 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1735 # the accumulated carry-less multiplications
1737 #######################################################################
1738 #first phase of the reduction
vpslld $31, \T7, \T2 # packed left shifting << 31
vpslld $30, \T7, \T3 # packed left shifting << 30
vpslld $25, \T7, \T4 # packed left shifting << 25
1743 vpxor \T3, \T2, \T2 # xor the shifted versions
1746 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1748 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1749 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1750 #######################################################################
1753 #second phase of the reduction
vpsrld $1, \T7, \T2 # packed right shifting >> 1
vpsrld $2, \T7, \T3 # packed right shifting >> 2
vpsrld $7, \T7, \T4 # packed right shifting >> 7
1757 vpxor \T3, \T2, \T2 # xor the shifted versions
1762 vpxor \T7, \T6, \T6 # the result is in T6
1766 #############################################################
#void aesni_gcm_init_avx_gen2
1768 # (gcm_data *my_ctx_data,
1769 # gcm_context_data *data,
# u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
1771 # u8 *iv, /* Pre-counter block j0: 4 byte salt
1772 # (from Security Association) concatenated with 8 byte
1773 # Initialisation Vector (from IPSec ESP Payload)
1774 # concatenated with 0x00000001. 16-byte aligned pointer. */
1775 # const u8 *aad, /* Additional Authentication Data (AAD)*/
1776 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1777 #############################################################
1778 SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1780 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1783 SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1785 ###############################################################################
1786 #void aesni_gcm_enc_update_avx_gen2(
1787 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1788 # gcm_context_data *data,
1789 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1790 # const u8 *in, /* Plaintext input */
1791 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
1792 ###############################################################################
1793 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1797 je key_256_enc_update
1799 je key_128_enc_update
1801 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1805 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1809 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1812 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
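###############################################################################
# The three GCM_ENC_DEC expansions above differ only in the AES round count
# passed as \REP: 9 for AES-128, 11 for AES-192 and 13 for AES-256, chosen by
# comparing the key length stored in the expanded-key structure (the keysize
# define above). The decrypt path below follows the same pattern.
###############################################################################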
1814 ###############################################################################
1815 #void aesni_gcm_dec_update_avx_gen2(
1816 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1817 # gcm_context_data *data,
1818 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1819 # const u8 *in, /* Ciphertext input */
# u64 plaintext_len) /* Length of data in Bytes for decryption. */
1821 ###############################################################################
1822 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1826 je key_256_dec_update
1828 je key_128_dec_update
1830 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1834 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1838 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1841 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1843 ###############################################################################
1844 #void aesni_gcm_finalize_avx_gen2(
1845 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1846 # gcm_context_data *data,
1847 # u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
1849 # Valid values are 16 (most likely), 12 or 8. */
1850 ###############################################################################
1851 SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1859 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1863 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1867 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1870 SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
1872 #endif /* CONFIG_AS_AVX */
1874 #ifdef CONFIG_AS_AVX2
1875 ###############################################################################
1876 # GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1877 # Input: A and B (128-bits each, bit-reflected)
1878 # Output: C = A*B*x mod poly, (i.e. >>1 )
1879 # To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1880 # GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1881 ###############################################################################
1882 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1884 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1885 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1886 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1887 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1891 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1892 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1897 #######################################################################
1898 #first phase of the reduction
1899 vmovdqa POLY2(%rip), \T3
1901 vpclmulqdq $0x01, \GH, \T3, \T2
1902 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1904 vpxor \T2, \GH, \GH # first phase of the reduction complete
1905 #######################################################################
1906 #second phase of the reduction
1907 vpclmulqdq $0x00, \GH, \T3, \T2
1908 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1910 vpclmulqdq $0x10, \GH, \T3, \GH
1911 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1913 vpxor \T2, \GH, \GH # second phase of the reduction complete
1914 #######################################################################
1915 vpxor \T1, \GH, \GH # the result is in GH
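	#######################################################################
	# Note: unlike GHASH_MUL_AVX above, this variant computes all four
	# schoolbook products (a1*b1, a0*b0, a1*b0, a0*b1) directly and performs
	# both reduction phases with vpclmulqdq against POLY2 instead of
	# shift/xor sequences; the reduced result is the same.
	#######################################################################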
1920 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
# precompute HashKey^2 through HashKey^8, all <<1 mod poly
1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1925 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1928 vmovdqu \T5, HashKey_3(arg2)
1930 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1931 vmovdqu \T5, HashKey_4(arg2)
1933 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1934 vmovdqu \T5, HashKey_5(arg2)
1936 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1937 vmovdqu \T5, HashKey_6(arg2)
1939 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1940 vmovdqu \T5, HashKey_7(arg2)
1942 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1943 vmovdqu \T5, HashKey_8(arg2)
## if a = number of total plaintext bytes, then
## num_initial_blocks = floor(a/16) mod 8
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
1954 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1955 i = (8-\num_initial_blocks)
1957 vmovdqu AadHash(arg2), reg_i
1959 # start AES for num_initial_blocks blocks
1960 vmovdqu CurCount(arg2), \CTR
1962 i = (9-\num_initial_blocks)
1964 .rep \num_initial_blocks
1965 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1967 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1972 vmovdqa (arg1), \T_key
1973 i = (9-\num_initial_blocks)
1975 .rep \num_initial_blocks
1976 vpxor \T_key, reg_i, reg_i
1984 vmovdqa 16*j(arg1), \T_key
1985 i = (9-\num_initial_blocks)
1987 .rep \num_initial_blocks
1988 vaesenc \T_key, reg_i, reg_i
1998 vmovdqa 16*j(arg1), \T_key
1999 i = (9-\num_initial_blocks)
2001 .rep \num_initial_blocks
2002 vaesenclast \T_key, reg_i, reg_i
2007 i = (9-\num_initial_blocks)
2009 .rep \num_initial_blocks
2010 vmovdqu (arg4, %r11), \T1
2011 vpxor \T1, reg_i, reg_i
2012 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
2013 # num_initial_blocks blocks
2018 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2024 i = (8-\num_initial_blocks)
2025 j = (9-\num_initial_blocks)
2028 .rep \num_initial_blocks
2029 vpxor reg_i, reg_j, reg_j
2030 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2035 # XMM8 has the combined result here
2037 vmovdqa \XMM8, TMP1(%rsp)
2041 jl _initial_blocks_done\@ # no need for precomputed constants
2043 ###############################################################################
# at least 8 full blocks remain: encrypt the first stretch of 8 blocks now;
# their GHASH is deferred and folded in by the main loop
2045 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2047 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2049 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2051 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2053 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2055 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2057 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2059 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2061 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2063 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2065 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2067 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2069 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2071 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2073 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2075 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2077 vmovdqa (arg1), \T_key
2078 vpxor \T_key, \XMM1, \XMM1
2079 vpxor \T_key, \XMM2, \XMM2
2080 vpxor \T_key, \XMM3, \XMM3
2081 vpxor \T_key, \XMM4, \XMM4
2082 vpxor \T_key, \XMM5, \XMM5
2083 vpxor \T_key, \XMM6, \XMM6
2084 vpxor \T_key, \XMM7, \XMM7
2085 vpxor \T_key, \XMM8, \XMM8
2089 .rep \REP # do REP rounds
2090 vmovdqa 16*i(arg1), \T_key
2091 vaesenc \T_key, \XMM1, \XMM1
2092 vaesenc \T_key, \XMM2, \XMM2
2093 vaesenc \T_key, \XMM3, \XMM3
2094 vaesenc \T_key, \XMM4, \XMM4
2095 vaesenc \T_key, \XMM5, \XMM5
2096 vaesenc \T_key, \XMM6, \XMM6
2097 vaesenc \T_key, \XMM7, \XMM7
2098 vaesenc \T_key, \XMM8, \XMM8
2104 vmovdqa 16*i(arg1), \T_key
2105 vaesenclast \T_key, \XMM1, \XMM1
2106 vaesenclast \T_key, \XMM2, \XMM2
2107 vaesenclast \T_key, \XMM3, \XMM3
2108 vaesenclast \T_key, \XMM4, \XMM4
2109 vaesenclast \T_key, \XMM5, \XMM5
2110 vaesenclast \T_key, \XMM6, \XMM6
2111 vaesenclast \T_key, \XMM7, \XMM7
2112 vaesenclast \T_key, \XMM8, \XMM8
2114 vmovdqu (arg4, %r11), \T1
2115 vpxor \T1, \XMM1, \XMM1
2116 vmovdqu \XMM1, (arg3 , %r11)
2121 vmovdqu 16*1(arg4, %r11), \T1
2122 vpxor \T1, \XMM2, \XMM2
2123 vmovdqu \XMM2, 16*1(arg3 , %r11)
2128 vmovdqu 16*2(arg4, %r11), \T1
2129 vpxor \T1, \XMM3, \XMM3
2130 vmovdqu \XMM3, 16*2(arg3 , %r11)
2135 vmovdqu 16*3(arg4, %r11), \T1
2136 vpxor \T1, \XMM4, \XMM4
2137 vmovdqu \XMM4, 16*3(arg3 , %r11)
2142 vmovdqu 16*4(arg4, %r11), \T1
2143 vpxor \T1, \XMM5, \XMM5
2144 vmovdqu \XMM5, 16*4(arg3 , %r11)
2149 vmovdqu 16*5(arg4, %r11), \T1
2150 vpxor \T1, \XMM6, \XMM6
2151 vmovdqu \XMM6, 16*5(arg3 , %r11)
2156 vmovdqu 16*6(arg4, %r11), \T1
2157 vpxor \T1, \XMM7, \XMM7
2158 vmovdqu \XMM7, 16*6(arg3 , %r11)
2163 vmovdqu 16*7(arg4, %r11), \T1
2164 vpxor \T1, \XMM8, \XMM8
2165 vmovdqu \XMM8, 16*7(arg3 , %r11)
2172 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2173 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2174 # the corresponding ciphertext
2175 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2176 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2177 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2178 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2179 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2180 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2181 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2183 ###############################################################################
2185 _initial_blocks_done\@:
2192 # encrypt 8 blocks at a time
2193 # ghash the 8 previously encrypted ciphertext blocks
2194 # arg1, arg3, arg4 are used as pointers only, not modified
2195 # r11 is the data offset value
2196 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2199 vmovdqa \XMM2, TMP2(%rsp)
2200 vmovdqa \XMM3, TMP3(%rsp)
2201 vmovdqa \XMM4, TMP4(%rsp)
2202 vmovdqa \XMM5, TMP5(%rsp)
2203 vmovdqa \XMM6, TMP6(%rsp)
2204 vmovdqa \XMM7, TMP7(%rsp)
2205 vmovdqa \XMM8, TMP8(%rsp)
2207 .if \loop_idx == in_order
2208 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2209 vpaddd ONE(%rip), \XMM1, \XMM2
2210 vpaddd ONE(%rip), \XMM2, \XMM3
2211 vpaddd ONE(%rip), \XMM3, \XMM4
2212 vpaddd ONE(%rip), \XMM4, \XMM5
2213 vpaddd ONE(%rip), \XMM5, \XMM6
2214 vpaddd ONE(%rip), \XMM6, \XMM7
2215 vpaddd ONE(%rip), \XMM7, \XMM8
2218 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2219 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2220 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2221 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2222 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2223 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2224 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2225 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2227 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2228 vpaddd ONEf(%rip), \XMM1, \XMM2
2229 vpaddd ONEf(%rip), \XMM2, \XMM3
2230 vpaddd ONEf(%rip), \XMM3, \XMM4
2231 vpaddd ONEf(%rip), \XMM4, \XMM5
2232 vpaddd ONEf(%rip), \XMM5, \XMM6
2233 vpaddd ONEf(%rip), \XMM6, \XMM7
2234 vpaddd ONEf(%rip), \XMM7, \XMM8
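#######################################################################
# The two branches above implement the same counter increment in two
# layouts: with loop_idx == in_order the counter is kept byte-reflected,
# so the little-endian vpaddd ONE(%rip) increments it and the vpshufb
# restores wire (big-endian) order before encryption; the ONEf path
# performs the equivalent increment on counter blocks that are already
# in wire order, so no shuffle is needed afterwards.  Illustrative C
# model of the 32-bit big-endian increment both paths realize (sketch,
# not part of this file):
#
#   #include <stdint.h>
#
#   /* Increment the 32-bit big-endian counter held in bytes 12..15 of
#    * a 16-byte GCM counter block (inc32 from NIST SP 800-38D).      */
#   static void gcm_inc32(uint8_t ctr[16])
#   {
#           uint32_t c = ((uint32_t)ctr[12] << 24) | ((uint32_t)ctr[13] << 16) |
#                        ((uint32_t)ctr[14] << 8)  |  (uint32_t)ctr[15];
#           c++;
#           ctr[12] = c >> 24;
#           ctr[13] = c >> 16;
#           ctr[14] = c >> 8;
#           ctr[15] = c;
#   }
#######################################################################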
2239 #######################################################################
2242 vpxor \T1, \XMM1, \XMM1
2243 vpxor \T1, \XMM2, \XMM2
2244 vpxor \T1, \XMM3, \XMM3
2245 vpxor \T1, \XMM4, \XMM4
2246 vpxor \T1, \XMM5, \XMM5
2247 vpxor \T1, \XMM6, \XMM6
2248 vpxor \T1, \XMM7, \XMM7
2249 vpxor \T1, \XMM8, \XMM8
2251 #######################################################################
2257 vmovdqu 16*1(arg1), \T1
2258 vaesenc \T1, \XMM1, \XMM1
2259 vaesenc \T1, \XMM2, \XMM2
2260 vaesenc \T1, \XMM3, \XMM3
2261 vaesenc \T1, \XMM4, \XMM4
2262 vaesenc \T1, \XMM5, \XMM5
2263 vaesenc \T1, \XMM6, \XMM6
2264 vaesenc \T1, \XMM7, \XMM7
2265 vaesenc \T1, \XMM8, \XMM8
2267 vmovdqu 16*2(arg1), \T1
2268 vaesenc \T1, \XMM1, \XMM1
2269 vaesenc \T1, \XMM2, \XMM2
2270 vaesenc \T1, \XMM3, \XMM3
2271 vaesenc \T1, \XMM4, \XMM4
2272 vaesenc \T1, \XMM5, \XMM5
2273 vaesenc \T1, \XMM6, \XMM6
2274 vaesenc \T1, \XMM7, \XMM7
2275 vaesenc \T1, \XMM8, \XMM8
2278 #######################################################################
2280 vmovdqu HashKey_8(arg2), \T5
2281 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2282 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2283 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2284 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
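#######################################################################
# The four vpclmulqdq above form the schoolbook 128x128 carry-less
# product of the saved GHASH input in \T2 and HashKey_8: with
# A = a1:a0 and B = b1:b0 (64-bit halves), the 256-bit result is
#   a1*b1 << 128  ^  (a1*b0 ^ a0*b1) << 64  ^  a0*b0,
# matching the imm8 selectors 0x11, 0x00, 0x01 and 0x10.  Portable C
# model of one 64x64 carry-less multiply lane (illustrative only):
#
#   #include <stdint.h>
#
#   /* hi:lo = a (*) b over GF(2), i.e. multiplication with XOR as add */
#   static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
#   {
#           uint64_t h = 0, l = 0;
#           int i;
#
#           for (i = 0; i < 64; i++) {
#                   if ((b >> i) & 1) {
#                           l ^= a << i;
#                           if (i)
#                                   h ^= a >> (64 - i);
#                   }
#           }
#           *hi = h;
#           *lo = l;
#   }
#######################################################################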
2287 vmovdqu 16*3(arg1), \T1
2288 vaesenc \T1, \XMM1, \XMM1
2289 vaesenc \T1, \XMM2, \XMM2
2290 vaesenc \T1, \XMM3, \XMM3
2291 vaesenc \T1, \XMM4, \XMM4
2292 vaesenc \T1, \XMM5, \XMM5
2293 vaesenc \T1, \XMM6, \XMM6
2294 vaesenc \T1, \XMM7, \XMM7
2295 vaesenc \T1, \XMM8, \XMM8
2297 vmovdqa TMP2(%rsp), \T1
2298 vmovdqu HashKey_7(arg2), \T5
2299 vpclmulqdq $0x11, \T5, \T1, \T3
2302 vpclmulqdq $0x00, \T5, \T1, \T3
2305 vpclmulqdq $0x01, \T5, \T1, \T3
2308 vpclmulqdq $0x10, \T5, \T1, \T3
2311 vmovdqu 16*4(arg1), \T1
2312 vaesenc \T1, \XMM1, \XMM1
2313 vaesenc \T1, \XMM2, \XMM2
2314 vaesenc \T1, \XMM3, \XMM3
2315 vaesenc \T1, \XMM4, \XMM4
2316 vaesenc \T1, \XMM5, \XMM5
2317 vaesenc \T1, \XMM6, \XMM6
2318 vaesenc \T1, \XMM7, \XMM7
2319 vaesenc \T1, \XMM8, \XMM8
2321 #######################################################################
2323 vmovdqa TMP3(%rsp), \T1
2324 vmovdqu HashKey_6(arg2), \T5
2325 vpclmulqdq $0x11, \T5, \T1, \T3
2328 vpclmulqdq $0x00, \T5, \T1, \T3
2331 vpclmulqdq $0x01, \T5, \T1, \T3
2334 vpclmulqdq $0x10, \T5, \T1, \T3
2337 vmovdqu 16*5(arg1), \T1
2338 vaesenc \T1, \XMM1, \XMM1
2339 vaesenc \T1, \XMM2, \XMM2
2340 vaesenc \T1, \XMM3, \XMM3
2341 vaesenc \T1, \XMM4, \XMM4
2342 vaesenc \T1, \XMM5, \XMM5
2343 vaesenc \T1, \XMM6, \XMM6
2344 vaesenc \T1, \XMM7, \XMM7
2345 vaesenc \T1, \XMM8, \XMM8
2347 vmovdqa TMP4(%rsp), \T1
2348 vmovdqu HashKey_5(arg2), \T5
2349 vpclmulqdq $0x11, \T5, \T1, \T3
2352 vpclmulqdq $0x00, \T5, \T1, \T3
2355 vpclmulqdq $0x01, \T5, \T1, \T3
2358 vpclmulqdq $0x10, \T5, \T1, \T3
2361 vmovdqu 16*6(arg1), \T1
2362 vaesenc \T1, \XMM1, \XMM1
2363 vaesenc \T1, \XMM2, \XMM2
2364 vaesenc \T1, \XMM3, \XMM3
2365 vaesenc \T1, \XMM4, \XMM4
2366 vaesenc \T1, \XMM5, \XMM5
2367 vaesenc \T1, \XMM6, \XMM6
2368 vaesenc \T1, \XMM7, \XMM7
2369 vaesenc \T1, \XMM8, \XMM8
2372 vmovdqa TMP5(%rsp), \T1
2373 vmovdqu HashKey_4(arg2), \T5
2374 vpclmulqdq $0x11, \T5, \T1, \T3
2377 vpclmulqdq $0x00, \T5, \T1, \T3
2380 vpclmulqdq $0x01, \T5, \T1, \T3
2383 vpclmulqdq $0x10, \T5, \T1, \T3
2386 vmovdqu 16*7(arg1), \T1
2387 vaesenc \T1, \XMM1, \XMM1
2388 vaesenc \T1, \XMM2, \XMM2
2389 vaesenc \T1, \XMM3, \XMM3
2390 vaesenc \T1, \XMM4, \XMM4
2391 vaesenc \T1, \XMM5, \XMM5
2392 vaesenc \T1, \XMM6, \XMM6
2393 vaesenc \T1, \XMM7, \XMM7
2394 vaesenc \T1, \XMM8, \XMM8
2396 vmovdqa TMP6(%rsp), \T1
2397 vmovdqu HashKey_3(arg2), \T5
2398 vpclmulqdq $0x11, \T5, \T1, \T3
2401 vpclmulqdq $0x00, \T5, \T1, \T3
2404 vpclmulqdq $0x01, \T5, \T1, \T3
2407 vpclmulqdq $0x10, \T5, \T1, \T3
2410 vmovdqu 16*8(arg1), \T1
2411 vaesenc \T1, \XMM1, \XMM1
2412 vaesenc \T1, \XMM2, \XMM2
2413 vaesenc \T1, \XMM3, \XMM3
2414 vaesenc \T1, \XMM4, \XMM4
2415 vaesenc \T1, \XMM5, \XMM5
2416 vaesenc \T1, \XMM6, \XMM6
2417 vaesenc \T1, \XMM7, \XMM7
2418 vaesenc \T1, \XMM8, \XMM8
2420 vmovdqa TMP7(%rsp), \T1
2421 vmovdqu HashKey_2(arg2), \T5
2422 vpclmulqdq $0x11, \T5, \T1, \T3
2425 vpclmulqdq $0x00, \T5, \T1, \T3
2428 vpclmulqdq $0x01, \T5, \T1, \T3
2431 vpclmulqdq $0x10, \T5, \T1, \T3
2435 #######################################################################
2437 vmovdqu 16*9(arg1), \T5
2438 vaesenc \T5, \XMM1, \XMM1
2439 vaesenc \T5, \XMM2, \XMM2
2440 vaesenc \T5, \XMM3, \XMM3
2441 vaesenc \T5, \XMM4, \XMM4
2442 vaesenc \T5, \XMM5, \XMM5
2443 vaesenc \T5, \XMM6, \XMM6
2444 vaesenc \T5, \XMM7, \XMM7
2445 vaesenc \T5, \XMM8, \XMM8
2447 vmovdqa TMP8(%rsp), \T1
2448 vmovdqu HashKey(arg2), \T5
2450 vpclmulqdq $0x00, \T5, \T1, \T3
2453 vpclmulqdq $0x01, \T5, \T1, \T3
2456 vpclmulqdq $0x10, \T5, \T1, \T3
2459 vpclmulqdq $0x11, \T5, \T1, \T3
2463 vmovdqu 16*10(arg1), \T5
2468 vaesenc \T5, \XMM1, \XMM1
2469 vaesenc \T5, \XMM2, \XMM2
2470 vaesenc \T5, \XMM3, \XMM3
2471 vaesenc \T5, \XMM4, \XMM4
2472 vaesenc \T5, \XMM5, \XMM5
2473 vaesenc \T5, \XMM6, \XMM6
2474 vaesenc \T5, \XMM7, \XMM7
2475 vaesenc \T5, \XMM8, \XMM8
2477 vmovdqu 16*i(arg1), \T5
2486 vpxor 16*i(arg4, %r11), \T5, \T2
2488 vaesenclast \T2, reg_j, reg_j
2490 vaesenclast \T2, reg_j, \T3
2491 vmovdqu 16*i(arg4, %r11), reg_j
2492 vmovdqu \T3, 16*i(arg3, %r11)
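#######################################################################
# The vpxor/vaesenclast pair above fuses the final CTR-mode XOR into
# the last AES round: AESENCLAST ends by XORing the round key into the
# state, so XORing the input block into that key yields the output
# block in a single instruction.  Intrinsics sketch of the identity
# (illustrative; build with AES-NI enabled, e.g. -maes):
#
#   #include <immintrin.h>
#
#   /* aesenclast(s, rk ^ blk) == aesenclast(s, rk) ^ blk */
#   static __m128i ctr_last_round(__m128i state, __m128i last_rk, __m128i blk)
#   {
#           return _mm_aesenclast_si128(state, _mm_xor_si128(last_rk, blk));
#   }
#######################################################################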
2498 #######################################################################
2501 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
2502 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2504 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2508 #######################################################################
2509 #first phase of the reduction
2510 vmovdqa POLY2(%rip), \T3
2512 vpclmulqdq $0x01, \T7, \T3, \T2
2513 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
2515 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2516 #######################################################################
2518 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2519 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2520 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2521 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2522 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2523 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2524 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2525 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
2528 #######################################################################
2529 #second phase of the reduction
2530 vpclmulqdq $0x00, \T7, \T3, \T2
2531 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2533 vpclmulqdq $0x10, \T7, \T3, \T4
2534 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2536 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2537 #######################################################################
2538 vpxor \T4, \T1, \T1 # the result is in T1
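#######################################################################
# The two POLY2 phases above (interleaved with the ciphertext stores to
# hide latency) fold the 256-bit carry-less product back into 128 bits
# modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1.  Bit-serial
# reference of the complete GF(2^128) multiply (NIST SP 800-38D,
# Algorithm 1) that the clmul plus reduction pair implements; sketch
# only, blocks are in wire byte order:
#
#   #include <stdint.h>
#   #include <string.h>
#
#   static void gf128_mul(const uint8_t x[16], const uint8_t y[16],
#                         uint8_t out[16])
#   {
#           uint8_t z[16] = { 0 }, v[16];
#           int i, j;
#
#           memcpy(v, y, 16);
#           for (i = 0; i < 128; i++) {
#                   if ((x[i / 8] >> (7 - (i % 8))) & 1)
#                           for (j = 0; j < 16; j++)
#                                   z[j] ^= v[j];
#                   /* v = (v >> 1) ^ (lsb ? R : 0), with R = 0xe1 || 0^120 */
#                   int lsb = v[15] & 1;
#                   for (j = 15; j > 0; j--)
#                           v[j] = (v[j] >> 1) | (v[j - 1] << 7);
#                   v[0] >>= 1;
#                   if (lsb)
#                           v[0] ^= 0xe1;
#           }
#           memcpy(out, z, 16);
#   }
#######################################################################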
2540 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2541 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2542 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2543 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2544 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2545 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2546 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2547 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2550 vpxor \T1, \XMM1, \XMM1
2557 # GHASH the last 8 ciphertext blocks.
2558 .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2562 vmovdqu HashKey_8(arg2), \T5
2564 vpshufd $0b01001110, \XMM1, \T2
2565 vpshufd $0b01001110, \T5, \T3
2566 vpxor \XMM1, \T2, \T2
2569 vpclmulqdq $0x11, \T5, \XMM1, \T6
2570 vpclmulqdq $0x00, \T5, \XMM1, \T7
2572 vpclmulqdq $0x00, \T3, \T2, \XMM1
2574 ######################
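#######################################################################
# Unlike the main loop, this macro uses the Karatsuba trick: the
# vpshufd/vpxor prelude forms a1^a0 and b1^b0, so each block needs only
# three carry-less multiplies (a1*b1, a0*b0, (a1^a0)*(b1^b0)); the
# middle 128-bit term is recovered once at the end.  C sketch of the
# combine step, assuming the three products were formed with a helper
# such as the clmul64() sketched above (illustrative only):
#
#   #include <stdint.h>
#
#   /* hh = a1(*)b1, ll = a0(*)b0, mm = (a1^a0)(*)(b1^b0); each {lo,hi}.
#    * 256-bit product = hh<<128 ^ (mm^hh^ll)<<64 ^ ll                 */
#   static void karatsuba_combine(const uint64_t hh[2], const uint64_t ll[2],
#                                 const uint64_t mm[2], uint64_t out[4])
#   {
#           uint64_t mid0 = mm[0] ^ hh[0] ^ ll[0];
#           uint64_t mid1 = mm[1] ^ hh[1] ^ ll[1];
#
#           out[0] = ll[0];
#           out[1] = ll[1] ^ mid0;
#           out[2] = hh[0] ^ mid1;
#           out[3] = hh[1];
#   }
#######################################################################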
2576 vmovdqu HashKey_7(arg2), \T5
2577 vpshufd $0b01001110, \XMM2, \T2
2578 vpshufd $0b01001110, \T5, \T3
2579 vpxor \XMM2, \T2, \T2
2582 vpclmulqdq $0x11, \T5, \XMM2, \T4
2585 vpclmulqdq $0x00, \T5, \XMM2, \T4
2588 vpclmulqdq $0x00, \T3, \T2, \T2
2590 vpxor \T2, \XMM1, \XMM1
2592 ######################
2594 vmovdqu HashKey_6(arg2), \T5
2595 vpshufd $0b01001110, \XMM3, \T2
2596 vpshufd $0b01001110, \T5, \T3
2597 vpxor \XMM3, \T2, \T2
2600 vpclmulqdq $0x11, \T5, \XMM3, \T4
2603 vpclmulqdq $0x00, \T5, \XMM3, \T4
2606 vpclmulqdq $0x00, \T3, \T2, \T2
2608 vpxor \T2, \XMM1, \XMM1
2610 ######################
2612 vmovdqu HashKey_5(arg2), \T5
2613 vpshufd $0b01001110, \XMM4, \T2
2614 vpshufd $0b01001110, \T5, \T3
2615 vpxor \XMM4, \T2, \T2
2618 vpclmulqdq $0x11, \T5, \XMM4, \T4
2621 vpclmulqdq $0x00, \T5, \XMM4, \T4
2624 vpclmulqdq $0x00, \T3, \T2, \T2
2626 vpxor \T2, \XMM1, \XMM1
2628 ######################
2630 vmovdqu HashKey_4(arg2), \T5
2631 vpshufd $0b01001110, \XMM5, \T2
2632 vpshufd $0b01001110, \T5, \T3
2633 vpxor \XMM5, \T2, \T2
2636 vpclmulqdq $0x11, \T5, \XMM5, \T4
2639 vpclmulqdq $0x00, \T5, \XMM5, \T4
2642 vpclmulqdq $0x00, \T3, \T2, \T2
2644 vpxor \T2, \XMM1, \XMM1
2646 ######################
2648 vmovdqu HashKey_3(arg2), \T5
2649 vpshufd $0b01001110, \XMM6, \T2
2650 vpshufd $0b01001110, \T5, \T3
2651 vpxor \XMM6, \T2, \T2
2654 vpclmulqdq $0x11, \T5, \XMM6, \T4
2657 vpclmulqdq $0x00, \T5, \XMM6, \T4
2660 vpclmulqdq $0x00, \T3, \T2, \T2
2662 vpxor \T2, \XMM1, \XMM1
2664 ######################
2666 vmovdqu HashKey_2(arg2), \T5
2667 vpshufd $0b01001110, \XMM7, \T2
2668 vpshufd $0b01001110, \T5, \T3
2669 vpxor \XMM7, \T2, \T2
2672 vpclmulqdq $0x11, \T5, \XMM7, \T4
2675 vpclmulqdq $0x00, \T5, \XMM7, \T4
2678 vpclmulqdq $0x00, \T3, \T2, \T2
2680 vpxor \T2, \XMM1, \XMM1
2682 ######################
2684 vmovdqu HashKey(arg2), \T5
2685 vpshufd $0b01001110, \XMM8, \T2
2686 vpshufd $0b01001110, \T5, \T3
2687 vpxor \XMM8, \T2, \T2
2690 vpclmulqdq $0x11, \T5, \XMM8, \T4
2693 vpclmulqdq $0x00, \T5, \XMM8, \T4
2696 vpclmulqdq $0x00, \T3, \T2, \T2
2698 vpxor \T2, \XMM1, \XMM1
2699 vpxor \T6, \XMM1, \XMM1
2700 vpxor \T7, \XMM1, \T2
2705 vpslldq $8, \T2, \T4
2706 vpsrldq $8, \T2, \T2
2709 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2710 # accumulated carry-less multiplications
2712 #######################################################################
2713 #first phase of the reduction
2714 vmovdqa POLY2(%rip), \T3
2716 vpclmulqdq $0x01, \T7, \T3, \T2
2717 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2719 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2720 #######################################################################
2723 #second phase of the reduction
2724 vpclmulqdq $0x00, \T7, \T3, \T2
2725 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2727 vpclmulqdq $0x10, \T7, \T3, \T4
2728 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2730 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2731 #######################################################################
2732 vpxor \T4, \T6, \T6 # the result is in T6
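#######################################################################
# The eight per-block products accumulated above compute
# XMM1*H^8 ^ XMM2*H^7 ^ ... ^ XMM8*H with a single reduction at the
# end, which is equivalent to eight sequential Horner steps
# Y = (Y ^ X_i)*H (the running hash was already folded into XMM1).
# Reference sketch using a gf128_mul() like the one above (sketch,
# not part of this file):
#
#   #include <stdint.h>
#   #include <string.h>
#
#   void gf128_mul(const uint8_t x[16], const uint8_t y[16], uint8_t out[16]);
#
#   /* h_pow[i] holds H^(8-i), i.e. H^8 .. H^1; x[0] already has the
#    * previous hash value XORed in.                                  */
#   static void ghash_8_blocks(const uint8_t h_pow[8][16],
#                              const uint8_t x[8][16], uint8_t y[16])
#   {
#           uint8_t acc[16] = { 0 }, t[16];
#           int i, j;
#
#           for (i = 0; i < 8; i++) {
#                   gf128_mul(x[i], h_pow[i], t);
#                   for (j = 0; j < 16; j++)
#                           acc[j] ^= t[j];
#           }
#           memcpy(y, acc, 16);
#   }
#######################################################################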
2737 #############################################################
2738 #void aesni_gcm_init_avx_gen4
2739 # (gcm_data *my_ctx_data,
2740 # gcm_context_data *data,
2741 # u8 *iv, /* Pre-counter block j0: 4 byte salt
2742 # (from Security Association) concatenated with 8 byte
2743 # Initialisation Vector (from IPSec ESP Payload)
2744 # concatenated with 0x00000001. 16-byte aligned pointer. */
2745 # u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2746 # const u8 *aad, /* Additional Authentication Data (AAD)*/
2747 # u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2748 #############################################################
2749 SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2751 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2754 SYM_FUNC_END(aesni_gcm_init_avx_gen4)
2756 ###############################################################################
2757 #void aesni_gcm_enc_update_avx_gen4(
2758 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2759 # gcm_context_data *data,
2760 # u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2761 # const u8 *in, /* Plaintext input */
2762 # u64 plaintext_len) /* Length of data in Bytes for encryption. */
2763 ###############################################################################
2764 SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2768 je key_256_enc_update4
2770 je key_128_enc_update4
2772 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2775 key_128_enc_update4:
2776 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2779 key_256_enc_update4:
2780 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2783 SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
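###############################################################################
# The 9/11/13 passed to GCM_ENC_DEC above (and in the decrypt and finalize
# dispatch below) is the number of middle vaesenc rounds, i.e. Nr-1 for
# AES-128/192/256.  Small illustrative C helper spelling out that mapping:
#
#   static int aes_middle_rounds(int key_bytes)
#   {
#           switch (key_bytes) {
#           case 16: return 9;      /* AES-128: 10 rounds total */
#           case 24: return 11;     /* AES-192: 12 rounds total */
#           case 32: return 13;     /* AES-256: 14 rounds total */
#           default: return -1;
#           }
#   }
###############################################################################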
2785 ###############################################################################
2786 #void aesni_gcm_dec_update_avx_gen4(
2787 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2788 # gcm_context_data *data,
2789 # u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2790 # const u8 *in, /* Ciphertext input */
2791 # u64 plaintext_len) /* Length of data in Bytes for decryption. */
2792 ###############################################################################
2793 SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2797 je key_256_dec_update4
2799 je key_128_dec_update4
2801 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2804 key_128_dec_update4:
2805 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2808 key_256_dec_update4:
2809 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2812 SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2814 ###############################################################################
2815 #void aesni_gcm_finalize_avx_gen4(
2816 # gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2817 # gcm_context_data *data,
2818 # u8 *auth_tag, /* Authenticated Tag output. */
2819 # u64 auth_tag_len) /* Authenticated Tag Length in bytes.
2820 # Valid values are 16 (most likely), 12 or 8. */
2821 ###############################################################################
2822 SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2826 je key_256_finalize4
2828 je key_128_finalize4
2830 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
key_128_finalize4:
2834 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
key_256_finalize4:
2838 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2841 SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
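###############################################################################
# The three gen4 entry points form an init/update/finalize streaming
# interface.  Hedged C sketch of the call order implied by the prototype
# comments above (the context structures are the kernel's and are left
# opaque here; this is not a drop-in example):
#
#   #include <stdint.h>
#   typedef uint8_t u8;
#   typedef uint64_t u64;
#
#   void aesni_gcm_init_avx_gen4(void *ctx, void *data, u8 *iv,
#                                u8 *hash_subkey, const u8 *aad, u64 aad_len);
#   void aesni_gcm_enc_update_avx_gen4(void *ctx, void *data, u8 *out,
#                                      const u8 *in, u64 plaintext_len);
#   void aesni_gcm_finalize_avx_gen4(void *ctx, void *data, u8 *auth_tag,
#                                    u64 auth_tag_len);
#
#   static void gcm_encrypt_once(void *ctx, void *data, u8 *iv, u8 *hkey,
#                                const u8 *aad, u64 aad_len,
#                                const u8 *pt, u8 *ct, u64 len, u8 tag[16])
#   {
#           aesni_gcm_init_avx_gen4(ctx, data, iv, hkey, aad, aad_len);
#           /* the update step may be called repeatedly on successive chunks */
#           aesni_gcm_enc_update_avx_gen4(ctx, data, ct, pt, len);
#           aesni_gcm_finalize_avx_gen4(ctx, data, tag, 16);
#   }
###############################################################################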
2843 #endif /* CONFIG_AS_AVX2 */