arch/x86/crypto/aesni-intel_asm.S

   1 /*
   2  * Implement AES algorithm in Intel AES-NI instructions.
   3  *
   4  * The white paper of AES-NI instructions can be downloaded from:
   5  *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
   6  *
   7  * Copyright (C) 2008, Intel Corp.
   8  *    Author: Huang Ying <ying.huang@intel.com>
   9  *            Vinodh Gopal <vinodh.gopal@intel.com>
  10  *            Kahraman Akdemir
  11  *
  12  * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
  13  * interface for 64-bit kernels.
  14  *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
  15  *             Aidan O'Mahony (aidan.o.mahony@intel.com)
  16  *             Adrian Hoban <adrian.hoban@intel.com>
  17  *             James Guilford (james.guilford@intel.com)
  18  *             Gabriele Paoloni <gabriele.paoloni@intel.com>
  19  *             Tadeusz Struk (tadeusz.struk@intel.com)
  20  *             Wajdi Feghali (wajdi.k.feghali@intel.com)
  21  *    Copyright (c) 2010, Intel Corporation.
  22  *
  23  * Ported x86_64 version to x86:
  24  *    Author: Mathias Krause <minipli@googlemail.com>
  25  *
  26  * This program is free software; you can redistribute it and/or modify
  27  * it under the terms of the GNU General Public License as published by
  28  * the Free Software Foundation; either version 2 of the License, or
  29  * (at your option) any later version.
  30  */
  31
  32 #include <linux/linkage.h>
  33 #include <asm/inst.h>
  34 #include <asm/frame.h>
  35 #include <asm/nospec-branch.h>
  36
  37 /*
  38  * The following macros are used to move an (un)aligned 16 byte value to/from
  39  * an XMM register.  This can be done for either FP or integer values, for FP use
  40  * movaps (move aligned packed single) or integer use movdqa (move double quad
  41  * aligned).  It doesn't make a performance difference which instruction is used
  42  * since Nehalem (original Core i7) was released.  However, the movaps is a byte
  43  * shorter, so that is the one we'll use for now (and likewise movups for the unaligned case).
  44  */
  45 #define MOVADQ  movaps  /* 16-byte move, memory operand must be 16-byte aligned */
  46 #define MOVUDQ  movups  /* 16-byte move, no alignment requirement */
  47
  48 #ifdef __x86_64__
  49
  50 # constants in mergeable sections, linker can reorder and merge
  51 .section        .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
  52 .align 16
  53 .Lgf128mul_x_ble_mask:
  54         .octa 0x00000000000000010000000000000087  # high qword 0x1, low qword 0x87; user is not visible in this chunk
  55 .section        .rodata.cst16.POLY, "aM", @progbits, 16
  56 .align 16
  57 POLY:   .octa 0xC2000000000000000000000000000001  # XORed into HashKey<<1 by PRECOMPUTE when the shift carried a bit out
  58 .section        .rodata.cst16.TWOONE, "aM", @progbits, 16
  59 .align 16
  60 TWOONE: .octa 0x00000001000000000000000000000001  # pcmpeqd pattern PRECOMPUTE uses to turn the carry bit into a POLY select mask
  61
  62 .section        .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
  63 .align 16
  64 SHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F  # PSHUFB control that reverses all 16 bytes
  65 .section        .rodata.cst16.MASK1, "aM", @progbits, 16
  66 .align 16
  67 MASK1:      .octa 0x0000000000000000ffffffffffffffff  # all-ones in the low 64 bits
  68 .section        .rodata.cst16.MASK2, "aM", @progbits, 16
  69 .align 16
  70 MASK2:      .octa 0xffffffffffffffff0000000000000000  # all-ones in the high 64 bits
  71 .section        .rodata.cst16.ONE, "aM", @progbits, 16
  72 .align 16
  73 ONE:        .octa 0x00000000000000000000000000000001  # added to the counter block with paddd
  74 .section        .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
  75 .align 16
  76 F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0  # NOTE(review): only 31 hex digits, so the top nibble assembles as 0 - confirm intended
  77 .section        .rodata.cst16.dec, "aM", @progbits, 16
  78 .align 16
  79 dec:        .octa 0x1  # NOTE(review): no reference to this symbol is visible in this chunk
  80 .section        .rodata.cst16.enc, "aM", @progbits, 16
  81 .align 16
  82 enc:        .octa 0x2  # NOTE(review): no reference to this symbol is visible in this chunk
  83
  84 # order of these constants should not change.
  85 # more specifically, ALL_F should follow SHIFT_MASK,
  86 # and zero should follow ALL_F (masks are loaded at byte offsets that straddle two neighbours)
  87 .section        .rodata, "a", @progbits
  88 .align 16
  89 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100  # identity PSHUFB control; read at +n it becomes a byte-shift control
  90 ALL_F:      .octa 0xffffffffffffffffffffffffffffffff  # read at +n it yields 16-n bytes of 0xff followed by n zero bytes
  91             .octa 0x00000000000000000000000000000000  # zero padding consumed by the offset reads above
  92
  93 .text
  94
  95
  96 #define STACK_OFFSET    8*3     // bytes pushed by FUNC_SAVE (three 8-byte registers)
  97
  98 #define AadHash 16*0            // running GHASH value (written by CALC_AAD_HASH)
  99 #define AadLen 16*1             // AAD length in bytes
 100 #define InLen (16*1)+8          // total bytes of text processed so far
 101 #define PBlockEncKey 16*2       // E(K, Yn) keystream saved for an unfinished block
 102 #define OrigIV 16*3             // IV as supplied by the caller (Y0)
 103 #define CurCount 16*4           // current counter block
 104 #define PBlockLen 16*5          // bytes already consumed of the unfinished block
 105 #define HashKey         16*6    // store HashKey <<1 mod poly here
 106 #define HashKey_2       16*7    // store HashKey^2 <<1 mod poly here
 107 #define HashKey_3       16*8    // store HashKey^3 <<1 mod poly here
 108 #define HashKey_4       16*9    // store HashKey^4 <<1 mod poly here
 109 #define HashKey_k       16*10   // store XOR of High 64 bits and Low 64
 110                                 // bits of  HashKey <<1 mod poly here
 111                                 //(for Karatsuba purposes)
 112 #define HashKey_2_k     16*11   // store XOR of High 64 bits and Low 64
 113                                 // bits of  HashKey^2 <<1 mod poly here
 114                                 // (for Karatsuba purposes)
 115 #define HashKey_3_k     16*12   // store XOR of High 64 bits and Low 64
 116                                 // bits of  HashKey^3 <<1 mod poly here
 117                                 // (for Karatsuba purposes)
 118 #define HashKey_4_k     16*13   // store XOR of High 64 bits and Low 64
 119                                 // bits of  HashKey^4 <<1 mod poly here
 120                                 // (for Karatsuba purposes)
 121
 122 #define arg1 rdi                // register names without the % sigil; used as %arg1 etc.
 123 #define arg2 rsi                // base pointer for all the context offsets above
 124 #define arg3 rdx
 125 #define arg4 rcx
 126 #define arg5 r8
 127 #define arg6 r9
 128 #define arg7 STACK_OFFSET+8(%rsp)       // stack arguments: valid only after FUNC_SAVE has pushed 3 registers
 129 #define arg8 STACK_OFFSET+16(%rsp)
 130 #define arg9 STACK_OFFSET+24(%rsp)
 131 #define arg10 STACK_OFFSET+32(%rsp)
 132 #define arg11 STACK_OFFSET+40(%rsp)
 133 #define keysize 2*15*16(%arg1)  // 32-bit field 480 bytes into the object arg1 points to
 134 #endif
 135
 136
 137 #define STATE1  %xmm0
 138 #define STATE2  %xmm4
 139 #define STATE3  %xmm5
 140 #define STATE4  %xmm6
 141 #define STATE   STATE1          // single-block alias of STATE1
 142 #define IN1     %xmm1
 143 #define IN2     %xmm7
 144 #define IN3     %xmm8
 145 #define IN4     %xmm9
 146 #define IN      IN1             // single-block alias of IN1
 147 #define KEY     %xmm2
 148 #define IV      %xmm3
 149
 150 #define BSWAP_MASK %xmm10       // shares %xmm10 with GF128MUL_MASK below
 151 #define CTR     %xmm11
 152 #define INC     %xmm12
 153
 154 #define GF128MUL_MASK %xmm10    // shares %xmm10 with BSWAP_MASK above
 155
 156 #ifdef __x86_64__
 157 #define AREG    %rax
 158 #define KEYP    %rdi
 159 #define OUTP    %rsi
 160 #define UKEYP   OUTP            // same register as OUTP
 161 #define INP     %rdx
 162 #define LEN     %rcx
 163 #define IVP     %r8
 164 #define KLEN    %r9d
 165 #define T1      %r10
 166 #define TKEYP   T1              // same register as T1
 167 #define T2      %r11
 168 #define TCTR_LOW T2             // same register as T2; defined for 64-bit only
 169 #else
 170 #define AREG    %eax
 171 #define KEYP    %edi
 172 #define OUTP    AREG            // on 32-bit OUTP, UKEYP and AREG are all %eax
 173 #define UKEYP   OUTP
 174 #define INP     %edx
 175 #define LEN     %esi
 176 #define IVP     %ebp
 177 #define KLEN    %ebx
 178 #define T1      %ecx
 179 #define TKEYP   T1              // same register as T1
 180 #endif
 181
 182 .macro FUNC_SAVE
 183         push    %r12
 184         push    %r13
 185         push    %r14
 186 # three 8-byte pushes, which is what STACK_OFFSET (8*3) accounts for
 187 # states of %xmm registers %xmm6:%xmm15 not saved
 188 # all %xmm registers are clobbered
 189 # must be paired with FUNC_RESTORE, which pops in the reverse order
 190 .endm
 191
 192
 193 .macro FUNC_RESTORE
 194         pop     %r14  # reverse order of the pushes in FUNC_SAVE
 195         pop     %r13
 196         pop     %r12
 197 .endm
 198
 199 # Precompute hashkeys.
 200 # Input: Hash subkey (SUBKEY is a pointer to the 16-byte value).
 201 # Output: HashKeys stored in gcm_context_data.  Only needs to be called
 202 # once per key.  Fills HashKey..HashKey_4 and HashKey_k..HashKey_4_k at %arg2.
 203 # clobbers r12, and tmp xmm registers.
 204 .macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
 205         mov     \SUBKEY, %r12
 206         movdqu  (%r12), \TMP3  # TMP3 = hash subkey as stored in memory
 207         movdqa  SHUF_MASK(%rip), \TMP2
 208         PSHUFB_XMM \TMP2, \TMP3  # byte-reverse the subkey
 209
 210         # precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
 211
 212         movdqa  \TMP3, \TMP2
 213         psllq   $1, \TMP3  # shift each 64-bit half left by one
 214         psrlq   $63, \TMP2  # TMP2 = the bit shifted out of each half
 215         movdqa  \TMP2, \TMP1
 216         pslldq  $8, \TMP2  # carry of the low half, moved up into the high half
 217         psrldq  $8, \TMP1  # carry of the high half (bit 127), moved down to bit 0
 218         por     \TMP2, \TMP3  # TMP3 = full 128-bit HashKey<<1
 219
 220         # reduce HashKey<<1
 221
 222         pshufd  $0x24, \TMP1, \TMP2  # replicate the carry dword into dwords 0 and 3
 223         pcmpeqd TWOONE(%rip), \TMP2  # all-ones in every dword iff the carry was set
 224         pand    POLY(%rip), \TMP2  # TMP2 = POLY if carry, else 0
 225         pxor    \TMP2, \TMP3
 226         movdqu  \TMP3, HashKey(%arg2)
 227
 228         movdqa     \TMP3, \TMP5
 229         pshufd     $78, \TMP3, \TMP1  # swap the two 64-bit halves
 230         pxor       \TMP3, \TMP1  # TMP1 = high half XOR low half
 231         movdqu     \TMP1, HashKey_k(%arg2)
 232
 233         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 234 # TMP5 = HashKey^2<<1 (mod poly)
 235         movdqu     \TMP5, HashKey_2(%arg2)
 236 # HashKey_2 = HashKey^2<<1 (mod poly)
 237         pshufd     $78, \TMP5, \TMP1
 238         pxor       \TMP5, \TMP1
 239         movdqu     \TMP1, HashKey_2_k(%arg2)
 240
 241         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 242 # TMP5 = HashKey^3<<1 (mod poly)
 243         movdqu     \TMP5, HashKey_3(%arg2)
 244         pshufd     $78, \TMP5, \TMP1
 245         pxor       \TMP5, \TMP1
 246         movdqu     \TMP1, HashKey_3_k(%arg2)
 247
 248         GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
 249 # TMP5 = HashKey^4<<1 (mod poly)
 250         movdqu     \TMP5, HashKey_4(%arg2)
 251         pshufd     $78, \TMP5, \TMP1
 252         pxor       \TMP5, \TMP1
 253         movdqu     \TMP1, HashKey_4_k(%arg2)
 254 .endm
 255
 256 # GCM_INIT initializes a gcm_context struct (at %arg2) to prepare for encoding/decoding.
 257 # Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 (plus xmm7 via PRECOMPUTE and xmm14 via CALC_AAD_HASH)
 258 .macro GCM_INIT Iv SUBKEY AAD AADLEN
 259         mov \AADLEN, %r11
 260         mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
 261         xor %r11d, %r11d
 262         mov %r11, InLen(%arg2) # ctx_data.in_length = 0
 263         mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
 264         mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 (8-byte store, low half of the slot)
 265         mov \Iv, %rax
 266         movdqu (%rax), %xmm0
 267         movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv
 268
 269         movdqa  SHUF_MASK(%rip), %xmm2
 270         PSHUFB_XMM %xmm2, %xmm0
 271         movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv (byte-reversed)
 272
 273         PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
 274         movdqu HashKey(%arg2), %xmm13  # reload HashKey<<1 mod poly just stored by PRECOMPUTE
 275
 276         CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
 277         %xmm4, %xmm5, %xmm6
 278 .endm
 279
 280 # GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
 281 # struct has been initialized by GCM_INIT.  operation is the literal enc or dec.
 282 # Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
 283 # Clobbers rax, r10-r13, and xmm0-xmm15.  Uses arg3 = out, arg4 = in, arg5 = length.
 284 .macro GCM_ENC_DEC operation
 285         movdqu AadHash(%arg2), %xmm8  # xmm8 = running GHASH value
 286         movdqu HashKey(%arg2), %xmm13
 287         add %arg5, InLen(%arg2)  # ctx_data.in_length += length of this call
 288
 289         xor %r11d, %r11d # initialise the data pointer offset as zero
 290         PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation
 291
 292         sub %r11, %arg5         # sub partial block data used
 293         mov %arg5, %r13         # save the number of bytes
 294
 295         and $-16, %r13          # %r13 = %r13 - (%r13 mod 16)
 296         mov %r13, %r12
 297         # Encrypt/Decrypt first few blocks
 298
 299         and     $(3<<4), %r12  # r12 = 16 * (number of whole blocks mod 4)
 300         jz      _initial_num_blocks_is_0_\@
 301         cmp     $(2<<4), %r12
 302         jb      _initial_num_blocks_is_1_\@
 303         je      _initial_num_blocks_is_2_\@
 304 _initial_num_blocks_is_3_\@:
 305         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 306 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
 307         sub     $48, %r13
 308         jmp     _initial_blocks_\@
 309 _initial_num_blocks_is_2_\@:
 310         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 311 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
 312         sub     $32, %r13
 313         jmp     _initial_blocks_\@
 314 _initial_num_blocks_is_1_\@:
 315         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 316 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
 317         sub     $16, %r13
 318         jmp     _initial_blocks_\@
 319 _initial_num_blocks_is_0_\@:
 320         INITIAL_BLOCKS_ENC_DEC  %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
 321 %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
 322 _initial_blocks_\@:
 323
 324         # Main loop - Encrypt/Decrypt remaining blocks
 325
 326         cmp     $0, %r13  # r13 is now a multiple of 64
 327         je      _zero_cipher_left_\@
 328         sub     $64, %r13
 329         je      _four_cipher_left_\@
 330 _crypt_by_4_\@:  # 64 bytes per iteration; the macro name is formed by pasting the operation argument
 331         GHASH_4_ENCRYPT_4_PARALLEL_\operation   %xmm9, %xmm10, %xmm11, %xmm12, \
 332         %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
 333         %xmm7, %xmm8, enc
 334         add     $64, %r11
 335         sub     $64, %r13
 336         jne     _crypt_by_4_\@
 337 _four_cipher_left_\@:
 338         GHASH_LAST_4    %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
 339 %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
 340 _zero_cipher_left_\@:
 341         movdqu %xmm8, AadHash(%arg2)  # persist the hash and counter for the next call
 342         movdqu %xmm0, CurCount(%arg2)
 343
 344         mov     %arg5, %r13
 345         and     $15, %r13                       # %r13 = arg5 (mod 16)
 346         je      _multiple_of_16_bytes_\@
 347
 348         mov %r13, PBlockLen(%arg2)  # remember how much of the trailing block is used
 349
 350         # Handle the last <16 Byte block separately
 351         paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
 352         movdqu %xmm0, CurCount(%arg2)
 353         movdqa SHUF_MASK(%rip), %xmm10
 354         PSHUFB_XMM %xmm10, %xmm0
 355
 356         ENCRYPT_SINGLE_BLOCK    %xmm0, %xmm1        # Encrypt(K, Yn)
 357         movdqu %xmm0, PBlockEncKey(%arg2)  # saved so PARTIAL_BLOCK can continue this block later
 358
 359         cmp     $16, %arg5
 360         jge _large_enough_update_\@  # signed compare on the remaining length
 361
 362         lea (%arg4,%r11,1), %r10  # fewer than 16 bytes in total: read them byte-exact
 363         mov %r13, %r12
 364         READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
 365         jmp _data_read_\@
 366
 367 _large_enough_update_\@:
 368         sub     $16, %r11  # temporarily point r11 at the last 16 bytes of the input
 369         add     %r13, %r11
 370
 371         # receive the last <16 Byte block
 372         movdqu  (%arg4, %r11, 1), %xmm1
 373
 374         sub     %r13, %r11  # restore r11 to the start of the trailing partial block
 375         add     $16, %r11
 376
 377         lea     SHIFT_MASK+16(%rip), %r12
 378         # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
 379         # (r13 is the number of bytes in plaintext mod 16)
 380         sub     %r13, %r12
 381         # get the appropriate shuffle mask
 382         movdqu  (%r12), %xmm2
 383         # shift right 16-r13 bytes
 384         PSHUFB_XMM  %xmm2, %xmm1
 385
 386 _data_read_\@:
 387         lea ALL_F+16(%rip), %r12
 388         sub %r13, %r12  # r12 -> mask with r13 bytes of 0xff followed by zeros
 389
 390 .ifc \operation, dec
 391         movdqa  %xmm1, %xmm2  # keep the ciphertext: it is what gets hashed when decrypting
 392 .endif
 393         pxor    %xmm1, %xmm0            # XOR Encrypt(K, Yn)
 394         movdqu  (%r12), %xmm1
 395         # get the appropriate mask to mask out top 16-r13 bytes of xmm0
 396         pand    %xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
 397 .ifc \operation, dec
 398         pand    %xmm1, %xmm2
 399         movdqa SHUF_MASK(%rip), %xmm10
 400         PSHUFB_XMM %xmm10 ,%xmm2
 401
 402         pxor %xmm2, %xmm8  # fold the ciphertext into the hash; the multiply is deferred
 403 .else
 404         movdqa SHUF_MASK(%rip), %xmm10
 405         PSHUFB_XMM %xmm10,%xmm0
 406
 407         pxor    %xmm0, %xmm8  # fold the ciphertext into the hash; the multiply is deferred
 408 .endif
 409
 410         movdqu %xmm8, AadHash(%arg2)
 411 .ifc \operation, enc
 412         # undo the byte swap applied above for the GHASH input
 413         movdqa SHUF_MASK(%rip), %xmm10
 414         # shuffle xmm0 back to output as ciphertext
 415         PSHUFB_XMM %xmm10, %xmm0
 416 .endif
 417
 418         # Output %r13 bytes
 419         MOVQ_R64_XMM %xmm0, %rax
 420         cmp $8, %r13
 421         jle _less_than_8_bytes_left_\@
 422         mov %rax, (%arg3 , %r11, 1)  # more than 8 bytes: store the low 8 at once
 423         add $8, %r11
 424         psrldq $8, %xmm0
 425         MOVQ_R64_XMM %xmm0, %rax
 426         sub $8, %r13
 427 _less_than_8_bytes_left_\@:
 428         mov %al,  (%arg3, %r11, 1)  # store the remaining 1..8 bytes one at a time
 429         add $1, %r11
 430         shr $8, %rax
 431         sub $1, %r13
 432         jne _less_than_8_bytes_left_\@
 433 _multiple_of_16_bytes_\@:
 434 .endm
 435
 436 # GCM_COMPLETE Finishes update of tag of last partial block
 437 # Output: Authentication Tag, AUTHTAGLEN bytes written to the AUTHTAG pointer
 438 # Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
 439 .macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
 440         movdqu AadHash(%arg2), %xmm8
 441         movdqu HashKey(%arg2), %xmm13
 442
 443         mov PBlockLen(%arg2), %r12
 444
 445         cmp $0, %r12
 446         je _partial_done\@
 447
 448         GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6  # finish the multiply deferred for the partial block
 449
 450 _partial_done\@:
 451         mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
 452         shl     $3, %r12                  # convert into number of bits
 453         movd    %r12d, %xmm15             # len(A) in %xmm15 (low 32 bits only)
 454         mov InLen(%arg2), %r12
 455         shl     $3, %r12                  # len(C) in bits
 456         MOVQ_R64_XMM    %r12, %xmm1
 457
 458         pslldq  $8, %xmm15                # %xmm15 = len(A)||0x0000000000000000
 459         pxor    %xmm1, %xmm15             # %xmm15 = len(A)||len(C)
 460         pxor    %xmm15, %xmm8
 461         GHASH_MUL       %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
 462         # final GHASH computation
 463         movdqa SHUF_MASK(%rip), %xmm10
 464         PSHUFB_XMM %xmm10, %xmm8
 465
 466         movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
 467         ENCRYPT_SINGLE_BLOCK    %xmm0,  %xmm1     # E(K, Y0)
 468         pxor    %xmm8, %xmm0              # %xmm0 = tag = GHASH result XOR E(K, Y0)
 469 _return_T_\@:
 470         mov     \AUTHTAG, %r10                     # %r10 = authTag
 471         mov     \AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
 472         cmp     $16, %r11
 473         je      _T_16_\@
 474         cmp     $8, %r11
 475         jl      _T_4_\@  # lengths below 8 start with the 4-byte store
 476 _T_8_\@:
 477         MOVQ_R64_XMM    %xmm0, %rax
 478         mov     %rax, (%r10)
 479         add     $8, %r10
 480         sub     $8, %r11
 481         psrldq  $8, %xmm0
 482         cmp     $0, %r11
 483         je      _return_T_done_\@
 484 _T_4_\@:  # always stores 4 bytes, so lengths 1-3 and 9-11 would write past the requested length
 485         movd    %xmm0, %eax
 486         mov     %eax, (%r10)
 487         add     $4, %r10
 488         sub     $4, %r11
 489         psrldq  $4, %xmm0
 490         cmp     $0, %r11
 491         je      _return_T_done_\@
 492 _T_123_\@:  # 1 to 3 tag bytes still to store
 493         movd    %xmm0, %eax
 494         cmp     $2, %r11
 495         jl      _T_1_\@
 496         mov     %ax, (%r10)
 497         cmp     $2, %r11
 498         je      _return_T_done_\@
 499         add     $2, %r10
 500         sar     $16, %eax  # bring the third byte down into %al
 501 _T_1_\@:
 502         mov     %al, (%r10)
 503         jmp     _return_T_done_\@
 504 _T_16_\@:
 505         movdqu  %xmm0, (%r10)
 506 _return_T_done_\@:
 507 .endm
 508
 509 #ifdef __x86_64__
 510 /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 511 *
 512 *
 513 * Input: A and B (128-bits each, bit-reflected)
 514 * Output: C = A*B*x mod poly, (i.e. >>1 )
 515 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 516 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
 517 * The result replaces GH; HK is left unchanged and TMP1-TMP5 are clobbered.
 518 */
 519 .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
 520         movdqa    \GH, \TMP1
 521         pshufd    $78, \GH, \TMP2
 522         pshufd    $78, \HK, \TMP3
 523         pxor      \GH, \TMP2            # TMP2 = a1+a0
 524         pxor      \HK, \TMP3            # TMP3 = b1+b0
 525         PCLMULQDQ 0x11, \HK, \TMP1     # TMP1 = a1*b1
 526         PCLMULQDQ 0x00, \HK, \GH       # GH = a0*b0
 527         PCLMULQDQ 0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
 528         pxor      \GH, \TMP2
 529         pxor      \TMP1, \TMP2          # TMP2 = (a0*b1)+(a1*b0)
 530         movdqa    \TMP2, \TMP3
 531         pslldq    $8, \TMP3             # left shift TMP3 2 DWs
 532         psrldq    $8, \TMP2             # right shift TMP2 2 DWs
 533         pxor      \TMP3, \GH
 534         pxor      \TMP2, \TMP1          # TMP1:GH holds the result of GH*HK
 535
 536         # first phase of the reduction
 537
 538         movdqa    \GH, \TMP2
 539         movdqa    \GH, \TMP3
 540         movdqa    \GH, \TMP4            # copy GH into TMP2,TMP3 and TMP4
 541                                         # in order to perform
 542                                         # independent shifts
 543         pslld     $31, \TMP2            # packed left shift <<31
 544         pslld     $30, \TMP3            # packed left shift <<30
 545         pslld     $25, \TMP4            # packed left shift <<25
 546         pxor      \TMP3, \TMP2          # xor the shifted versions
 547         pxor      \TMP4, \TMP2
 548         movdqa    \TMP2, \TMP5
 549         psrldq    $4, \TMP5             # right shift TMP5 1 DW
 550         pslldq    $12, \TMP2            # left shift TMP2 3 DWs
 551         pxor      \TMP2, \GH
 552
 553         # second phase of the reduction
 554
 555         movdqa    \GH,\TMP2             # copy GH into TMP2,TMP3 and TMP4
 556                                         # in order to perform
 557                                         # independent shifts
 558         movdqa    \GH,\TMP3
 559         movdqa    \GH,\TMP4
 560         psrld     $1,\TMP2              # packed right shift >>1
 561         psrld     $2,\TMP3              # packed right shift >>2
 562         psrld     $7,\TMP4              # packed right shift >>7
 563         pxor      \TMP3,\TMP2           # xor the shifted versions
 564         pxor      \TMP4,\TMP2
 565         pxor      \TMP5, \TMP2
 566         pxor      \TMP2, \GH
 567         pxor      \TMP1, \GH            # result is in GH
 568 .endm
 569
 570 # Reads DLEN bytes starting at DPTR and stores in XMMDst
 571 # where 0 < DLEN < 16 (DLEN of 0 is not handled: the byte loop would decrement past zero)
 572 # Clobbers %rax, DLEN and XMM1.  Never reads beyond DPTR+DLEN-1.
 573 .macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
 574         cmp $8, \DLEN
 575         jl _read_lt8_\@
 576         mov (\DPTR), %rax  # at least 8 bytes: load the first 8 in one go
 577         MOVQ_R64_XMM %rax, \XMMDst
 578         sub $8, \DLEN
 579         jz _done_read_partial_block_\@
 580         xor %eax, %eax
 581 _read_next_byte_\@:  # gather bytes 8..DLEN+7 from the last one backwards
 582         shl $8, %rax
 583         mov 7(\DPTR, \DLEN, 1), %al
 584         dec \DLEN
 585         jnz _read_next_byte_\@
 586         MOVQ_R64_XMM %rax, \XMM1
 587         pslldq $8, \XMM1  # move the gathered bytes into the high 64 bits
 588         por \XMM1, \XMMDst
 589         jmp _done_read_partial_block_\@
 590 _read_lt8_\@:
 591         xor %eax, %eax
 592 _read_next_byte_lt8_\@:  # gather bytes 0..DLEN-1 from the last one backwards
 593         shl $8, %rax
 594         mov -1(\DPTR, \DLEN, 1), %al
 595         dec \DLEN
 596         jnz _read_next_byte_lt8_\@
 597         MOVQ_R64_XMM %rax, \XMMDst
 598 _done_read_partial_block_\@:
 599 .endm
 600
 601 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
 602 # clobbers r10-11, xmm14 (plus TMP1-TMP7, and rax when a trailing partial block is read)
 603 .macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
 604         TMP6 TMP7
 605         MOVADQ     SHUF_MASK(%rip), %xmm14
 606         mov        \AAD, %r10           # %r10 = AAD
 607         mov        \AADLEN, %r11                # %r11 = aadLen
 608         pxor       \TMP7, \TMP7
 609         pxor       \TMP6, \TMP6         # TMP6 = running hash, starts at zero
 610
 611         cmp        $16, %r11
 612         jl         _get_AAD_rest\@
 613 _get_AAD_blocks\@:
 614         movdqu     (%r10), \TMP7
 615         PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 616         pxor       \TMP7, \TMP6
 617         GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 618         add        $16, %r10
 619         sub        $16, %r11
 620         cmp        $16, %r11
 621         jge        _get_AAD_blocks\@
 622
 623         movdqu     \TMP6, \TMP7
 624
 625         /* read the last <16B of AAD */
 626 _get_AAD_rest\@:
 627         cmp        $0, %r11
 628         je         _get_AAD_done\@
 629
 630         READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
 631         PSHUFB_XMM   %xmm14, \TMP7 # byte-reflect the AAD data
 632         pxor       \TMP6, \TMP7
 633         GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
 634         movdqu \TMP7, \TMP6             # copy the updated hash back into TMP6
 635
 636 _get_AAD_done\@:
 637         movdqu \TMP6, AadHash(%arg2)    # ctx_data.aad_hash = hash of the AAD
 638 .endm
 639
 640 # PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
 641 # between update calls.  Does nothing when PBlockLen in the context is zero.
 642 # Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
 643 # Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
 644 # Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13; advances DATA_OFFSET by the bytes written
 645 .macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
 646         AAD_HASH operation
 647         mov     PBlockLen(%arg2), %r13
 648         cmp     $0, %r13
 649         je      _partial_block_done_\@  # Leave Macro if no partial blocks
 650         # Read in input data without over reading
 651         cmp     $16, \PLAIN_CYPH_LEN
 652         jl      _fewer_than_16_bytes_\@
 653         movups  (\PLAIN_CYPH_IN), %xmm1 # 16 bytes or more: just fill xmm (DATA_OFFSET is not applied here)
 654         jmp     _data_read_\@
 655
 656 _fewer_than_16_bytes_\@:
 657         lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
 658         mov     \PLAIN_CYPH_LEN, %r12
 659         READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
 660
 661         mov PBlockLen(%arg2), %r13
 662
 663 _data_read_\@:                          # Finished reading in data
 664
 665         movdqu  PBlockEncKey(%arg2), %xmm9  # keystream block saved when the partial block was started
 666         movdqu  HashKey(%arg2), %xmm13
 667
 668         lea     SHIFT_MASK(%rip), %r12
 669
 670         # adjust the shuffle mask pointer to be able to shift r13 bytes
 671         # (r13 is the number of bytes already consumed of the partial block)
 672         add     %r13, %r12
 673         movdqu  (%r12), %xmm2           # get the appropriate shuffle mask
 674         PSHUFB_XMM %xmm2, %xmm9         # shift right r13 bytes
 675
 676 .ifc \operation, dec
 677         movdqa  %xmm1, %xmm3            # keep the ciphertext: it is what gets hashed
 678         pxor    %xmm1, %xmm9            # Cyphertext XOR E(K, Yn)
 679
 680         mov     \PLAIN_CYPH_LEN, %r10
 681         add     %r13, %r10
 682         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 683         sub     $16, %r10
 684         # Determine if partial block is not being filled and
 685         # shift mask accordingly
 686         jge     _no_extra_mask_1_\@
 687         sub     %r10, %r12              # r10 is negative: move the mask pointer forward by the shortfall
 688 _no_extra_mask_1_\@:
 689
 690         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 691         # mask of 0xff for the low bytes that carry new data, zero above them
 692         pand    %xmm1, %xmm9            # clear the bytes of xmm9 beyond the new data
 693
 694         pand    %xmm1, %xmm3
 695         movdqa  SHUF_MASK(%rip), %xmm10
 696         PSHUFB_XMM      %xmm10, %xmm3
 697         PSHUFB_XMM      %xmm2, %xmm3
 698         pxor    %xmm3, \AAD_HASH
 699
 700         cmp     $0, %r10
 701         jl      _partial_incomplete_1_\@
 702
 703         # GHASH computation for the last <16 Byte block
 704         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 705         xor     %eax, %eax
 706
 707         mov     %rax, PBlockLen(%arg2)  # block completed: no partial block pending
 708         jmp     _dec_done_\@
 709 _partial_incomplete_1_\@:
 710         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)  # still incomplete: just account for the new bytes
 711 _dec_done_\@:
 712         movdqu  \AAD_HASH, AadHash(%arg2)
 713 .else
 714         pxor    %xmm1, %xmm9                    # Plaintext XOR E(K, Yn)
 715
 716         mov     \PLAIN_CYPH_LEN, %r10
 717         add     %r13, %r10
 718         # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
 719         sub     $16, %r10
 720         # Determine if partial block is not being filled and
 721         # shift mask accordingly
 722         jge     _no_extra_mask_2_\@
 723         sub     %r10, %r12              # r10 is negative: move the mask pointer forward by the shortfall
 724 _no_extra_mask_2_\@:
 725
 726         movdqu  ALL_F-SHIFT_MASK(%r12), %xmm1
 727         # mask of 0xff for the low bytes that carry new data, zero above them
 728         pand    %xmm1, %xmm9
 729
 730         movdqa  SHUF_MASK(%rip), %xmm1
 731         PSHUFB_XMM %xmm1, %xmm9
 732         PSHUFB_XMM %xmm2, %xmm9
 733         pxor    %xmm9, \AAD_HASH
 734
 735         cmp     $0, %r10
 736         jl      _partial_incomplete_2_\@
 737
 738         # GHASH computation for the last <16 Byte block
 739         GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
 740         xor     %eax, %eax
 741
 742         mov     %rax, PBlockLen(%arg2)  # block completed: no partial block pending
 743         jmp     _encode_done_\@
 744 _partial_incomplete_2_\@:
 745         add     \PLAIN_CYPH_LEN, PBlockLen(%arg2)  # still incomplete: just account for the new bytes
 746 _encode_done_\@:
 747         movdqu  \AAD_HASH, AadHash(%arg2)
 748
 749         movdqa  SHUF_MASK(%rip), %xmm10
 750         # shuffle xmm9 back to output as ciphertext
 751         PSHUFB_XMM      %xmm10, %xmm9
 752         PSHUFB_XMM      %xmm2, %xmm9
 753 .endif
 754         # output encrypted Bytes
 755         cmp     $0, %r10
 756         jl      _partial_fill_\@
 757         mov     %r13, %r12
 758         mov     $16, %r13
 759         # Set r13 to be the number of bytes to write out
 760         sub     %r12, %r13
 761         jmp     _count_set_\@
 762 _partial_fill_\@:
 763         mov     \PLAIN_CYPH_LEN, %r13   # block not completed: every input byte is written out
 764 _count_set_\@:
 765         movdqa  %xmm9, %xmm0
 766         MOVQ_R64_XMM    %xmm0, %rax
 767         cmp     $8, %r13
 768         jle     _less_than_8_bytes_left_\@
 769
 770         mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 771         add     $8, \DATA_OFFSET
 772         psrldq  $8, %xmm0
 773         MOVQ_R64_XMM    %xmm0, %rax
 774         sub     $8, %r13
 775 _less_than_8_bytes_left_\@:
 776         movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
 777         add     $1, \DATA_OFFSET
 778         shr     $8, %rax
 779         sub     $1, %r13
 780         jne     _less_than_8_bytes_left_\@
 781 _partial_block_done_\@:
 782 .endm # PARTIAL_BLOCK
 783
 784 /*
 785 * if a = number of total plaintext bytes
 786 * b = floor(a/16)
 787 * num_initial_blocks = b mod 4
 788 * encrypt the initial num_initial_blocks blocks and apply ghash on
 789 * the ciphertext
 790 * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
 791 * are clobbered
 792 * arg1, %arg2, %arg3 are used as a pointer only, not modified
 793 */
 794
 795
 796 .macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
 797         XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
 798         MOVADQ          SHUF_MASK(%rip), %xmm14      # byte-swap mask for PSHUFB_XMM
 799         # Register roles: %xmm<i> takes AadHash, i_seq lists the block registers
 800         movdqu AadHash(%arg2), %xmm\i               # %xmm<i> = AadHash
 801
 802         # start AES for num_initial_blocks blocks
 803
 804         movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
 805
 806 .if (\i == 5) || (\i == 6) || (\i == 7)
 807         # build one counter block per digit of i_seq and xor in round key 0
 808         MOVADQ          ONE(%RIP),\TMP1
 809         MOVADQ          0(%arg1),\TMP2
 810 .irpc index, \i_seq
 811         paddd           \TMP1, \XMM0                 # INCR Y0
 812 .ifc \operation, dec
 813         movdqa     \XMM0, %xmm\index
 814 .else
 815         MOVADQ          \XMM0, %xmm\index
 816 .endif
 817         PSHUFB_XMM      %xmm14, %xmm\index      # perform a 16 byte swap
 818         pxor            \TMP2, %xmm\index
 819 .endr
 820         lea     0x10(%arg1),%r10                # r10 -> round key 1
 821         mov     keysize,%eax
 822         shr     $2,%eax                         # 128->4, 192->6, 256->8
 823         add     $5,%eax                       # 128->9, 192->11, 256->13
 824
 825 aes_loop_initial_\@:
 826         MOVADQ  (%r10),\TMP1
 827 .irpc   index, \i_seq
 828         AESENC  \TMP1, %xmm\index
 829 .endr
 830         add     $16,%r10
 831         sub     $1,%eax
 832         jnz     aes_loop_initial_\@
 833
 834         MOVADQ  (%r10), \TMP1
 835 .irpc index, \i_seq
 836         AESENCLAST \TMP1, %xmm\index         # Last Round
 837 .endr
 838 .irpc index, \i_seq
 839         movdqu     (%arg4 , %r11, 1), \TMP1
 840         pxor       \TMP1, %xmm\index
 841         movdqu     %xmm\index, (%arg3 , %r11, 1)
 842         # write back plaintext/ciphertext for num_initial_blocks
 843         add        $16, %r11
 844
 845 .ifc \operation, dec
 846         movdqa     \TMP1, %xmm\index            # GHASH the input (ciphertext)
 847 .endif
 848         PSHUFB_XMM         %xmm14, %xmm\index
 849
 850                 # prepare plaintext/ciphertext for GHASH computation
 851 .endr
 852 .endif
 853
 854         # apply GHASH on num_initial_blocks blocks
 855
 856 .if \i == 5
 857         pxor       %xmm5, %xmm6
 858         GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 859         pxor       %xmm6, %xmm7
 860         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 861         pxor       %xmm7, %xmm8
 862         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 863 .elseif \i == 6
 864         pxor       %xmm6, %xmm7
 865         GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 866         pxor       %xmm7, %xmm8
 867         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 868 .elseif \i == 7
 869         pxor       %xmm7, %xmm8
 870         GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
 871 .endif
 872         cmp        $64, %r13                 # r13 < 64 (signed): skip 4-block setup
 873         jl      _initial_blocks_done\@
 874         # no need for precomputed values
 875 /*
 876 * Encrypt the next 4 counter blocks (XMM1-XMM4) for the 4-block main loop.
 877 * No HashKey precomputation happens in this macro.
 878 * HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
 879 */
 880         MOVADQ     ONE(%RIP),\TMP1
 881         paddd      \TMP1, \XMM0              # INCR Y0
 882         MOVADQ     \XMM0, \XMM1
 883         PSHUFB_XMM  %xmm14, \XMM1        # perform a 16 byte swap
 884
 885         paddd      \TMP1, \XMM0              # INCR Y0
 886         MOVADQ     \XMM0, \XMM2
 887         PSHUFB_XMM  %xmm14, \XMM2        # perform a 16 byte swap
 888
 889         paddd      \TMP1, \XMM0              # INCR Y0
 890         MOVADQ     \XMM0, \XMM3
 891         PSHUFB_XMM %xmm14, \XMM3        # perform a 16 byte swap
 892
 893         paddd      \TMP1, \XMM0              # INCR Y0
 894         MOVADQ     \XMM0, \XMM4
 895         PSHUFB_XMM %xmm14, \XMM4        # perform a 16 byte swap
 896
 897         MOVADQ     0(%arg1),\TMP1
 898         pxor       \TMP1, \XMM1
 899         pxor       \TMP1, \XMM2
 900         pxor       \TMP1, \XMM3
 901         pxor       \TMP1, \XMM4
 902 .irpc index, 1234 # do 4 rounds
 903         movaps 0x10*\index(%arg1), \TMP1
 904         AESENC     \TMP1, \XMM1
 905         AESENC     \TMP1, \XMM2
 906         AESENC     \TMP1, \XMM3
 907         AESENC     \TMP1, \XMM4
 908 .endr
 909 .irpc index, 56789 # do next 5 rounds
 910         movaps 0x10*\index(%arg1), \TMP1
 911         AESENC     \TMP1, \XMM1
 912         AESENC     \TMP1, \XMM2
 913         AESENC     \TMP1, \XMM3
 914         AESENC     \TMP1, \XMM4
 915 .endr
 916         lea        0xa0(%arg1),%r10             # r10 -> round key 10
 917         mov        keysize,%eax
 918         shr        $2,%eax                      # 128->4, 192->6, 256->8
 919         sub        $4,%eax                      # 128->0, 192->2, 256->4
 920         jz         aes_loop_pre_done\@           # no extra rounds to run
 921
 922 aes_loop_pre_\@:                                # extra rounds for 192/256 keys
 923         MOVADQ     (%r10),\TMP2
 924         AESENC     \TMP2, \XMM1                 # use the macro arguments, not
 925         AESENC     \TMP2, \XMM2                 # hard-coded xmm1-xmm4, so the
 926         AESENC     \TMP2, \XMM3                 # loop agrees with the rounds
 927         AESENC     \TMP2, \XMM4                 # unrolled above
 928         add        $16,%r10
 929         sub        $1,%eax
 930         jnz        aes_loop_pre_\@
 931 aes_loop_pre_done\@:
 932         MOVADQ     (%r10), \TMP2
 933         AESENCLAST \TMP2, \XMM1
 934         AESENCLAST \TMP2, \XMM2
 935         AESENCLAST \TMP2, \XMM3
 936         AESENCLAST \TMP2, \XMM4
 937         movdqu     16*0(%arg4 , %r11 , 1), \TMP1
 938         pxor       \TMP1, \XMM1
 939 .ifc \operation, dec
 940         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 941         movdqa     \TMP1, \XMM1
 942 .endif
 943         movdqu     16*1(%arg4 , %r11 , 1), \TMP1
 944         pxor       \TMP1, \XMM2
 945 .ifc \operation, dec
 946         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 947         movdqa     \TMP1, \XMM2
 948 .endif
 949         movdqu     16*2(%arg4 , %r11 , 1), \TMP1
 950         pxor       \TMP1, \XMM3
 951 .ifc \operation, dec
 952         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 953         movdqa     \TMP1, \XMM3
 954 .endif
 955         movdqu     16*3(%arg4 , %r11 , 1), \TMP1
 956         pxor       \TMP1, \XMM4
 957 .ifc \operation, dec
 958         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 959         movdqa     \TMP1, \XMM4
 960 .else
 961         movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
 962         movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
 963         movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
 964         movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
 965 .endif
 966
 967         add        $64, %r11
 968         PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
 969         pxor       \XMMDst, \XMM1
 970 # combine GHASHed value with the corresponding ciphertext
 971         PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
 972         PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
 973         PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
 974
 975 _initial_blocks_done\@:
 976
 977 .endm
 978
 979 /*
 980 * encrypt 4 counter blocks and ghash the 4 previously encrypted blocks
 981 * (passed in XMM1-XMM4); the operation argument is not referenced
 982 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
 983 * %r11 is the data offset value; %eax, %r10 and %xmm15 are clobbered
 984 */
 985 .macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
 986 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
 987         # keep the 4 blocks to hash, XMM1-XMM4 are reused for new counter blocks
 988         movdqa    \XMM1, \XMM5
 989         movdqa    \XMM2, \XMM6
 990         movdqa    \XMM3, \XMM7
 991         movdqa    \XMM4, \XMM8
 992
 993         movdqa    SHUF_MASK(%rip), %xmm15
 994         # multiply XMM5 * HashKey_4 using karatsuba
 995
 996         movdqa    \XMM5, \TMP4
 997         pshufd    $78, \XMM5, \TMP6
 998         pxor      \XMM5, \TMP6
 999         paddd     ONE(%rip), \XMM0              # INCR CNT
1000         movdqu    HashKey_4(%arg2), \TMP5
1001         PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1002         movdqa    \XMM0, \XMM1
1003         paddd     ONE(%rip), \XMM0              # INCR CNT
1004         movdqa    \XMM0, \XMM2
1005         paddd     ONE(%rip), \XMM0              # INCR CNT
1006         movdqa    \XMM0, \XMM3
1007         paddd     ONE(%rip), \XMM0              # INCR CNT
1008         movdqa    \XMM0, \XMM4
1009         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1010         PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1011         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1012         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1013         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1014
1015         pxor      (%arg1), \XMM1
1016         pxor      (%arg1), \XMM2
1017         pxor      (%arg1), \XMM3
1018         pxor      (%arg1), \XMM4
1019         movdqu    HashKey_4_k(%arg2), \TMP5
1020         PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1021         movaps 0x10(%arg1), \TMP1
1022         AESENC    \TMP1, \XMM1              # Round 1
1023         AESENC    \TMP1, \XMM2
1024         AESENC    \TMP1, \XMM3
1025         AESENC    \TMP1, \XMM4
1026         movaps 0x20(%arg1), \TMP1
1027         AESENC    \TMP1, \XMM1              # Round 2
1028         AESENC    \TMP1, \XMM2
1029         AESENC    \TMP1, \XMM3
1030         AESENC    \TMP1, \XMM4
1031         movdqa    \XMM6, \TMP1
1032         pshufd    $78, \XMM6, \TMP2
1033         pxor      \XMM6, \TMP2
1034         movdqu    HashKey_3(%arg2), \TMP5
1035         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1036         movaps 0x30(%arg1), \TMP3
1037         AESENC    \TMP3, \XMM1              # Round 3
1038         AESENC    \TMP3, \XMM2
1039         AESENC    \TMP3, \XMM3
1040         AESENC    \TMP3, \XMM4
1041         PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1042         movaps 0x40(%arg1), \TMP3
1043         AESENC    \TMP3, \XMM1              # Round 4
1044         AESENC    \TMP3, \XMM2
1045         AESENC    \TMP3, \XMM3
1046         AESENC    \TMP3, \XMM4
1047         movdqu    HashKey_3_k(%arg2), \TMP5
1048         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1049         movaps 0x50(%arg1), \TMP3
1050         AESENC    \TMP3, \XMM1              # Round 5
1051         AESENC    \TMP3, \XMM2
1052         AESENC    \TMP3, \XMM3
1053         AESENC    \TMP3, \XMM4
1054         pxor      \TMP1, \TMP4
1055 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1056         pxor      \XMM6, \XMM5
1057         pxor      \TMP2, \TMP6
1058         movdqa    \XMM7, \TMP1
1059         pshufd    $78, \XMM7, \TMP2
1060         pxor      \XMM7, \TMP2
1061         movdqu    HashKey_2(%arg2), \TMP5
1062
1063         # Multiply XMM7 * HashKey_2 using karatsuba
1064
1065         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1066         movaps 0x60(%arg1), \TMP3
1067         AESENC    \TMP3, \XMM1              # Round 6
1068         AESENC    \TMP3, \XMM2
1069         AESENC    \TMP3, \XMM3
1070         AESENC    \TMP3, \XMM4
1071         PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1072         movaps 0x70(%arg1), \TMP3
1073         AESENC    \TMP3, \XMM1             # Round 7
1074         AESENC    \TMP3, \XMM2
1075         AESENC    \TMP3, \XMM3
1076         AESENC    \TMP3, \XMM4
1077         movdqu    HashKey_2_k(%arg2), \TMP5
1078         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1079         movaps 0x80(%arg1), \TMP3
1080         AESENC    \TMP3, \XMM1             # Round 8
1081         AESENC    \TMP3, \XMM2
1082         AESENC    \TMP3, \XMM3
1083         AESENC    \TMP3, \XMM4
1084         pxor      \TMP1, \TMP4
1085 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1086         pxor      \XMM7, \XMM5
1087         pxor      \TMP2, \TMP6
1088
1089         # Multiply XMM8 * HashKey
1090         # XMM8 and TMP5 hold the values for the two operands
1091
1092         movdqa    \XMM8, \TMP1
1093         pshufd    $78, \XMM8, \TMP2
1094         pxor      \XMM8, \TMP2
1095         movdqu    HashKey(%arg2), \TMP5
1096         PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1097         movaps 0x90(%arg1), \TMP3
1098         AESENC    \TMP3, \XMM1            # Round 9
1099         AESENC    \TMP3, \XMM2
1100         AESENC    \TMP3, \XMM3
1101         AESENC    \TMP3, \XMM4
1102         PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1103         lea       0xa0(%arg1),%r10              # r10 -> round key 10
1104         mov       keysize,%eax
1105         shr       $2,%eax                       # 128->4, 192->6, 256->8
1106         sub       $4,%eax                       # 128->0, 192->2, 256->4
1107         jz        aes_loop_par_enc_done\@        # no extra rounds to run
1108
1109 aes_loop_par_enc\@:                             # extra rounds for 192/256 keys
1110         MOVADQ    (%r10),\TMP3
1111         AESENC    \TMP3, \XMM1                  # use the macro arguments, not
1112         AESENC    \TMP3, \XMM2                  # hard-coded xmm1-xmm4, so the
1113         AESENC    \TMP3, \XMM3                  # loop agrees with rounds 1-9
1114         AESENC    \TMP3, \XMM4
1115         add       $16,%r10
1116         sub       $1,%eax
1117         jnz       aes_loop_par_enc\@
1118 aes_loop_par_enc_done\@:
1119         MOVADQ    (%r10), \TMP3
1120         AESENCLAST \TMP3, \XMM1           # last round
1121         AESENCLAST \TMP3, \XMM2
1122         AESENCLAST \TMP3, \XMM3
1123         AESENCLAST \TMP3, \XMM4
1124         movdqu    HashKey_k(%arg2), \TMP5
1125         PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1126         movdqu    (%arg4,%r11,1), \TMP3
1127         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1128         movdqu    16(%arg4,%r11,1), \TMP3
1129         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1130         movdqu    32(%arg4,%r11,1), \TMP3
1131         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1132         movdqu    48(%arg4,%r11,1), \TMP3
1133         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1134         movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1135         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1136         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1137         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1138         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1139         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1140         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1141         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1142
1143         pxor      \TMP4, \TMP1
1144         pxor      \XMM8, \XMM5
1145         pxor      \TMP6, \TMP2
1146         pxor      \TMP1, \TMP2
1147         pxor      \XMM5, \TMP2
1148         movdqa    \TMP2, \TMP3
1149         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1150         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1151         pxor      \TMP3, \XMM5
1152         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1153
1154         # first phase of reduction
1155
1156         movdqa    \XMM5, \TMP2
1157         movdqa    \XMM5, \TMP3
1158         movdqa    \XMM5, \TMP4
1159 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1160         pslld     $31, \TMP2                   # packed left shift << 31
1161         pslld     $30, \TMP3                   # packed left shift << 30
1162         pslld     $25, \TMP4                   # packed left shift << 25
1163         pxor      \TMP3, \TMP2                 # xor the shifted versions
1164         pxor      \TMP4, \TMP2
1165         movdqa    \TMP2, \TMP5
1166         psrldq    $4, \TMP5                    # right shift T5 1 DW
1167         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1168         pxor      \TMP2, \XMM5
1169
1170         # second phase of reduction
1171
1172         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1173         movdqa    \XMM5,\TMP3
1174         movdqa    \XMM5,\TMP4
1175         psrld     $1, \TMP2                    # packed right shift >> 1
1176         psrld     $2, \TMP3                    # packed right shift >> 2
1177         psrld     $7, \TMP4                    # packed right shift >> 7
1178         pxor      \TMP3,\TMP2                  # xor the shifted versions
1179         pxor      \TMP4,\TMP2
1180         pxor      \TMP5, \TMP2
1181         pxor      \TMP2, \XMM5
1182         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1183
1184         pxor      \XMM5, \XMM1                 # fold the hash into first new block
1185 .endm
1186
1187 /*
1188 * decrypt 4 blocks and ghash the 4 previous ciphertext blocks
1189 * (passed in XMM1-XMM4); the operation argument is not referenced
1190 * %arg1, %arg2, %arg3, %arg4 are used as pointers only, not modified
1191 * %r11 is the data offset value; %eax, %r10 and %xmm15 are clobbered
1192 */
1193 .macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
1194 TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1195         # keep the 4 blocks to hash, XMM1-XMM4 are reused for new counter blocks
1196         movdqa    \XMM1, \XMM5
1197         movdqa    \XMM2, \XMM6
1198         movdqa    \XMM3, \XMM7
1199         movdqa    \XMM4, \XMM8
1200
1201         movdqa    SHUF_MASK(%rip), %xmm15
1202         # multiply XMM5 * HashKey_4 using karatsuba
1203
1204         movdqa    \XMM5, \TMP4
1205         pshufd    $78, \XMM5, \TMP6
1206         pxor      \XMM5, \TMP6
1207         paddd     ONE(%rip), \XMM0              # INCR CNT
1208         movdqu    HashKey_4(%arg2), \TMP5
1209         PCLMULQDQ 0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1210         movdqa    \XMM0, \XMM1
1211         paddd     ONE(%rip), \XMM0              # INCR CNT
1212         movdqa    \XMM0, \XMM2
1213         paddd     ONE(%rip), \XMM0              # INCR CNT
1214         movdqa    \XMM0, \XMM3
1215         paddd     ONE(%rip), \XMM0              # INCR CNT
1216         movdqa    \XMM0, \XMM4
1217         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1218         PCLMULQDQ 0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1219         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1220         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1221         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1222
1223         pxor      (%arg1), \XMM1
1224         pxor      (%arg1), \XMM2
1225         pxor      (%arg1), \XMM3
1226         pxor      (%arg1), \XMM4
1227         movdqu    HashKey_4_k(%arg2), \TMP5
1228         PCLMULQDQ 0x00, \TMP5, \TMP6           # TMP6 = (a1+a0)*(b1+b0)
1229         movaps 0x10(%arg1), \TMP1
1230         AESENC    \TMP1, \XMM1              # Round 1
1231         AESENC    \TMP1, \XMM2
1232         AESENC    \TMP1, \XMM3
1233         AESENC    \TMP1, \XMM4
1234         movaps 0x20(%arg1), \TMP1
1235         AESENC    \TMP1, \XMM1              # Round 2
1236         AESENC    \TMP1, \XMM2
1237         AESENC    \TMP1, \XMM3
1238         AESENC    \TMP1, \XMM4
1239         movdqa    \XMM6, \TMP1
1240         pshufd    $78, \XMM6, \TMP2
1241         pxor      \XMM6, \TMP2
1242         movdqu    HashKey_3(%arg2), \TMP5
1243         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1 * b1
1244         movaps 0x30(%arg1), \TMP3
1245         AESENC    \TMP3, \XMM1              # Round 3
1246         AESENC    \TMP3, \XMM2
1247         AESENC    \TMP3, \XMM3
1248         AESENC    \TMP3, \XMM4
1249         PCLMULQDQ 0x00, \TMP5, \XMM6           # XMM6 = a0*b0
1250         movaps 0x40(%arg1), \TMP3
1251         AESENC    \TMP3, \XMM1              # Round 4
1252         AESENC    \TMP3, \XMM2
1253         AESENC    \TMP3, \XMM3
1254         AESENC    \TMP3, \XMM4
1255         movdqu    HashKey_3_k(%arg2), \TMP5
1256         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1257         movaps 0x50(%arg1), \TMP3
1258         AESENC    \TMP3, \XMM1              # Round 5
1259         AESENC    \TMP3, \XMM2
1260         AESENC    \TMP3, \XMM3
1261         AESENC    \TMP3, \XMM4
1262         pxor      \TMP1, \TMP4
1263 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1264         pxor      \XMM6, \XMM5
1265         pxor      \TMP2, \TMP6
1266         movdqa    \XMM7, \TMP1
1267         pshufd    $78, \XMM7, \TMP2
1268         pxor      \XMM7, \TMP2
1269         movdqu    HashKey_2(%arg2), \TMP5
1270
1271         # Multiply XMM7 * HashKey_2 using karatsuba
1272
1273         PCLMULQDQ 0x11, \TMP5, \TMP1           # TMP1 = a1*b1
1274         movaps 0x60(%arg1), \TMP3
1275         AESENC    \TMP3, \XMM1              # Round 6
1276         AESENC    \TMP3, \XMM2
1277         AESENC    \TMP3, \XMM3
1278         AESENC    \TMP3, \XMM4
1279         PCLMULQDQ 0x00, \TMP5, \XMM7           # XMM7 = a0*b0
1280         movaps 0x70(%arg1), \TMP3
1281         AESENC    \TMP3, \XMM1             # Round 7
1282         AESENC    \TMP3, \XMM2
1283         AESENC    \TMP3, \XMM3
1284         AESENC    \TMP3, \XMM4
1285         movdqu    HashKey_2_k(%arg2), \TMP5
1286         PCLMULQDQ 0x00, \TMP5, \TMP2           # TMP2 = (a1+a0)*(b1+b0)
1287         movaps 0x80(%arg1), \TMP3
1288         AESENC    \TMP3, \XMM1             # Round 8
1289         AESENC    \TMP3, \XMM2
1290         AESENC    \TMP3, \XMM3
1291         AESENC    \TMP3, \XMM4
1292         pxor      \TMP1, \TMP4
1293 # accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1294         pxor      \XMM7, \XMM5
1295         pxor      \TMP2, \TMP6
1296
1297         # Multiply XMM8 * HashKey
1298         # XMM8 and TMP5 hold the values for the two operands
1299
1300         movdqa    \XMM8, \TMP1
1301         pshufd    $78, \XMM8, \TMP2
1302         pxor      \XMM8, \TMP2
1303         movdqu    HashKey(%arg2), \TMP5
1304         PCLMULQDQ 0x11, \TMP5, \TMP1          # TMP1 = a1*b1
1305         movaps 0x90(%arg1), \TMP3
1306         AESENC    \TMP3, \XMM1            # Round 9
1307         AESENC    \TMP3, \XMM2
1308         AESENC    \TMP3, \XMM3
1309         AESENC    \TMP3, \XMM4
1310         PCLMULQDQ 0x00, \TMP5, \XMM8          # XMM8 = a0*b0
1311         lea       0xa0(%arg1),%r10              # r10 -> round key 10
1312         mov       keysize,%eax
1313         shr       $2,%eax                       # 128->4, 192->6, 256->8
1314         sub       $4,%eax                       # 128->0, 192->2, 256->4
1315         jz        aes_loop_par_dec_done\@        # no extra rounds to run
1316
1317 aes_loop_par_dec\@:                             # extra rounds for 192/256 keys
1318         MOVADQ    (%r10),\TMP3
1319         AESENC    \TMP3, \XMM1                  # use the macro arguments, not
1320         AESENC    \TMP3, \XMM2                  # hard-coded xmm1-xmm4, so the
1321         AESENC    \TMP3, \XMM3                  # loop agrees with rounds 1-9
1322         AESENC    \TMP3, \XMM4
1323         add       $16,%r10
1324         sub       $1,%eax
1325         jnz       aes_loop_par_dec\@
1326 aes_loop_par_dec_done\@:
1327         MOVADQ    (%r10), \TMP3
1328         AESENCLAST \TMP3, \XMM1           # last round
1329         AESENCLAST \TMP3, \XMM2
1330         AESENCLAST \TMP3, \XMM3
1331         AESENCLAST \TMP3, \XMM4
1332         movdqu    HashKey_k(%arg2), \TMP5
1333         PCLMULQDQ 0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1334         movdqu    (%arg4,%r11,1), \TMP3
1335         pxor      \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1336         movdqu    \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1337         movdqa    \TMP3, \XMM1                 # keep the input block for GHASH
1338         movdqu    16(%arg4,%r11,1), \TMP3
1339         pxor      \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1340         movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1341         movdqa    \TMP3, \XMM2                 # keep the input block for GHASH
1342         movdqu    32(%arg4,%r11,1), \TMP3
1343         pxor      \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1344         movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1345         movdqa    \TMP3, \XMM3                 # keep the input block for GHASH
1346         movdqu    48(%arg4,%r11,1), \TMP3
1347         pxor      \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1348         movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1349         movdqa    \TMP3, \XMM4                 # keep the input block for GHASH
1350         PSHUFB_XMM %xmm15, \XMM1        # perform a 16 byte swap
1351         PSHUFB_XMM %xmm15, \XMM2        # perform a 16 byte swap
1352         PSHUFB_XMM %xmm15, \XMM3        # perform a 16 byte swap
1353         PSHUFB_XMM %xmm15, \XMM4        # perform a 16 byte swap
1354
1355         pxor      \TMP4, \TMP1
1356         pxor      \XMM8, \XMM5
1357         pxor      \TMP6, \TMP2
1358         pxor      \TMP1, \TMP2
1359         pxor      \XMM5, \TMP2
1360         movdqa    \TMP2, \TMP3
1361         pslldq    $8, \TMP3                    # left shift TMP3 2 DWs
1362         psrldq    $8, \TMP2                    # right shift TMP2 2 DWs
1363         pxor      \TMP3, \XMM5
1364         pxor      \TMP2, \TMP1    # accumulate the results in TMP1:XMM5
1365
1366         # first phase of reduction
1367
1368         movdqa    \XMM5, \TMP2
1369         movdqa    \XMM5, \TMP3
1370         movdqa    \XMM5, \TMP4
1371 # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1372         pslld     $31, \TMP2                   # packed left shift << 31
1373         pslld     $30, \TMP3                   # packed left shift << 30
1374         pslld     $25, \TMP4                   # packed left shift << 25
1375         pxor      \TMP3, \TMP2                 # xor the shifted versions
1376         pxor      \TMP4, \TMP2
1377         movdqa    \TMP2, \TMP5
1378         psrldq    $4, \TMP5                    # right shift T5 1 DW
1379         pslldq    $12, \TMP2                   # left shift T2 3 DWs
1380         pxor      \TMP2, \XMM5
1381
1382         # second phase of reduction
1383
1384         movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1385         movdqa    \XMM5,\TMP3
1386         movdqa    \XMM5,\TMP4
1387         psrld     $1, \TMP2                    # packed right shift >> 1
1388         psrld     $2, \TMP3                    # packed right shift >> 2
1389         psrld     $7, \TMP4                    # packed right shift >> 7
1390         pxor      \TMP3,\TMP2                  # xor the shifted versions
1391         pxor      \TMP4,\TMP2
1392         pxor      \TMP5, \TMP2
1393         pxor      \TMP2, \XMM5
1394         pxor      \TMP1, \XMM5                 # reduced result is in XMM5
1395
1396         pxor      \XMM5, \XMM1                 # fold the hash into first new block
1397 .endm
1398
1399 /* GHASH the last 4 ciphertext blocks; the reduced result is left in XMMDst. */
1400 .macro  GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1401 TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1402         # inputs XMM1-XMM4 and all seven TMP registers are overwritten
1403         # Multiply XMM1 * HashKey_4 (using Karatsuba)
1404
1405         movdqa    \XMM1, \TMP6
1406         pshufd    $78, \XMM1, \TMP2
1407         pxor      \XMM1, \TMP2
1408         movdqu    HashKey_4(%arg2), \TMP5
1409         PCLMULQDQ 0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1410         PCLMULQDQ 0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1411         movdqu    HashKey_4_k(%arg2), \TMP4
1412         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1413         movdqa    \XMM1, \XMMDst
1414         movdqa    \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1415
1416         # Multiply XMM2 * HashKey_3 (using Karatsuba)
1417
1418         movdqa    \XMM2, \TMP1
1419         pshufd    $78, \XMM2, \TMP2
1420         pxor      \XMM2, \TMP2
1421         movdqu    HashKey_3(%arg2), \TMP5
1422         PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1423         PCLMULQDQ 0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1424         movdqu    HashKey_3_k(%arg2), \TMP4
1425         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1426         pxor      \TMP1, \TMP6
1427         pxor      \XMM2, \XMMDst
1428         pxor      \TMP2, \XMM1
1429 # results accumulated in TMP6, XMMDst, XMM1
1430
1431         # Multiply XMM3 * HashKey_2 (using Karatsuba)
1432
1433         movdqa    \XMM3, \TMP1
1434         pshufd    $78, \XMM3, \TMP2
1435         pxor      \XMM3, \TMP2
1436         movdqu    HashKey_2(%arg2), \TMP5
1437         PCLMULQDQ 0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1438         PCLMULQDQ 0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1439         movdqu    HashKey_2_k(%arg2), \TMP4
1440         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1441         pxor      \TMP1, \TMP6
1442         pxor      \XMM3, \XMMDst
1443         pxor      \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1444
1445         # Multiply XMM4 * HashKey (using Karatsuba)
1446         movdqa    \XMM4, \TMP1
1447         pshufd    $78, \XMM4, \TMP2
1448         pxor      \XMM4, \TMP2
1449         movdqu    HashKey(%arg2), \TMP5
1450         PCLMULQDQ 0x11, \TMP5, \TMP1        # TMP1 = a1*b1
1451         PCLMULQDQ 0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1452         movdqu    HashKey_k(%arg2), \TMP4
1453         PCLMULQDQ 0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1454         pxor      \TMP1, \TMP6
1455         pxor      \XMM4, \XMMDst
1456         pxor      \XMM1, \TMP2
1457         pxor      \TMP6, \TMP2
1458         pxor      \XMMDst, \TMP2
1459         # middle section of the temp results combined as in karatsuba algorithm
1460         movdqa    \TMP2, \TMP4
1461         pslldq    $8, \TMP4                 # left shift TMP4 2 DWs
1462         psrldq    $8, \TMP2                 # right shift TMP2 2 DWs
1463         pxor      \TMP4, \XMMDst
1464         pxor      \TMP2, \TMP6
1465 # TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1466         # first phase of the reduction
1467         movdqa    \XMMDst, \TMP2
1468         movdqa    \XMMDst, \TMP3
1469         movdqa    \XMMDst, \TMP4
1470 # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1471         pslld     $31, \TMP2                # packed left shift << 31
1472         pslld     $30, \TMP3                # packed left shift << 30
1473         pslld     $25, \TMP4                # packed left shift << 25
1474         pxor      \TMP3, \TMP2              # xor the shifted versions
1475         pxor      \TMP4, \TMP2
1476         movdqa    \TMP2, \TMP7
1477         psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1478         pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1479         pxor      \TMP2, \XMMDst
1480
1481         # second phase of the reduction
1482         movdqa    \XMMDst, \TMP2
1483         # make 3 copies of XMMDst for doing 3 shift operations
1484         movdqa    \XMMDst, \TMP3
1485         movdqa    \XMMDst, \TMP4
1486         psrld     $1, \TMP2                 # packed right shift >> 1
1487         psrld     $2, \TMP3                 # packed right shift >> 2
1488         psrld     $7, \TMP4                 # packed right shift >> 7
1489         pxor      \TMP3, \TMP2              # xor the shifted versions
1490         pxor      \TMP4, \TMP2
1491         pxor      \TMP7, \TMP2
1492         pxor      \TMP2, \XMMDst
1493         pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1494 .endm
1495
1496
1497 /*
1498 * Encrypt the single block held in XMM0 with the expanded key at (%arg1).
1499 * Scratch: TMP1 (round key), %eax (rounds left), %r10 (round key cursor).
1500 */
1501 .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1502         mov             keysize,%eax
1503         shr             $2,%eax                 # 128->4, 192->6, 256->8
1504         add             $5,%eax                 # 128->9, 192->11, 256->13
1505         pxor            (%arg1), \XMM0          # xor in round key 0
1506         lea             16(%arg1), %r10         # r10 -> round key 1
1507
1508 _esb_round_\@:
1509         MOVADQ          (%r10),\TMP1            # fetch the next round key
1510         AESENC          \TMP1,\XMM0
1511         lea             16(%r10),%r10           # advance the key cursor
1512         sub             $1,%eax
1513         jnz             _esb_round_\@           # until eax rounds are done
1514
1515         MOVADQ          (%r10),\TMP1            # r10 now at the final round key
1516         AESENCLAST      \TMP1,\XMM0
1517
1518 .endm
1519 /*****************************************************************************
1520 * void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1521 *                   struct gcm_context_data *data
1522 *                                      // Context data
1523 *                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1524 *                   const u8 *in,      // Ciphertext input
1525 *                   u64 plaintext_len, // Length of data in bytes for decryption.
1526 *                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1527 *                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1528 *                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1529 *                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1530 *                   const u8 *aad,     // Additional Authentication Data (AAD)
1531 *                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1532 *                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1533 *                                      // given authentication tag and only return the plaintext if they match.
1534 *                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1535 *                                      // (most likely), 12 or 8.
1536 *
1537 * Assumptions:
1538 *
1539 * keys:
1540 *       keys are pre-expanded and aligned to 16 bytes. we are using the first
1541 *       set of 11 keys in the data structure void *aes_ctx
1542 *
1543 * iv:
1544 *       0                   1                   2                   3
1545 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1546 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547 *       |                             Salt  (From the SA)               |
1548 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549 *       |                     Initialization Vector                     |
1550 *       |         (This is the sequence number from IPSec header)       |
1551 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1552 *       |                              0x1                              |
1553 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1554 *
1555 *
1556 *
1557 * AAD:
1558 *       AAD padded to 128 bits with 0
1559 *       for example, assume AAD is a u32 vector
1560 *
1561 *       if AAD is 8 bytes:
1562 *       AAD[2] = {A0, A1};
1563 *       padded AAD in xmm register = {A1 A0 0 0}
1564 *
1565 *       0                   1                   2                   3
1566 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1567 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1568 *       |                               SPI (A1)                        |
1569 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1570 *       |                     32-bit Sequence Number (A0)               |
1571 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1572 *       |                              0x0                              |
1573 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1574 *
1575 *                                       AAD Format with 32-bit Sequence Number
1576 *
1577 *       if AAD is 12 bytes:
1578 *       AAD[3] = {A0, A1, A2};
1579 *       padded AAD in xmm register = {A2 A1 A0 0}
1580 *
1581 *       0                   1                   2                   3
1582 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1583 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1584 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1585 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586 *       |                               SPI (A2)                        |
1587 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1589 *       |                                                               |
1590 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1591 *       |                              0x0                              |
1592 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1593 *
1594 *                        AAD Format with 64-bit Extended Sequence Number
1595 *
1596 * poly = x^128 + x^127 + x^126 + x^121 + 1
1597 *
1598 *****************************************************************************/
1599 ENTRY(aesni_gcm_dec)
1600         FUNC_SAVE                          # prologue macro (defined outside this section)
1601         /* Whole-message decrypt: GCM_INIT, one GCM_ENC_DEC pass in dec mode, then GCM_COMPLETE. */
1602         GCM_INIT %arg6, arg7, arg8, arg9   # presumably iv, hash_subkey, aad, aad_len in prototype order - confirm
1603         GCM_ENC_DEC dec                    # data pass, dec variant of the macro
1604         GCM_COMPLETE arg10, arg11          # presumably auth_tag and auth_tag_len - confirm
1605         FUNC_RESTORE                       # presumably the epilogue paired with FUNC_SAVE
1606         ret
1607 ENDPROC(aesni_gcm_dec)
1608
1609
1610 /*****************************************************************************
1611 * void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1612 *                    struct gcm_context_data *data
1613 *                                        // Context data
1614 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1615 *                    const u8 *in,       // Plaintext input
1616 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1617 *                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1618 *                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1619 *                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1620 *                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1621 *                    const u8 *aad,      // Additional Authentication Data (AAD)
1622 *                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1623 *                    u8 *auth_tag,       // Authenticated Tag output.
1624 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1625 *                                        // 12 or 8.
1626 *
1627 * Assumptions:
1628 *
1629 * keys:
1630 *       keys are pre-expanded and aligned to 16 bytes. we are using the
1631 *       first set of 11 keys in the data structure void *aes_ctx
1632 *
1633 *
1634 * iv:
1635 *       0                   1                   2                   3
1636 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1637 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638 *       |                             Salt  (From the SA)               |
1639 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640 *       |                     Initialization Vector                     |
1641 *       |         (This is the sequence number from IPSec header)       |
1642 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1643 *       |                              0x1                              |
1644 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1645 *
1646 *
1647 *
1648 * AAD:
1649 *       AAD padded to 128 bits with 0
1650 *       for example, assume AAD is a u32 vector
1651 *
1652 *       if AAD is 8 bytes:
1653 *       AAD[2] = {A0, A1};
1654 *       padded AAD in xmm register = {A1 A0 0 0}
1655 *
1656 *       0                   1                   2                   3
1657 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1658 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1659 *       |                               SPI (A1)                        |
1660 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1661 *       |                     32-bit Sequence Number (A0)               |
1662 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1663 *       |                              0x0                              |
1664 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1665 *
1666 *                                 AAD Format with 32-bit Sequence Number
1667 *
1668 *       if AAD is 12 bytes:
1669 *       AAD[3] = {A0, A1, A2};
1670 *       padded AAD in xmm register = {A2 A1 A0 0}
1671 *
1672 *       0                   1                   2                   3
1673 *       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1674 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675 *       |                               SPI (A2)                        |
1676 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677 *       |                 64-bit Extended Sequence Number {A1,A0}       |
1678 *       |                                                               |
1679 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1680 *       |                              0x0                              |
1681 *       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1682 *
1683 *                         AAD Format with 64-bit Extended Sequence Number
1684 *
1685 * poly = x^128 + x^127 + x^126 + x^121 + 1
1686 ***************************************************************************/
1687 ENTRY(aesni_gcm_enc)
1688         FUNC_SAVE                          # prologue macro (defined outside this section)
1689         /* Whole-message encrypt: GCM_INIT, one GCM_ENC_DEC pass in enc mode, then GCM_COMPLETE. */
1690         GCM_INIT %arg6, arg7, arg8, arg9   # prototype arguments 6-9: iv, hash_subkey, aad, aad_len
1691         GCM_ENC_DEC enc                    # data pass, enc variant of the macro
1692
1693         GCM_COMPLETE arg10, arg11          # prototype arguments 10-11: auth_tag, auth_tag_len
1694         FUNC_RESTORE                       # presumably the epilogue paired with FUNC_SAVE
1695         ret
1696 ENDPROC(aesni_gcm_enc)
1697
1698 /*****************************************************************************
1699 * void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1700 *                     struct gcm_context_data *data,
1701 *                                         // context data
1702 *                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1703 *                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1704 *                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1705 *                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1706 *                     const u8 *aad,      // Additional Authentication Data (AAD)
1707 *                     u64 aad_len)        // Length of AAD in bytes.
1708 */
1709 ENTRY(aesni_gcm_init)
1710         FUNC_SAVE                          # prologue macro (defined outside this section)
1711         GCM_INIT %arg3, %arg4,%arg5, %arg6 # init step only - prototype arguments 3-6: iv, hash_subkey, aad, aad_len
1712         FUNC_RESTORE                       # presumably the epilogue paired with FUNC_SAVE
1713         ret
1714 ENDPROC(aesni_gcm_init)
1715
1716 /*****************************************************************************
1717 * void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1718 *                    struct gcm_context_data *data,
1719 *                                        // context data
1720 *                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1721 *                    const u8 *in,       // Plaintext input
1722 *                    u64 plaintext_len,  // Length of data in bytes for encryption.
1723 */
1724 ENTRY(aesni_gcm_enc_update)
1725         FUNC_SAVE                          # prologue macro (defined outside this section)
1726         GCM_ENC_DEC enc                    # data pass only, enc variant - no GCM_INIT or GCM_COMPLETE here
1727         FUNC_RESTORE                       # presumably the epilogue paired with FUNC_SAVE
1728         ret
1729 ENDPROC(aesni_gcm_enc_update)
1730
1731 /*****************************************************************************
1732 * void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1733 *                    struct gcm_context_data *data,
1734 *                                        // context data
1735 *                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1736 *                    const u8 *in,       // Ciphertext input
1737 *                    u64 plaintext_len,  // Length of data in bytes for decryption.
1738 */
1739 ENTRY(aesni_gcm_dec_update)
1740         FUNC_SAVE                          # prologue macro (defined outside this section)
1741         GCM_ENC_DEC dec                    # data pass only, dec variant - no GCM_INIT or GCM_COMPLETE here
1742         FUNC_RESTORE                       # presumably the epilogue paired with FUNC_SAVE
1743         ret
1744 ENDPROC(aesni_gcm_dec_update)
1745
1746 /*****************************************************************************
1747 * void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1748 *                    struct gcm_context_data *data,
1749 *                                        // context data
1750 *                    u8 *auth_tag,       // Authenticated Tag output.
1751 *                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1752 *                                        // 12 or 8.
1753 */
1754 ENTRY(aesni_gcm_finalize)
1755         FUNC_SAVE                          # prologue macro (defined outside this section)
1756         GCM_COMPLETE %arg3 %arg4           # completion step only - prototype arguments 3-4: auth_tag, auth_tag_len
1757         FUNC_RESTORE                       # presumably the epilogue paired with FUNC_SAVE
1758         ret
1759 ENDPROC(aesni_gcm_finalize)
1760
1761 #endif
1762
1763
1764 .align 4
1765 _key_expansion_128:                        # two entry labels share one body: update xmm0 and store it at TKEYP
1766 _key_expansion_256a:
1767         pshufd $0b11111111, %xmm1, %xmm1   # broadcast dword 3 of xmm1 (callers fill xmm1 via AESKEYGENASSIST)
1768         shufps $0b00010000, %xmm0, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[0], xmm0[1], xmm0[0]
1769         pxor %xmm4, %xmm0
1770         shufps $0b10001100, %xmm0, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[3], xmm0[0], xmm0[2]
1771         pxor %xmm4, %xmm0                  # with xmm4[0] == 0, each dword of xmm0 is now XORed with all lower dwords
1772         pxor %xmm1, %xmm0                  # fold the broadcast dword into every lane
1773         movaps %xmm0, (TKEYP)              # aligned store of the new 16-byte round key
1774         add $0x10, TKEYP                   # advance the write pointer by one round key
1775         ret
1776 ENDPROC(_key_expansion_128)
1777 ENDPROC(_key_expansion_256a)
1778
1779 .align 4
1780 _key_expansion_192a:                       # updates xmm0 and xmm2, then stores 32 bytes at TKEYP
1781         pshufd $0b01010101, %xmm1, %xmm1   # broadcast dword 1 of xmm1 (callers fill xmm1 via AESKEYGENASSIST)
1782         shufps $0b00010000, %xmm0, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[0], xmm0[1], xmm0[0]
1783         pxor %xmm4, %xmm0
1784         shufps $0b10001100, %xmm0, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[3], xmm0[0], xmm0[2]
1785         pxor %xmm4, %xmm0                  # with xmm4[0] == 0, each dword of xmm0 is now XORed with all lower dwords
1786         pxor %xmm1, %xmm0                  # fold the broadcast dword into every lane
1787
1788         movaps %xmm2, %xmm5                # xmm5 and xmm6 keep copies of the incoming xmm2
1789         movaps %xmm2, %xmm6
1790         pslldq $4, %xmm5                   # shift the copy up by one dword (4 bytes)
1791         pshufd $0b11111111, %xmm0, %xmm3   # broadcast dword 3 of the updated xmm0
1792         pxor %xmm3, %xmm2
1793         pxor %xmm5, %xmm2                  # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast
1794
1795         movaps %xmm0, %xmm1
1796         shufps $0b01000100, %xmm0, %xmm6   # xmm6 = low two dwords of old xmm2, then low two dwords of new xmm0
1797         movaps %xmm6, (TKEYP)              # first 16 bytes stored
1798         shufps $0b01001110, %xmm2, %xmm1   # xmm1 = high two dwords of new xmm0, then low two dwords of new xmm2
1799         movaps %xmm1, 0x10(TKEYP)          # second 16 bytes stored
1800         add $0x20, TKEYP                   # advance the write pointer by 32 bytes
1801         ret
1802 ENDPROC(_key_expansion_192a)
1803
1804 .align 4
1805 _key_expansion_192b:                       # updates xmm0 and xmm2, then stores only xmm0 (16 bytes) at TKEYP
1806         pshufd $0b01010101, %xmm1, %xmm1   # broadcast dword 1 of xmm1 (callers fill xmm1 via AESKEYGENASSIST)
1807         shufps $0b00010000, %xmm0, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[0], xmm0[1], xmm0[0]
1808         pxor %xmm4, %xmm0
1809         shufps $0b10001100, %xmm0, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[3], xmm0[0], xmm0[2]
1810         pxor %xmm4, %xmm0                  # with xmm4[0] == 0, each dword of xmm0 is now XORed with all lower dwords
1811         pxor %xmm1, %xmm0                  # fold the broadcast dword into every lane
1812
1813         movaps %xmm2, %xmm5                # copy of the incoming xmm2
1814         pslldq $4, %xmm5                   # shift the copy up by one dword (4 bytes)
1815         pshufd $0b11111111, %xmm0, %xmm3   # broadcast dword 3 of the updated xmm0
1816         pxor %xmm3, %xmm2
1817         pxor %xmm5, %xmm2                  # xmm2 = old xmm2 ^ (old xmm2 << 32) ^ broadcast
1818
1819         movaps %xmm0, (TKEYP)              # aligned store of the updated xmm0
1820         add $0x10, TKEYP                   # advance the write pointer by 16 bytes
1821         ret
1822 ENDPROC(_key_expansion_192b)
1823
1824 .align 4
1825 _key_expansion_256b:                       # same shape as _key_expansion_256a but updates and stores xmm2
1826         pshufd $0b10101010, %xmm1, %xmm1   # broadcast dword 2 of xmm1 (callers fill xmm1 via AESKEYGENASSIST)
1827         shufps $0b00010000, %xmm2, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[0], xmm2[1], xmm2[0]
1828         pxor %xmm4, %xmm2
1829         shufps $0b10001100, %xmm2, %xmm4   # xmm4 dwords 0..3 = xmm4[0], xmm4[3], xmm2[0], xmm2[2]
1830         pxor %xmm4, %xmm2                  # with xmm4[0] == 0, each dword of xmm2 is now XORed with all lower dwords
1831         pxor %xmm1, %xmm2                  # fold the broadcast dword into every lane
1832         movaps %xmm2, (TKEYP)              # aligned store of the new 16-byte round key
1833         add $0x10, TKEYP                   # advance the write pointer by one round key
1834         ret
1835 ENDPROC(_key_expansion_256b)
1836
1837 /*
1838  * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1839  *                   unsigned int key_len)
1840  */
1841 ENTRY(aesni_set_key)
1842         FRAME_BEGIN
1843 #ifndef __x86_64__
1844         pushl KEYP                              # preserved here, restored by the popl before FRAME_END
1845         movl (FRAME_OFFSET+8)(%esp), KEYP       # ctx
1846         movl (FRAME_OFFSET+12)(%esp), UKEYP     # in_key
1847         movl (FRAME_OFFSET+16)(%esp), %edx      # key_len
1848 #endif
1849         movups (UKEYP), %xmm0           # user key (first 16 bytes), unaligned load
1850         movaps %xmm0, (KEYP)            # stored unchanged as the first 16 bytes of ctx
1851         lea 0x10(KEYP), TKEYP           # TKEYP = write pointer for the following round keys
1852         movl %edx, 480(KEYP)            # key_len is kept at byte offset 480 of ctx
1853         pxor %xmm4, %xmm4               # xmm4 is assumed 0 in _key_expansion_x
1854         cmp $24, %dl                    # dispatch on the low byte of key_len
1855         jb .Lenc_key128                 # below 24
1856         je .Lenc_key192                 # exactly 24, otherwise fall through to the 256-bit path
1857         movups 0x10(UKEYP), %xmm2       # other user key (bytes 16..31)
1858         movaps %xmm2, (TKEYP)           # stored unchanged as the second 16 bytes of ctx
1859         add $0x10, TKEYP
1860         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1861         call _key_expansion_256a
1862         AESKEYGENASSIST 0x1 %xmm0 %xmm1
1863         call _key_expansion_256b
1864         AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1865         call _key_expansion_256a
1866         AESKEYGENASSIST 0x2 %xmm0 %xmm1
1867         call _key_expansion_256b
1868         AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1869         call _key_expansion_256a
1870         AESKEYGENASSIST 0x4 %xmm0 %xmm1
1871         call _key_expansion_256b
1872         AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1873         call _key_expansion_256a
1874         AESKEYGENASSIST 0x8 %xmm0 %xmm1
1875         call _key_expansion_256b
1876         AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1877         call _key_expansion_256a
1878         AESKEYGENASSIST 0x10 %xmm0 %xmm1
1879         call _key_expansion_256b
1880         AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1881         call _key_expansion_256a
1882         AESKEYGENASSIST 0x20 %xmm0 %xmm1
1883         call _key_expansion_256b
1884         AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1885         call _key_expansion_256a                # no trailing 256b call: 13 stores follow the two copied key halves
1886         jmp .Ldec_key
1887 .Lenc_key192:
1888         movq 0x10(UKEYP), %xmm2         # other user key (bytes 16..23, upper half of xmm2 zeroed)
1889         AESKEYGENASSIST 0x1 %xmm2 %xmm1         # round 1
1890         call _key_expansion_192a
1891         AESKEYGENASSIST 0x2 %xmm2 %xmm1         # round 2
1892         call _key_expansion_192b
1893         AESKEYGENASSIST 0x4 %xmm2 %xmm1         # round 3
1894         call _key_expansion_192a
1895         AESKEYGENASSIST 0x8 %xmm2 %xmm1         # round 4
1896         call _key_expansion_192b
1897         AESKEYGENASSIST 0x10 %xmm2 %xmm1        # round 5
1898         call _key_expansion_192a
1899         AESKEYGENASSIST 0x20 %xmm2 %xmm1        # round 6
1900         call _key_expansion_192b
1901         AESKEYGENASSIST 0x40 %xmm2 %xmm1        # round 7
1902         call _key_expansion_192a
1903         AESKEYGENASSIST 0x80 %xmm2 %xmm1        # round 8
1904         call _key_expansion_192b
1905         jmp .Ldec_key
1906 .Lenc_key128:
1907         AESKEYGENASSIST 0x1 %xmm0 %xmm1         # round 1
1908         call _key_expansion_128
1909         AESKEYGENASSIST 0x2 %xmm0 %xmm1         # round 2
1910         call _key_expansion_128
1911         AESKEYGENASSIST 0x4 %xmm0 %xmm1         # round 3
1912         call _key_expansion_128
1913         AESKEYGENASSIST 0x8 %xmm0 %xmm1         # round 4
1914         call _key_expansion_128
1915         AESKEYGENASSIST 0x10 %xmm0 %xmm1        # round 5
1916         call _key_expansion_128
1917         AESKEYGENASSIST 0x20 %xmm0 %xmm1        # round 6
1918         call _key_expansion_128
1919         AESKEYGENASSIST 0x40 %xmm0 %xmm1        # round 7
1920         call _key_expansion_128
1921         AESKEYGENASSIST 0x80 %xmm0 %xmm1        # round 8
1922         call _key_expansion_128
1923         AESKEYGENASSIST 0x1b %xmm0 %xmm1        # round 9
1924         call _key_expansion_128
1925         AESKEYGENASSIST 0x36 %xmm0 %xmm1        # round 10
1926         call _key_expansion_128
1927 .Ldec_key:                              # build the second schedule at ctx+240 (the one aesni_dec uses)
1928         sub $0x10, TKEYP                # step back onto the last round key written above
1929         movaps (KEYP), %xmm0            # first stored round key
1930         movaps (TKEYP), %xmm1           # last stored round key
1931         movaps %xmm0, 240(TKEYP)        # first key becomes the last entry of the second schedule
1932         movaps %xmm1, 240(KEYP)         # last key becomes its first entry
1933         add $0x10, KEYP                 # read pointer: second stored round key
1934         lea 240-16(TKEYP), UKEYP        # UKEYP is reused as the descending write pointer
1935 .align 4
1936 .Ldec_key_loop:                         # middle keys are transformed and written in reverse order
1937         movaps (KEYP), %xmm0
1938         AESIMC %xmm0 %xmm1              # macro from outside this section, presumably xmm1 = InvMixColumns of xmm0
1939         movaps %xmm1, (UKEYP)
1940         add $0x10, KEYP
1941         sub $0x10, UKEYP
1942         cmp TKEYP, KEYP
1943         jb .Ldec_key_loop               # unsigned: repeat while KEYP is still below the last round key
1944         xor AREG, AREG                  # zero AREG (presumably the int return value - confirm)
1945 #ifndef __x86_64__
1946         popl KEYP
1947 #endif
1948         FRAME_END
1949         ret
1950 ENDPROC(aesni_set_key)
1951
1952 /*
1953  * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
1954  */
1955 ENTRY(aesni_enc)
1956         FRAME_BEGIN
1957 #ifndef __x86_64__
1958         pushl KEYP                              # both are restored by the popl pair before FRAME_END
1959         pushl KLEN
1960         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
1961         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
1962         movl (FRAME_OFFSET+20)(%esp), INP       # src
1963 #endif
1964         movl 480(KEYP), KLEN            # key length, read from byte offset 480 of ctx
1965         movups (INP), STATE             # input block, unaligned load
1966         call _aesni_enc1                # encrypts STATE in place
1967         movups STATE, (OUTP)            # output block, unaligned store
1968 #ifndef __x86_64__
1969         popl KLEN
1970         popl KEYP
1971 #endif
1972         FRAME_END
1973         ret
1974 ENDPROC(aesni_enc)
1975
1976 /*
1977  * _aesni_enc1:         internal ABI - encrypt one 16-byte block held in STATE
1978  * input:
1979  *      KEYP:           key struct pointer
1980  *      KLEN:           key length in bytes (compared against 24 below)
1981  *      STATE:          initial state (input)
1982  * output:
1983  *      STATE:          final state (output)
1984  * changed:
1985  *      KEY
1986  *      TKEYP (T1)
1987  */
1988 .align 4
1989 _aesni_enc1:
1990         movaps (KEYP), KEY              # key
1991         mov KEYP, TKEYP
1992         pxor KEY, STATE         # round 0
1993         add $0x30, TKEYP                # TKEYP = KEYP+0x30, base used when KLEN is below 24
1994         cmp $24, KLEN
1995         jb .Lenc128                     # KLEN below 24: 10 rounds
1996         lea 0x20(TKEYP), TKEYP          # TKEYP = KEYP+0x50, lea keeps the flags of the cmp for the je
1997         je .Lenc192                     # KLEN equal to 24: 12 rounds
1998         add $0x20, TKEYP                # TKEYP = KEYP+0x70: 14 rounds
1999         movaps -0x60(TKEYP), KEY
2000         AESENC KEY STATE
2001         movaps -0x50(TKEYP), KEY
2002         AESENC KEY STATE
2003 .align 4
2004 .Lenc192:
2005         movaps -0x40(TKEYP), KEY
2006         AESENC KEY STATE
2007         movaps -0x30(TKEYP), KEY
2008         AESENC KEY STATE
2009 .align 4
2010 .Lenc128:                               # the last ten round keys sit at -0x20 .. 0x70 from TKEYP
2011         movaps -0x20(TKEYP), KEY
2012         AESENC KEY STATE
2013         movaps -0x10(TKEYP), KEY
2014         AESENC KEY STATE
2015         movaps (TKEYP), KEY
2016         AESENC KEY STATE
2017         movaps 0x10(TKEYP), KEY
2018         AESENC KEY STATE
2019         movaps 0x20(TKEYP), KEY
2020         AESENC KEY STATE
2021         movaps 0x30(TKEYP), KEY
2022         AESENC KEY STATE
2023         movaps 0x40(TKEYP), KEY
2024         AESENC KEY STATE
2025         movaps 0x50(TKEYP), KEY
2026         AESENC KEY STATE
2027         movaps 0x60(TKEYP), KEY
2028         AESENC KEY STATE
2029         movaps 0x70(TKEYP), KEY
2030         AESENCLAST KEY STATE            # last round
2031         ret
2032 ENDPROC(_aesni_enc1)
2033
2034 /*
2035  * _aesni_enc4: internal ABI - encrypt four 16-byte blocks with the same round keys
2036  * input:
2037  *      KEYP:           key struct pointer
2038  *      KLEN:           key length in bytes (compared against 24 below)
2039  *      STATE1:         initial state (input)
2040  *      STATE2
2041  *      STATE3
2042  *      STATE4
2043  * output:
2044  *      STATE1:         final state (output)
2045  *      STATE2
2046  *      STATE3
2047  *      STATE4
2048  * changed:
2049  *      KEY
2050  *      TKEYP (T1)
2051  */
2052 .align 4
2053 _aesni_enc4:
2054         movaps (KEYP), KEY              # key
2055         mov KEYP, TKEYP
2056         pxor KEY, STATE1                # round 0
2057         pxor KEY, STATE2
2058         pxor KEY, STATE3
2059         pxor KEY, STATE4
2060         add $0x30, TKEYP                # TKEYP = KEYP+0x30, base used when KLEN is below 24
2061         cmp $24, KLEN
2062         jb .L4enc128                    # KLEN below 24: 10 rounds
2063         lea 0x20(TKEYP), TKEYP          # TKEYP = KEYP+0x50, lea keeps the flags of the cmp for the je
2064         je .L4enc192                    # KLEN equal to 24: 12 rounds
2065         add $0x20, TKEYP                # TKEYP = KEYP+0x70: 14 rounds
2066         movaps -0x60(TKEYP), KEY
2067         AESENC KEY STATE1
2068         AESENC KEY STATE2
2069         AESENC KEY STATE3
2070         AESENC KEY STATE4
2071         movaps -0x50(TKEYP), KEY
2072         AESENC KEY STATE1
2073         AESENC KEY STATE2
2074         AESENC KEY STATE3
2075         AESENC KEY STATE4
2076 #.align 4
2077 .L4enc192:
2078         movaps -0x40(TKEYP), KEY
2079         AESENC KEY STATE1
2080         AESENC KEY STATE2
2081         AESENC KEY STATE3
2082         AESENC KEY STATE4
2083         movaps -0x30(TKEYP), KEY
2084         AESENC KEY STATE1
2085         AESENC KEY STATE2
2086         AESENC KEY STATE3
2087         AESENC KEY STATE4
2088 #.align 4
2089 .L4enc128:                              # the last ten round keys sit at -0x20 .. 0x70 from TKEYP
2090         movaps -0x20(TKEYP), KEY
2091         AESENC KEY STATE1
2092         AESENC KEY STATE2
2093         AESENC KEY STATE3
2094         AESENC KEY STATE4
2095         movaps -0x10(TKEYP), KEY
2096         AESENC KEY STATE1
2097         AESENC KEY STATE2
2098         AESENC KEY STATE3
2099         AESENC KEY STATE4
2100         movaps (TKEYP), KEY
2101         AESENC KEY STATE1
2102         AESENC KEY STATE2
2103         AESENC KEY STATE3
2104         AESENC KEY STATE4
2105         movaps 0x10(TKEYP), KEY
2106         AESENC KEY STATE1
2107         AESENC KEY STATE2
2108         AESENC KEY STATE3
2109         AESENC KEY STATE4
2110         movaps 0x20(TKEYP), KEY
2111         AESENC KEY STATE1
2112         AESENC KEY STATE2
2113         AESENC KEY STATE3
2114         AESENC KEY STATE4
2115         movaps 0x30(TKEYP), KEY
2116         AESENC KEY STATE1
2117         AESENC KEY STATE2
2118         AESENC KEY STATE3
2119         AESENC KEY STATE4
2120         movaps 0x40(TKEYP), KEY
2121         AESENC KEY STATE1
2122         AESENC KEY STATE2
2123         AESENC KEY STATE3
2124         AESENC KEY STATE4
2125         movaps 0x50(TKEYP), KEY
2126         AESENC KEY STATE1
2127         AESENC KEY STATE2
2128         AESENC KEY STATE3
2129         AESENC KEY STATE4
2130         movaps 0x60(TKEYP), KEY
2131         AESENC KEY STATE1
2132         AESENC KEY STATE2
2133         AESENC KEY STATE3
2134         AESENC KEY STATE4
2135         movaps 0x70(TKEYP), KEY
2136         AESENCLAST KEY STATE1           # last round
2137         AESENCLAST KEY STATE2
2138         AESENCLAST KEY STATE3
2139         AESENCLAST KEY STATE4
2140         ret
2141 ENDPROC(_aesni_enc4)
2142
2143 /*
2144  * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
2145  */
2146 ENTRY(aesni_dec)
2147         FRAME_BEGIN
2148 #ifndef __x86_64__
2149         pushl KEYP                              # both are restored by the popl pair before FRAME_END
2150         pushl KLEN
2151         movl (FRAME_OFFSET+12)(%esp), KEYP      # ctx
2152         movl (FRAME_OFFSET+16)(%esp), OUTP      # dst
2153         movl (FRAME_OFFSET+20)(%esp), INP       # src
2154 #endif
2155         mov 480(KEYP), KLEN             # key length, read from byte offset 480 of ctx
2156         add $240, KEYP                  # switch to the second key schedule, 240 bytes into ctx
2157         movups (INP), STATE             # input block, unaligned load
2158         call _aesni_dec1                # decrypts STATE in place
2159         movups STATE, (OUTP)            # output block, unaligned store
2160 #ifndef __x86_64__
2161         popl KLEN
2162         popl KEYP
2163 #endif
2164         FRAME_END
2165         ret
2166 ENDPROC(aesni_dec)
2167
2168 /*
2169  * _aesni_dec1:         internal ABI - decrypt one 16-byte block held in STATE
2170  * input:
2171  *      KEYP:           key struct pointer (callers in this file pass ctx + 240)
2172  *      KLEN:           key length
2173  *      STATE:          initial state (input)
2174  * output:
2175  *      STATE:          final state (output)
2176  * changed:
2177  *      KEY
2178  *      TKEYP (T1)
2179  */
2180 .align 4
2181 _aesni_dec1:
2182         movaps (KEYP), KEY              # key
2183         mov KEYP, TKEYP
2184         pxor KEY, STATE         # round 0
2185         add $0x30, TKEYP                # TKEYP = KEYP+0x30, base used when KLEN is below 24
2186         cmp $24, KLEN
2187         jb .Ldec128                     # KLEN below 24: 10 rounds
2188         lea 0x20(TKEYP), TKEYP          # TKEYP = KEYP+0x50, lea keeps the flags of the cmp for the je
2189         je .Ldec192                     # KLEN equal to 24: 12 rounds
2190         add $0x20, TKEYP                # TKEYP = KEYP+0x70: 14 rounds
2191         movaps -0x60(TKEYP), KEY
2192         AESDEC KEY STATE
2193         movaps -0x50(TKEYP), KEY
2194         AESDEC KEY STATE
2195 .align 4
2196 .Ldec192:
2197         movaps -0x40(TKEYP), KEY
2198         AESDEC KEY STATE
2199         movaps -0x30(TKEYP), KEY
2200         AESDEC KEY STATE
2201 .align 4
2202 .Ldec128:                               # the last ten round keys sit at -0x20 .. 0x70 from TKEYP
2203         movaps -0x20(TKEYP), KEY
2204         AESDEC KEY STATE
2205         movaps -0x10(TKEYP), KEY
2206         AESDEC KEY STATE
2207         movaps (TKEYP), KEY
2208         AESDEC KEY STATE
2209         movaps 0x10(TKEYP), KEY
2210         AESDEC KEY STATE
2211         movaps 0x20(TKEYP), KEY
2212         AESDEC KEY STATE
2213         movaps 0x30(TKEYP), KEY
2214         AESDEC KEY STATE
2215         movaps 0x40(TKEYP), KEY
2216         AESDEC KEY STATE
2217         movaps 0x50(TKEYP), KEY
2218         AESDEC KEY STATE
2219         movaps 0x60(TKEYP), KEY
2220         AESDEC KEY STATE
2221         movaps 0x70(TKEYP), KEY
2222         AESDECLAST KEY STATE            # last round
2223         ret
2224 ENDPROC(_aesni_dec1)
2225
2226 /*
2227  * _aesni_dec4: internal ABI - decrypt four 16-byte blocks with the same round keys
2228  * input:
2229  *      KEYP:           key struct pointer (callers in this file pass ctx + 240)
2230  *      KLEN:           key length
2231  *      STATE1:         initial state (input)
2232  *      STATE2
2233  *      STATE3
2234  *      STATE4
2235  * output:
2236  *      STATE1:         final state (output)
2237  *      STATE2
2238  *      STATE3
2239  *      STATE4
2240  * changed:
2241  *      KEY
2242  *      TKEYP (T1)
2243  */
2244 .align 4
2245 _aesni_dec4:
2246         movaps (KEYP), KEY              # key
2247         mov KEYP, TKEYP
2248         pxor KEY, STATE1                # round 0
2249         pxor KEY, STATE2
2250         pxor KEY, STATE3
2251         pxor KEY, STATE4
2252         add $0x30, TKEYP                # TKEYP = KEYP+0x30, base used when KLEN is below 24
2253         cmp $24, KLEN
2254         jb .L4dec128                    # KLEN below 24: 10 rounds
2255         lea 0x20(TKEYP), TKEYP          # TKEYP = KEYP+0x50, lea keeps the flags of the cmp for the je
2256         je .L4dec192                    # KLEN equal to 24: 12 rounds
2257         add $0x20, TKEYP                # TKEYP = KEYP+0x70: 14 rounds
2258         movaps -0x60(TKEYP), KEY
2259         AESDEC KEY STATE1
2260         AESDEC KEY STATE2
2261         AESDEC KEY STATE3
2262         AESDEC KEY STATE4
2263         movaps -0x50(TKEYP), KEY
2264         AESDEC KEY STATE1
2265         AESDEC KEY STATE2
2266         AESDEC KEY STATE3
2267         AESDEC KEY STATE4
2268 .align 4
2269 .L4dec192:
2270         movaps -0x40(TKEYP), KEY
2271         AESDEC KEY STATE1
2272         AESDEC KEY STATE2
2273         AESDEC KEY STATE3
2274         AESDEC KEY STATE4
2275         movaps -0x30(TKEYP), KEY
2276         AESDEC KEY STATE1
2277         AESDEC KEY STATE2
2278         AESDEC KEY STATE3
2279         AESDEC KEY STATE4
2280 .align 4
2281 .L4dec128:                              # the last ten round keys sit at -0x20 .. 0x70 from TKEYP
2282         movaps -0x20(TKEYP), KEY
2283         AESDEC KEY STATE1
2284         AESDEC KEY STATE2
2285         AESDEC KEY STATE3
2286         AESDEC KEY STATE4
2287         movaps -0x10(TKEYP), KEY
2288         AESDEC KEY STATE1
2289         AESDEC KEY STATE2
2290         AESDEC KEY STATE3
2291         AESDEC KEY STATE4
2292         movaps (TKEYP), KEY
2293         AESDEC KEY STATE1
2294         AESDEC KEY STATE2
2295         AESDEC KEY STATE3
2296         AESDEC KEY STATE4
2297         movaps 0x10(TKEYP), KEY
2298         AESDEC KEY STATE1
2299         AESDEC KEY STATE2
2300         AESDEC KEY STATE3
2301         AESDEC KEY STATE4
2302         movaps 0x20(TKEYP), KEY
2303         AESDEC KEY STATE1
2304         AESDEC KEY STATE2
2305         AESDEC KEY STATE3
2306         AESDEC KEY STATE4
2307         movaps 0x30(TKEYP), KEY
2308         AESDEC KEY STATE1
2309         AESDEC KEY STATE2
2310         AESDEC KEY STATE3
2311         AESDEC KEY STATE4
2312         movaps 0x40(TKEYP), KEY
2313         AESDEC KEY STATE1
2314         AESDEC KEY STATE2
2315         AESDEC KEY STATE3
2316         AESDEC KEY STATE4
2317         movaps 0x50(TKEYP), KEY
2318         AESDEC KEY STATE1
2319         AESDEC KEY STATE2
2320         AESDEC KEY STATE3
2321         AESDEC KEY STATE4
2322         movaps 0x60(TKEYP), KEY
2323         AESDEC KEY STATE1
2324         AESDEC KEY STATE2
2325         AESDEC KEY STATE3
2326         AESDEC KEY STATE4
2327         movaps 0x70(TKEYP), KEY
2328         AESDECLAST KEY STATE1           # last round
2329         AESDECLAST KEY STATE2
2330         AESDECLAST KEY STATE3
2331         AESDECLAST KEY STATE4
2332         ret
2333 ENDPROC(_aesni_dec4)
2334
2335 /* ECB-encrypt whole 16-byte blocks from src to dst; a trailing partial block (len % 16) is left untouched.
2336  * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2337  *                    size_t len)
2338  */
2339 ENTRY(aesni_ecb_enc)
2340         FRAME_BEGIN
2341 #ifndef __x86_64__
2342         pushl LEN                               # all three are restored at .Lecb_enc_ret
2343         pushl KEYP
2344         pushl KLEN
2345         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2346         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2347         movl (FRAME_OFFSET+24)(%esp), INP       # src
2348         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2349 #endif
2350         test LEN, LEN           # check length
2351         jz .Lecb_enc_ret
2352         mov 480(KEYP), KLEN             # key length, read from byte offset 480 of ctx
2353         cmp $16, LEN
2354         jb .Lecb_enc_ret                # less than one block: nothing to do
2355         cmp $64, LEN
2356         jb .Lecb_enc_loop1              # fewer than four blocks: single-block loop only
2357 .align 4
2358 .Lecb_enc_loop4:                        # invariant: LEN is at least 64 here
2359         movups (INP), STATE1
2360         movups 0x10(INP), STATE2
2361         movups 0x20(INP), STATE3
2362         movups 0x30(INP), STATE4
2363         call _aesni_enc4
2364         movups STATE1, (OUTP)
2365         movups STATE2, 0x10(OUTP)
2366         movups STATE3, 0x20(OUTP)
2367         movups STATE4, 0x30(OUTP)
2368         sub $64, LEN
2369         add $64, INP
2370         add $64, OUTP
2371         cmp $64, LEN
2372         jae .Lecb_enc_loop4             # unsigned test, len is a size_t (matches the jb checks above)
2373         cmp $16, LEN
2374         jb .Lecb_enc_ret
2375 .align 4
2376 .Lecb_enc_loop1:                        # invariant: LEN is at least 16 here
2377         movups (INP), STATE1            # _aesni_enc1 works on STATE, presumably the same register as STATE1 - confirm
2378         call _aesni_enc1
2379         movups STATE1, (OUTP)
2380         sub $16, LEN
2381         add $16, INP
2382         add $16, OUTP
2383         cmp $16, LEN
2384         jae .Lecb_enc_loop1             # unsigned test, len is a size_t
2385 .Lecb_enc_ret:
2386 #ifndef __x86_64__
2387         popl KLEN
2388         popl KEYP
2389         popl LEN
2390 #endif
2391         FRAME_END
2392         ret
2393 ENDPROC(aesni_ecb_enc)
2394
2395 /* ECB-decrypt whole 16-byte blocks from src to dst; a trailing partial block (len % 16) is left untouched.
2396  * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2397  *                    size_t len);
2398  */
2399 ENTRY(aesni_ecb_dec)
2400         FRAME_BEGIN
2401 #ifndef __x86_64__
2402         pushl LEN                               # all three are restored at .Lecb_dec_ret
2403         pushl KEYP
2404         pushl KLEN
2405         movl (FRAME_OFFSET+16)(%esp), KEYP      # ctx
2406         movl (FRAME_OFFSET+20)(%esp), OUTP      # dst
2407         movl (FRAME_OFFSET+24)(%esp), INP       # src
2408         movl (FRAME_OFFSET+28)(%esp), LEN       # len
2409 #endif
2410         test LEN, LEN                   # check length
2411         jz .Lecb_dec_ret
2412         mov 480(KEYP), KLEN             # key length, read from byte offset 480 of ctx
2413         add $240, KEYP                  # switch to the second key schedule, 240 bytes into ctx
2414         cmp $16, LEN
2415         jb .Lecb_dec_ret                # less than one block: nothing to do
2416         cmp $64, LEN
2417         jb .Lecb_dec_loop1              # fewer than four blocks: single-block loop only
2418 .align 4
2419 .Lecb_dec_loop4:                        # invariant: LEN is at least 64 here
2420         movups (INP), STATE1
2421         movups 0x10(INP), STATE2
2422         movups 0x20(INP), STATE3
2423         movups 0x30(INP), STATE4
2424         call _aesni_dec4
2425         movups STATE1, (OUTP)
2426         movups STATE2, 0x10(OUTP)
2427         movups STATE3, 0x20(OUTP)
2428         movups STATE4, 0x30(OUTP)
2429         sub $64, LEN
2430         add $64, INP
2431         add $64, OUTP
2432         cmp $64, LEN
2433         jae .Lecb_dec_loop4             # unsigned test, len is a size_t (matches the jb checks above)
2434         cmp $16, LEN
2435         jb .Lecb_dec_ret
2436 .align 4
2437 .Lecb_dec_loop1:                        # invariant: LEN is at least 16 here
2438         movups (INP), STATE1            # _aesni_dec1 works on STATE, presumably the same register as STATE1 - confirm
2439         call _aesni_dec1
2440         movups STATE1, (OUTP)
2441         sub $16, LEN
2442         add $16, INP
2443         add $16, OUTP
2444         cmp $16, LEN
2445         jae .Lecb_dec_loop1             # unsigned test, len is a size_t
2446 .Lecb_dec_ret:
2447 #ifndef __x86_64__
2448         popl KLEN
2449         popl KEYP
2450         popl LEN
2451 #endif
2452         FRAME_END
2453         ret
2454 ENDPROC(aesni_ecb_dec)
2455
/*
 * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-encrypt len bytes from src into dst.  The running state starts as
 * *iv; each input block is XORed into it and the result is encrypted, so
 * blocks are processed strictly one after another.  On return *iv holds
 * the last block written to dst.  If len is below 16 nothing is done and
 * *iv is left unchanged; a trailing remainder of fewer than 16 bytes is
 * ignored.
 */
ENTRY(aesni_cbc_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	/*
	 * 32-bit: stack arguments start above the four saved registers
	 * and the return address, hence the +20 base offset.
	 */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_enc_ret
	mov 480(KEYP), KLEN		# key length field of ctx
	movups (IVP), STATE		# load iv as initial state
.align 4
.Lcbc_enc_loop:
	movups (INP), IN		# load input
	pxor IN, STATE			# chain: state ^= input block
	call _aesni_enc1
	movups STATE, (OUTP)		# store output
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_enc_loop		# LEN is a size: compare unsigned
	movups STATE, (IVP)		# last output block is the next iv
.Lcbc_enc_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_enc)
2499
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes from src into dst.  Each decrypted block is XORed
 * with the previous input block (or with *iv for the first block).  Four
 * blocks are handled per iteration while at least 64 bytes remain, then
 * one block per iteration.  On return *iv holds the last input block that
 * was consumed.  If len is below 16 nothing is done and *iv is left
 * unchanged; a trailing remainder of fewer than 16 bytes is ignored.
 */
ENTRY(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	/*
	 * 32-bit: stack arguments start above the four saved registers
	 * and the return address, hence the +20 base offset.
	 */
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret
	mov 480(KEYP), KLEN		# key length field of ctx
	add $240, KEYP			# decryption uses the key data at ctx + 240
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	/* Keep a copy of every input block: it chains into the next one. */
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/*
	 * The 32-bit path uses only IN1/IN2, so they are overwritten with
	 * input blocks 2 and 3; blocks 0 and 1 are reloaded from INP after
	 * the decryption.
	 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1
#ifdef __x86_64__
	pxor IN1, STATE2
	pxor IN2, STATE3
	pxor IN3, STATE4
	movaps IN4, IV			# last input block chains into the next round
#else
	pxor IN1, STATE4		# IN1 == input block 2 here
	movaps IN2, IV			# IN2 == input block 3 here
	/* Reload input blocks 0 and 1 before anything is stored to OUTP. */
	movups (INP), IN1
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lcbc_dec_loop4		# LEN is a size: compare unsigned
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this input block chains into the next one
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lcbc_dec_loop1		# LEN is a size: compare unsigned
.Lcbc_dec_ret:
	movups IV, (IVP)		# hand the chaining value back to the caller
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	ret
ENDPROC(aesni_cbc_dec)
2592
2593 #ifdef __x86_64__
.pushsection .rodata
.balign 16
/*
 * Byte-shuffle control for PSHUFB: output byte i takes input byte 15 - i,
 * i.e. the whole 16-byte value is byte-reversed.
 */
.Lbswap_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8
	.byte 7, 6, 5, 4, 3, 2, 1, 0
.popsection
2599
/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
.align 4
_aesni_inc_init:
	/* RIP-relative load keeps this 64-bit-only code position independent */
	movaps .Lbswap_mask(%rip), BSWAP_MASK
	movaps IV, CTR
	PSHUFB_XMM BSWAP_MASK CTR	# byte-reverse the copy of IV
	mov $1, TCTR_LOW
	MOVQ_R64_XMM TCTR_LOW INC	# INC = 1, staged through TCTR_LOW
	MOVQ_R64_XMM CTR TCTR_LOW	# TCTR_LOW = low qword of CTR
	ret
ENDPROC(_aesni_inc_init)
2621
/*
 * _aesni_inc:		internal ABI
 *	Advance the big endian IV by one.
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	previous value plus 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *
 * The scalar copy TCTR_LOW exists to detect a carry out of the low qword,
 * which paddq cannot report.  It must be bumped with add rather than inc
 * so that CF is produced for the jnc below.
 */
.align 4
_aesni_inc:
	add $1, TCTR_LOW		# sets CF when the low qword wraps
	paddq INC, CTR			# SSE op: leaves EFLAGS untouched
	jnc .Linc_low
	/* Low qword wrapped: move the 1 up, add it to the high qword, move it back. */
	pslldq $8, INC
	paddq INC, CTR
	psrldq $8, INC
.Linc_low:
	movaps CTR, IV
	PSHUFB_XMM BSWAP_MASK IV	# back to big endian
	ret
ENDPROC(_aesni_inc)
2650
/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR mode: encrypt successive counter values and XOR the result with the
 * bytes from src, writing to dst.  *iv is the initial big endian counter
 * block; on return it holds the counter that follows the last one used.
 * Four blocks are handled per iteration while at least 64 bytes remain,
 * then one block per iteration.  If len is below 16 nothing is done and
 * *iv is left unchanged; a trailing remainder of fewer than 16 bytes is
 * ignored.
 */
ENTRY(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN		# key length field of ctx
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	/* Each STATEn takes the current counter, then IV is advanced. */
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jae .Lctr_enc_loop4		# LEN is a size: compare unsigned
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jae .Lctr_enc_loop1		# LEN is a size: compare unsigned
.Lctr_enc_ret:
	movups IV, (IVP)		# hand the next counter back to the caller
.Lctr_enc_just_ret:
	FRAME_END
	ret
ENDPROC(aesni_ctr_enc)
2713
/*
 * _aesni_gf128mul_x_ble:		internal ABI
 *	Multiply in GF(2^128) for XTS IVs
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 *
 * paddq doubles each 64-bit half of IV separately, dropping the top bit of
 * each half.  Those two bits are recovered beforehand: pshufd $0x13 places
 * dword 3 of IV in dword 0 of CTR and dword 1 of IV in dword 2 of CTR, and
 * psrad $31 turns each dword into all ones or all zeros according to its
 * top bit.  Masking with GF128MUL_MASK and XORing into the doubled value
 * then applies the corrections (presumably 0x87 into the low half and 0x01
 * into the high half; the mask is defined elsewhere in the file).
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	paddq IV, IV; \
	pxor CTR, IV;
2731
/*
 * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *			 bool enc, u8 *iv)
 *
 * Process exactly eight 16-byte blocks (0x80 bytes) from src into dst.
 * Each block is XORed with its tweak, run through the 4-block cipher
 * routine, and XORed with the same tweak again.  The first tweak is *iv;
 * each following one is derived with _aesni_gf128mul_x_ble().  On return
 * *iv holds the tweak that follows the eighth block.
 *
 * The tweaks are parked in the dst buffer while the cipher runs and are
 * read back from there afterwards, so dst is used as scratch space.
 * INC serves purely as a temporary here.
 */
ENTRY(aesni_xts_crypt8)
	FRAME_BEGIN
	/*
	 * Select the cipher routine and key offset from enc (in %cl).
	 * The flags set by cmpb must survive until the cmovs, so %ecx is
	 * cleared with movl rather than xor.
	 */
	cmpb $0, %cl
	movl $0, %ecx
	movl $240, %r10d
	leaq _aesni_enc4(%rip), %r11
	leaq _aesni_dec4(%rip), %rax
	cmovel %r10d, %ecx		# enc == 0: key data at ctx + 240
	cmoveq %rax, %r11		# enc == 0: use _aesni_dec4

	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
	movups (IVP), IV

	mov 480(KEYP), KLEN		# key length field of ctx
	addq %rcx, KEYP

	/* Blocks 0-3: state = tweak ^ input, tweak parked in dst. */
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	CALL_NOSPEC %r11

	/*
	 * Finish block n (XOR with its parked tweak, store), then set up
	 * block n + 4 in the register that was just freed.
	 */
	movdqu 0x00(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x00(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE1
	movdqu 0x40(INP), INC
	pxor INC, STATE1
	movdqu IV, 0x40(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x50(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x50(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x60(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x60(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x70(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x70(OUTP)

	/* Tweak following block 7 goes back to the caller. */
	_aesni_gf128mul_x_ble()
	movups IV, (IVP)

	CALL_NOSPEC %r11

	/* Blocks 4-7: XOR with the parked tweaks and store. */
	movdqu 0x40(OUTP), INC
	pxor INC, STATE1
	movdqu STATE1, 0x40(OUTP)

	movdqu 0x50(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x50(OUTP)

	movdqu 0x60(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x60(OUTP)

	movdqu 0x70(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x70(OUTP)

	FRAME_END
	ret
ENDPROC(aesni_xts_crypt8)
2841
2842 #endif