/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI. Two
// implementations are provided, one that uses AVX and one that doesn't. They
// are very similar, being generated by the same macros. The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
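//
// For illustration only (not part of this file): C glue code choosing between
// the two implementations could look roughly like the sketch below. The
// feature flag names are the standard <asm/cpufeatures.h> ones; real glue code
// must additionally verify OS support for the AVX register state before using
// the *_aesni_avx functions.
//
//	if (boot_cpu_has(X86_FEATURE_AES) &&
//	    boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
//	    boot_cpu_has(X86_FEATURE_XMM4_1)) {
//		use_avx = boot_cpu_has(X86_FEATURE_AVX);
//		/* register the aes_gcm_*_aesni or aes_gcm_*_aesni_avx
//		   functions accordingly */
//	}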
// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
//    - The vector length is fixed at 128 bits, i.e. xmm registers. This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
//      32. We work around this by being much more careful about using
//      registers, relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either. We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints. First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
//      do an 8-register wide loop. Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions. Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication. This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block. (This is without the three-argument
//      XOR support that would be provided by AVX512 / AVX10, which would be
//      more beneficial to schoolbook than Karatsuba.)
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3. It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial. On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested. However, benchmarks on
//      available CPUs suggest that this approximation is plausible. Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method. We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64. This eliminates one step of the reduction. However,
//      this is incompatible with Karatsuba multiplication. Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction. For single-block processing, we use the x^64 optimization.
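//
//      For concreteness, the two strategies, writing a = a_H*x^64 + a_L and
//      b = b_H*x^64 + b_L (all additions are XORs, i.e. GF(2) arithmetic):
//
//          Schoolbook:  LO = a_L*b_L
//                       HI = a_H*b_H
//                       MI = a_L*b_H + a_H*b_L                 (2 pclmulqdq)
//
//          Karatsuba:   LO = a_L*b_L
//                       HI = a_H*b_H
//                       MI = (a_L+a_H)*(b_L+b_H) + LO + HI     (1 pclmulqdq)
//
//      Karatsuba thus trades one pclmulqdq for extra XORs, with a_L+a_H loaded
//      from the precomputed values at OFFSETOF_H_POWERS_XORED. The x^64
//      optimization used for single blocks instead precomputes a*x^64 in
//      reduced form, which folds the first reduction step into the
//      multiplication itself; see _ghash_mul_step below for the exact formulas.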
#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001

	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
	// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688
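//
// For reference, a hypothetical C view of the layout these offsets assume (the
// authoritative definition lives in the C glue code; this sketch assumes the
// struct begins with the standard expanded-AES-key layout, struct
// crypto_aes_ctx, whose key length field sits at byte offset 480):
//
//	struct aes_gcm_key_aesni_layout {
//		u32 aes_key_enc[60];	//   0: AES encryption round keys
//		u32 aes_key_dec[60];	// 240: AES decryption round keys
//		u32 aes_keylen;		// 480: AES key length in bytes
//		u8  pad[12];		// 484: pad to 16-byte alignment
//		u8  h_powers[8][16];	// 496: hash key powers H^8 through H^1
//		u64 h_powers_xored[8];	// 624: XOR'd halves of H^8 through H^1
//		u64 h_times_x64[2];	// 688: H^1 * x^64 (reduced)
//	};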
// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro _vpclmulqdq imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro _vpshufb src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
// all operands are distinct.
.macro _vpand src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
// be a temporary xmm register.
.macro _xor_mem_to_reg mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
// must be a temporary xmm register.
.macro _test_mem mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm
// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro _load_partial_block src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst

	add		$4, %ecx		// LEN - 4

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes

	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
	or		\tmp64, %rax		// Combine the two parts
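//
// The overlapping-load trick for the 9 <= LEN <= 15 case above can be hard to
// see in scalar form. As a C sketch of the same idea (illustration only, not
// part of this file):
//
//	u64 lo, hi;
//	memcpy(&lo, src, 8);			/* bytes 0 .. 7            */
//	memcpy(&hi, src + len - 8, 8);		/* bytes len-8 .. len-1    */
//	hi >>= 8 * (16 - len);			/* drop bytes overlapping lo */
//	/* Resulting 16-byte block: lo in bytes 0..7, hi in bytes 8..15.
//	   The top 16 - len bytes are zero, as required. */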
// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro _store_partial_block src, dst
	sub		$8, %ecx		// LEN - 8

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes

	add		$4, %ecx		// LEN - 4

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes

	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	pextrb		$1, \src, 1(\dst)
	pextrb		$2, \src, 2(\dst)
// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
	_vpclmulqdq	$0x01, \a, \b, \t0
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
	_vpclmulqdq	$0x11, \a, \b, \t1
	pclmulqdq	$0x10, \a_times_x64, \b

	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0

	_vpclmulqdq	$0x00, \a, \b, \t0

	pshufd		$0x4e, \b, \t0

	pclmulqdq	$0x11, \a, \b

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro _ghash_reduce lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)

	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo

	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication. See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting. They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY. Both macros clobber TMP[0-2].
.macro _ghash_update_begin_8x enc

	// Initialize the inner block counter.

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.

	pshufd		$0x4e, GHASH_ACC, MI

	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro _ghash_update_continue_8x enc

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2

	pshufd		$0x4e, TMP1, TMP2

	pclmulqdq	$0x11, TMP0, TMP1

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination. It's also divided into
// two steps. TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro _ghash_update_end_8x_step i

	movq		.Lgfpoly(%rip), TMP1

	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO

	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the
// GHASH-related fields in the key struct.
.macro _aes_gcm_precompute

	// Function arguments

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi

	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		(KEY), H_POW1	// Zero-th round key XOR all-zeroes block
	aesenc		(%rax), H_POW1
	cmp		%rax, RNDKEYLAST_PTR
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb		.Lbswap_mask(%rip), H_POW1
	pshufd		$0xd3, %xmm0, %xmm0
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0

	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.

	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1

	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	jge		.Lprecompute_next\@
// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
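//
// In GHASH terms, for AAD blocks A_1..A_n (the final partial block, if any,
// zero-padded), this computes the usual Horner-style accumulation:
//
//	ghash_acc = ( ... ((ghash_acc + A_1)*H + A_2)*H ... + A_n)*H
//
// where '+' is XOR and '*' is multiplication in GF(2^128).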
.macro _aes_gcm_aad_update

	// Function arguments
	.set	GHASH_ACC_PTR,	%rsi

	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	jl		.Laad_loop_1x_done\@
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
.Laad_loop_1x_done\@:

	// Check whether there is a partial block at the end.

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
// the zero-th AES round key. Clobbers TMP0 and TMP1.
	movq		.Lone(%rip), TMP0
	movdqa		(KEY), TMP1	// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro _aesenc_8x round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro _aesenclast_8x round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST. Clobbers TMP0.
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					  const u32 le_ctr[4], u8 ghash_acc[16],
//					  const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter must be 2. This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|. The caller must
// update |le_ctr| if any more data segments follow. Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro _aes_gcm_update enc

	// Function arguments
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx

	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers. Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.

	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqu		(LE_CTR_PTR), LE_CTR

	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel. When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available. When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
	jl		.Lcrypt_loop_8x_done\@

	// Encrypt the first 8 plaintext blocks.
	cmp		%rsi, RNDKEYLAST_PTR

	// Don't increment DST until the ciphertext blocks have been hashed.
	jl		.Lghash_last_ciphertext_8x\@

	// Generate the next set of 8 counter blocks and start encrypting them.

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	_ghash_update_begin_8x \enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	_ghash_update_continue_8x \enc

	// Do the remaining AES rounds.
	cmp		%rsi, RNDKEYLAST_PTR

	// Do the GHASH reduction and the last round of AES.
	movdqa		(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step 0
	_ghash_update_end_8x_step 1

	// XOR the data with the AES-CTR keystream blocks.
	jge		.Lcrypt_loop_8x\@

.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
	_ghash_update_begin_8x \enc
	_ghash_update_continue_8x \enc
	_ghash_update_end_8x_step 0
	_ghash_update_end_8x_step 1

.Lcrypt_loop_8x_done\@:
	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling. Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	H_POW_XORED,	AESDATA1

	movq		.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block. This will be used
	// to access the key powers H^n through H^1.
	jl		.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	aesenc		-7*16(%rsi), TMP0
	aesenc		-6*16(%rsi), TMP0
	aesenc		-5*16(%rsi), TMP0
	aesenc		-4*16(%rsi), TMP0
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc		\i*16(%rsi), TMP0
	aesenclast	(RNDKEYLAST_PTR), TMP0

	// Load the next key power H^i.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1

	// Update GHASH with the ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	pshufb		BSWAP_MASK, TMP1
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor		GHASH_ACC, GHASH_ACC

	jge		.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:

	// Check whether there is a partial block at the end.

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb		BSWAP_MASK, LE_CTR
	aesenc		(%rsi), LE_CTR
	cmp		%rsi, RNDKEYLAST_PTR
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov		SRC, RNDKEYLAST_PTR
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH. (If
	// decrypting, this was already done by _load_partial_block.)
	lea		.Lzeropad_mask+16(%rip), %rax
	_vpand		(%rax), LE_CTR, TMP0

	// Update GHASH with the final ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

	// Store the updated GHASH accumulator back to memory.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
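//
// For illustration only: a rough C sketch of how a caller might drive these
// functions for a single-segment message. Error handling, kernel FPU begin/end
// sections, and buffering of partial 16-byte chunks are omitted, and the local
// variable names (aad, aadlen, src, dst, datalen) are arbitrary:
//
//	struct aes_gcm_key_aesni key;	/* AES round keys already expanded */
//	u8 ghash_acc[16] = {0};
//	u32 le_ctr[4];			/* from the IV; le_ctr[0] starts at 2 */
//
//	aes_gcm_precompute_aesni(&key);		/* once per key */
//	aes_gcm_aad_update_aesni(&key, ghash_acc, aad, aadlen);
//	aes_gcm_enc_update_aesni(&key, le_ctr, ghash_acc, src, dst, datalen);
//	/* if more data followed, advance le_ctr[0] by datalen/16 and repeat */
//	aes_gcm_enc_final_aesni(&key, le_ctr, ghash_acc, aadlen, datalen);
//	/* ghash_acc now holds the 16-byte tag */
//
// Decryption is the same except that aes_gcm_dec_update_aesni and
// aes_gcm_dec_final_aesni are used, and the latter returns whether the
// transmitted tag matched.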
.macro _aes_gcm_final enc

	// Function arguments
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word. This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu		(LE_CTR_PTR), %xmm0
	pinsrd		$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq		TOTAL_DATALEN, GHASH_ACC
	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
	psllq		$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key. (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea		(KEY,AESKEYLEN64,4), %rax

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb		BSWAP_MASK, %xmm0
	aesenc		-7*16(%rax), %xmm0
	aesenc		-6*16(%rax), %xmm0
	aesenc		-5*16(%rax), %xmm0
	aesenc		-4*16(%rax), %xmm0
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc		(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb		BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor		%xmm0, GHASH_ACC

	// Return the computed auth tag.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)

	.set	ZEROPAD_MASK_PTR, TOTAL_AADLEN	// Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl		8(%rsp), TAGLEN
	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub		TAGLEN64, ZEROPAD_MASK_PTR
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)