/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
//
// AES-NI optimized AES-GCM for x86_64
//
// Copyright 2024 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>
//
//------------------------------------------------------------------------------
//
// This file is dual-licensed, meaning that you can use it under your choice of
// either of the following two licenses:
//
// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy
// of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// or
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//------------------------------------------------------------------------------
//
// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that
// support the original set of AES instructions, i.e. AES-NI. Two
// implementations are provided, one that uses AVX and one that doesn't. They
// are very similar, being generated by the same macros. The only difference is
// that the AVX implementation takes advantage of VEX-coded instructions in some
// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX
// implementation does *not* use 256-bit vectors, as AES is not supported on
// 256-bit vectors until the VAES feature (which this file doesn't target).
//
// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1
// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems
// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.)
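//
// For illustration only (not part of this file): C glue code choosing between
// the two implementations could look roughly like the sketch below. The
// feature flag names are the standard <asm/cpufeatures.h> ones; real glue code
// must additionally verify OS support for the AVX register state before using
// the *_aesni_avx functions.
//
//	if (boot_cpu_has(X86_FEATURE_AES) &&
//	    boot_cpu_has(X86_FEATURE_PCLMULQDQ) &&
//	    boot_cpu_has(X86_FEATURE_XMM4_1)) {
//		use_avx = boot_cpu_has(X86_FEATURE_AVX);
//		/* register the aes_gcm_*_aesni or aes_gcm_*_aesni_avx
//		   functions accordingly */
//	}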
// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is
// more thoroughly commented. This file has the following notable changes:
//
//    - The vector length is fixed at 128 bits, i.e. xmm registers. This means
//      there is only one AES block (and GHASH block) per register.
//
//    - Without AVX512 / AVX10, only 16 SIMD registers are available instead of
//      32. We work around this by being much more careful about using
//      registers, relying heavily on loads to load values as they are needed.
//
//    - Masking is not available either. We work around this by implementing
//      partial block loads and stores using overlapping scalar loads and stores
//      combined with shifts and SSE4.1 insertion and extraction instructions.
//
//    - The main loop is organized differently due to the different design
//      constraints. First, with just one AES block per SIMD register, on some
//      CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore
//      do an 8-register wide loop. Considering that and the fact that we have
//      just 16 SIMD registers to work with, it's not feasible to cache AES
//      round keys and GHASH key powers in registers across loop iterations.
//      That's not ideal, but also not actually that bad, since loads can run in
//      parallel with other instructions. Significantly, this also makes it
//      possible to roll up the inner loops, relying on hardware loop unrolling
//      instead of software loop unrolling, greatly reducing code size.
//
//    - We implement the GHASH multiplications in the main loop using Karatsuba
//      multiplication instead of schoolbook multiplication. This saves one
//      pclmulqdq instruction per block, at the cost of one 64-bit load, one
//      pshufd, and 0.25 pxors per block. (This is without the three-argument
//      XOR support that would be provided by AVX512 / AVX10, which would be
//      more beneficial to schoolbook than Karatsuba.)
//
//      As a rough approximation, we can assume that Karatsuba multiplication is
//      faster than schoolbook multiplication in this context if one pshufd and
//      0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit
//      load is "free" due to running in parallel with arithmetic instructions.)
//      This is true on AMD CPUs, including all that support pclmulqdq up to at
//      least Zen 3. It's also true on older Intel CPUs: Westmere through
//      Haswell on the Core side, and Silvermont through Goldmont Plus on the
//      low-power side. On some of these CPUs, pclmulqdq is quite slow, and the
//      benefit of Karatsuba should be substantial. On newer Intel CPUs,
//      schoolbook multiplication should be faster, but only marginally.
//
//      Not all these CPUs were available to be tested. However, benchmarks on
//      available CPUs suggest that this approximation is plausible. Switching
//      to Karatsuba showed negligible change (< 1%) on Intel Broadwell,
//      Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%.
//      Considering that and the fact that Karatsuba should be even more
//      beneficial on older Intel CPUs, it seems like the right choice here.
//
//      An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be
//      saved by using a multiplication-less reduction method. We don't do that
//      because it would require a large number of shift and xor instructions,
//      making it less worthwhile and likely harmful on newer CPUs.
//
//      It does make sense to sometimes use a different reduction optimization
//      that saves a pclmulqdq, though: precompute the hash key times x^64, and
//      multiply the low half of the data block by the hash key with the extra
//      factor of x^64. This eliminates one step of the reduction. However,
//      this is incompatible with Karatsuba multiplication. Therefore, for
//      multi-block processing we use Karatsuba multiplication with a regular
//      reduction. For single-block processing, we use the x^64 optimization.
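//
//      For concreteness, the two strategies, writing a = a_H*x^64 + a_L and
//      b = b_H*x^64 + b_L (all additions are XORs, i.e. GF(2) arithmetic):
//
//          Schoolbook:  LO = a_L*b_L
//                       HI = a_H*b_H
//                       MI = a_L*b_H + a_H*b_L                 (2 pclmulqdq)
//
//          Karatsuba:   LO = a_L*b_L
//                       HI = a_H*b_H
//                       MI = (a_L+a_H)*(b_L+b_H) + LO + HI     (1 pclmulqdq)
//
//      Karatsuba thus trades one pclmulqdq for extra XORs, with a_L+a_H loaded
//      from the precomputed values at OFFSETOF_H_POWERS_XORED. The x^64
//      optimization used for single blocks instead precomputes a*x^64 in
//      reduced form, which folds the first reduction step into the
//      multiplication itself; see _ghash_mul_step below for the exact formulas.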
#include <linux/linkage.h>

.section .rodata
.p2align 4
.Lbswap_mask:
	.octa	0x000102030405060708090a0b0c0d0e0f
.Lgfpoly:
	.quad	0xc200000000000000
.Lone:
	.quad	1
.Lgfpoly_and_internal_carrybit:
	.octa	0xc2000000000000010000000000000001

	// Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of
	// 'len' 0xff bytes and the rest zeroes.
.Lzeropad_mask:
	.octa	0xffffffffffffffffffffffffffffffff
	.octa	0

// Offsets in struct aes_gcm_key_aesni
#define OFFSETOF_AESKEYLEN	480
#define OFFSETOF_H_POWERS	496
#define OFFSETOF_H_POWERS_XORED	624
#define OFFSETOF_H_TIMES_X64	688
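//
// For reference, a hypothetical C view of the layout these offsets assume (the
// authoritative definition lives in the C glue code; this sketch assumes the
// struct begins with the standard expanded-AES-key layout, struct
// crypto_aes_ctx, whose key length field sits at byte offset 480):
//
//	struct aes_gcm_key_aesni_layout {
//		u32 aes_key_enc[60];	//   0: AES encryption round keys
//		u32 aes_key_dec[60];	// 240: AES decryption round keys
//		u32 aes_keylen;		// 480: AES key length in bytes
//		u8  pad[12];		// 484: pad to 16-byte alignment
//		u8  h_powers[8][16];	// 496: hash key powers H^8 through H^1
//		u64 h_powers_xored[8];	// 624: XOR'd halves of H^8 through H^1
//		u64 h_times_x64[2];	// 688: H^1 * x^64 (reduced)
//	};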
// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback
// assumes that all operands are distinct and that any mem operand is aligned.
.macro _vpclmulqdq imm, src1, src2, dst
.if USE_AVX
	vpclmulqdq	\imm, \src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pclmulqdq	\imm, \src1, \dst
.endif
.endm

// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes
// that all operands are distinct and that any mem operand is aligned.
.macro _vpshufb src1, src2, dst
.if USE_AVX
	vpshufb		\src1, \src2, \dst
.else
	movdqa		\src2, \dst
	pshufb		\src1, \dst
.endif
.endm

// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that
// all operands are distinct.
.macro _vpand src1, src2, dst
.if USE_AVX
	vpand		\src1, \src2, \dst
.else
	movdqu		\src1, \dst
	pand		\src2, \dst
.endif
.endm

// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must
// be a temporary xmm register.
.macro _xor_mem_to_reg mem, reg, tmp
.if USE_AVX
	vpxor		\mem, \reg, \reg
.else
	movdqu		\mem, \tmp
	pxor		\tmp, \reg
.endif
.endm

// Test the unaligned memory operand \mem against the xmm register \reg. \tmp
// must be a temporary xmm register.
.macro _test_mem mem, reg, tmp
.if USE_AVX
	vptest		\mem, \reg
.else
	movdqu		\mem, \tmp
	ptest		\tmp, \reg
.endif
.endm
// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst
// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}.
.macro _load_partial_block src, dst, tmp64, tmp32
	sub		$8, %ecx		// LEN - 8

	// Load 9 <= LEN <= 15 bytes.
	movq		(\src), \dst		// Load first 8 bytes
	mov		(\src, %rcx), %rax	// Load last 8 bytes
	shr		%cl, %rax		// Discard overlapping bytes
	pinsrq		$1, %rax, \dst

	add		$4, %ecx		// LEN - 4

	// Load 4 <= LEN <= 8 bytes.
	mov		(\src), %eax		// Load first 4 bytes
	mov		(\src, %rcx), \tmp32	// Load last 4 bytes

	// Load 1 <= LEN <= 3 bytes.
	add		$2, %ecx		// LEN - 2
	movzbl		(\src), %eax		// Load first byte
	movzwl		(\src, %rcx), \tmp32	// Load last 2 bytes
	or		\tmp64, %rax		// Combine the two parts
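//
// The overlapping-load trick for the 9 <= LEN <= 15 case above can be hard to
// see in scalar form. As a C sketch of the same idea (illustration only, not
// part of this file):
//
//	u64 lo, hi;
//	memcpy(&lo, src, 8);			/* bytes 0 .. 7            */
//	memcpy(&hi, src + len - 8, 8);		/* bytes len-8 .. len-1    */
//	hi >>= 8 * (16 - len);			/* drop bytes overlapping lo */
//	/* Resulting 16-byte block: lo in bytes 0..7, hi in bytes 8..15.
//	   The top 16 - len bytes are zero, as required. */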
// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst.
// Clobbers %rax, %rcx, and %rsi.
.macro _store_partial_block src, dst
	sub		$8, %ecx		// LEN - 8

	// Store 8 <= LEN <= 15 bytes.
	pextrq		$1, \src, %rax
	mov		%rax, (\dst, %rsi)	// Store last LEN - 8 bytes
	movq		\src, (\dst)		// Store first 8 bytes

	add		$4, %ecx		// LEN - 4

	// Store 4 <= LEN <= 7 bytes.
	pextrd		$1, \src, %eax
	mov		%eax, (\dst, %rsi)	// Store last LEN - 4 bytes
	movd		\src, (\dst)		// Store first 4 bytes

	// Store 1 <= LEN <= 3 bytes.
	pextrb		$0, \src, 0(\dst)
	cmp		$-2, %ecx		// LEN - 4 == -2, i.e. LEN == 2?
	pextrb		$1, \src, 1(\dst)
	pextrb		$2, \src, 2(\dst)
// Do one step of GHASH-multiplying \a by \b and storing the reduced product in
// \b. To complete all steps, this must be invoked with \i=0 through \i=9.
// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the
// .Lgfpoly constant, and \t0-\t1 must be temporary registers.
.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1

	// MI = (a_L * b_H) + ((a*x^64)_L * b_L)
	_vpclmulqdq	$0x01, \a, \b, \t0
	_vpclmulqdq	$0x00, \a_times_x64, \b, \t1

	// HI = (a_H * b_H) + ((a*x^64)_H * b_L)
	_vpclmulqdq	$0x11, \a, \b, \t1
	pclmulqdq	$0x10, \a_times_x64, \b

	pshufd		$0x4e, \t0, \t1		// Swap halves of MI
	pclmulqdq	$0x00, \gfpoly, \t0	// MI_L*(x^63 + x^62 + x^57)

// GHASH-multiply \a by \b and store the reduced product in \b.
// See _ghash_mul_step for details.
.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1
.irp i, 0,1,2,3,4,5,6,7,8,9
	_ghash_mul_step	\i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1
// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi.
// This does Karatsuba multiplication and must be paired with _ghash_reduce. On
// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the
// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered.
.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0

	_vpclmulqdq	$0x00, \a, \b, \t0

	pshufd		$0x4e, \b, \t0

	pclmulqdq	$0x11, \a, \b

	// MI += (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, \a_xored, \t0

// Reduce the product from \lo, \mi, and \hi, and store the result in \dst.
// This assumes that _ghash_mul_noreduce was used.
.macro _ghash_reduce lo, mi, hi, dst, t0

	movq		.Lgfpoly(%rip), \t0

	// MI += LO + HI (needed because we used Karatsuba multiplication)

	pshufd		$0x4e, \lo, \dst
	pclmulqdq	$0x00, \t0, \lo

	pshufd		$0x4e, \mi, \dst
	pclmulqdq	$0x00, \t0, \mi
// Do the first step of the GHASH update of a set of 8 ciphertext blocks.
//
// The whole GHASH update does:
//
//	GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 +
//		    blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1
//
// This macro just does the first step: it does the unreduced multiplication
// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm
// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the
// inner block counter in %rax, which is a value that counts up by 8 for each
// block in the set of 8 and is used later to index by 8*blknum and 16*blknum.
//
// To reduce the number of pclmulqdq instructions required, both this macro and
// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook
// multiplication. See the file comment for more details about this choice.
//
// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if
// encrypting, or SRC if decrypting. They also expect the precomputed hash key
// powers H^i and their XOR'd-together halves to be available in the struct
// pointed to by KEY. Both macros clobber TMP[0-2].
.macro _ghash_update_begin_8x enc

	// Initialize the inner block counter.

	// Load the highest hash key power, H^8.
	movdqa		OFFSETOF_H_POWERS(KEY), TMP0

	// Load the first ciphertext block and byte-reflect it.
	pshufb		BSWAP_MASK, TMP1

	// Add the GHASH accumulator to the ciphertext block to get the block
	// 'b' that needs to be multiplied with the hash key power 'a'.

	pshufd		$0x4e, GHASH_ACC, MI

	_vpclmulqdq	$0x00, TMP0, GHASH_ACC, LO

	pclmulqdq	$0x11, TMP0, GHASH_ACC

	// MI = (a_L + a_H) * (b_L + b_H)
	pclmulqdq	$0x00, OFFSETOF_H_POWERS_XORED(KEY), MI
// Continue the GHASH update of 8 ciphertext blocks as described above by doing
// an unreduced multiplication of the next ciphertext block by the next lowest
// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI.
.macro _ghash_update_continue_8x enc

	// Load the next lowest key power.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), TMP0

	// Load the next ciphertext block and byte-reflect it.
.if \enc
	movdqu		(DST,%rax,2), TMP1
.else
	movdqu		(SRC,%rax,2), TMP1
.endif
	pshufb		BSWAP_MASK, TMP1

	_vpclmulqdq	$0x00, TMP0, TMP1, TMP2

	pshufd		$0x4e, TMP1, TMP2

	pclmulqdq	$0x11, TMP0, TMP1

	// MI += (a_L + a_H) * (b_L + b_H)
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1
	pclmulqdq	$0x00, TMP1, TMP2
// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to
// _ghash_reduce, but it's hardcoded to use the registers of the main loop and
// it uses the same register for HI and the destination. It's also divided into
// two steps. TMP1 must be preserved across steps.
//
// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of
// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would
// increase the critical path length, and it seems to slightly hurt performance.
.macro _ghash_update_end_8x_step i

	movq		.Lgfpoly(%rip), TMP1

	pshufd		$0x4e, LO, TMP2
	pclmulqdq	$0x00, TMP1, LO

	pshufd		$0x4e, MI, TMP2
	pclmulqdq	$0x00, TMP1, MI
// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key);
//
// Given the expanded AES key, derive the GHASH subkey and initialize the
// GHASH-related fields in the key struct.
.macro _aes_gcm_precompute

	// Function arguments

	// Additional local variables.
	// %xmm0-%xmm1 and %rax are used as temporaries.
	.set	RNDKEYLAST_PTR,	%rsi

	.set	H_POW1,		%xmm3	// H^1
	.set	H_POW1_X64,	%xmm4	// H^1 * x^64

	// Encrypt an all-zeroes block to get the raw hash subkey.
	movl		OFFSETOF_AESKEYLEN(KEY), %eax
	lea		6*16(KEY,%rax,4), RNDKEYLAST_PTR
	movdqa		(KEY), H_POW1	// Zero-th round key XOR all-zeroes block
	aesenc		(%rax), H_POW1
	cmp		%rax, RNDKEYLAST_PTR
	aesenclast	(RNDKEYLAST_PTR), H_POW1

	// Preprocess the raw hash subkey as needed to operate on GHASH's
	// bit-reflected values directly: reflect its bytes, then multiply it by
	// x^-1 (using the backwards interpretation of polynomial coefficients
	// from the GCM spec) or equivalently x^1 (using the alternative,
	// natural interpretation of polynomial coefficients).
	pshufb		.Lbswap_mask(%rip), H_POW1
	pshufd		$0xd3, %xmm0, %xmm0
	pand		.Lgfpoly_and_internal_carrybit(%rip), %xmm0

	movdqa		H_POW1, OFFSETOF_H_POWERS+7*16(KEY)

	// Compute and store H^1 * x^64.
	movq		.Lgfpoly(%rip), GFPOLY
	pshufd		$0x4e, H_POW1, %xmm0
	_vpclmulqdq	$0x00, H_POW1, GFPOLY, H_POW1_X64
	pxor		%xmm0, H_POW1_X64
	movdqa		H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY)

	// Compute and store the halves of H^1 XOR'd together.
	movq		%xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY)

	// Compute and store the remaining key powers H^2 through H^8.

	// Compute H^i = H^{i-1} * H^1.
	_ghash_mul	H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1

	movdqa		H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2)
	// Compute and store the halves of H^i XOR'd together.
	pshufd		$0x4e, H_CUR, %xmm0
	movq		%xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax)
	jge		.Lprecompute_next\@
// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key,
//				 u8 ghash_acc[16], const u8 *aad, int aadlen);
//
// This function processes the AAD (Additional Authenticated Data) in GCM.
// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the
// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all
// zeroes. |aadlen| must be a multiple of 16, except on the last call where it
// can be any length. The caller must do any buffering needed to ensure this.
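//
// In GHASH terms, for AAD blocks A_1..A_n (the final partial block, if any,
// zero-padded), this computes the usual Horner-style accumulation:
//
//	ghash_acc = ( ... ((ghash_acc + A_1)*H + A_2)*H ... + A_n)*H
//
// where '+' is XOR and '*' is multiplication in GF(2^128).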
.macro _aes_gcm_aad_update

	// Function arguments
	.set	GHASH_ACC_PTR,	%rsi

	// Note: _load_partial_block relies on AADLEN being in %ecx.

	// Additional local variables.
	// %rax, %r10, and %xmm0-%xmm1 are used as temporary registers.
	.set	BSWAP_MASK,	%xmm2
	.set	GHASH_ACC,	%xmm3
	.set	H_POW1,		%xmm4	// H^1
	.set	H_POW1_X64,	%xmm5	// H^1 * x^64

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Process the AAD one full block at a time.
	jl		.Laad_loop_1x_done\@
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1
.Laad_loop_1x_done\@:

	// Check whether there is a partial block at the end.

	// Process a partial block of length 1 <= AADLEN <= 15.
	// _load_partial_block assumes that %ecx contains AADLEN.
	_load_partial_block	AAD, %xmm0, %r10, %r10d
	pshufb		BSWAP_MASK, %xmm0
	pxor		%xmm0, GHASH_ACC
	_ghash_mul	H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1

	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
// Increment LE_CTR eight times to generate eight little-endian counter blocks,
// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with
// the zero-th AES round key. Clobbers TMP0 and TMP1.
	movq		.Lone(%rip), TMP0
	movdqa		(KEY), TMP1	// zero-th round key
.irp i, 0,1,2,3,4,5,6,7
	_vpshufb	BSWAP_MASK, LE_CTR, AESDATA\i

// Do a non-last round of AES on AESDATA[0-7] using \round_key.
.macro _aesenc_8x round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenc		\round_key, AESDATA\i

// Do the last round of AES on AESDATA[0-7] using \round_key.
.macro _aesenclast_8x round_key
.irp i, 0,1,2,3,4,5,6,7
	aesenclast	\round_key, AESDATA\i

// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and
// store the result to DST. Clobbers TMP0.
.irp i, 0,1,2,3,4,5,6,7
	_xor_mem_to_reg	\i*16(SRC), AESDATA\i, tmp=TMP0
.irp i, 0,1,2,3,4,5,6,7
	movdqu		AESDATA\i, \i*16(DST)
// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key,
//					  const u32 le_ctr[4], u8 ghash_acc[16],
//					  const u8 *src, u8 *dst, int datalen);
//
// This macro generates a GCM encryption or decryption update function with the
// above prototype (with \enc selecting which one).
//
// This function computes the next portion of the CTR keystream, XOR's it with
// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted
// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the
// next |datalen| ciphertext bytes.
//
// |datalen| must be a multiple of 16, except on the last call where it can be
// any length. The caller must do any buffering needed to ensure this. Both
// in-place and out-of-place en/decryption are supported.
//
// |le_ctr| must give the current counter in little-endian format. For a new
// message, the low word of the counter must be 2. This function loads the
// counter from |le_ctr| and increments the loaded counter as needed, but it
// does *not* store the updated counter back to |le_ctr|. The caller must
// update |le_ctr| if any more data segments follow. Internally, only the low
// 32-bit word of the counter is incremented, following the GCM standard.
.macro _aes_gcm_update enc

	// Function arguments
	.set	LE_CTR_PTR,	%rsi	// Note: overlaps with usage as temp reg
	.set	GHASH_ACC_PTR,	%rdx

	.set	DATALEN64,	%r9	// Zero-extend DATALEN before using!
	// Note: the code setting up for _load_partial_block assumes that SRC is
	// in %rcx (and that DATALEN is *not* in %rcx).

	// Additional local variables

	// %rax and %rsi are used as temporary registers. Note: %rsi overlaps
	// with LE_CTR_PTR, which is used only at the beginning.

	.set	AESKEYLEN,	%r10d	// AES key length in bytes
	.set	AESKEYLEN64,	%r10
	.set	RNDKEYLAST_PTR,	%r11	// Pointer to last AES round key

	// Put the most frequently used values in %xmm0-%xmm7 to reduce code
	// size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.)
	.set	LO,		%xmm3	// Low part of unreduced product
	.set	MI,		%xmm4	// Middle part of unreduced product
	.set	GHASH_ACC,	%xmm5	// GHASH accumulator; in main loop also
					// the high part of unreduced product
	.set	BSWAP_MASK,	%xmm6	// Shuffle mask for reflecting bytes
	.set	LE_CTR,		%xmm7	// Little-endian counter value
	.set	AESDATA2,	%xmm10
	.set	AESDATA3,	%xmm11
	.set	AESDATA4,	%xmm12
	.set	AESDATA5,	%xmm13
	.set	AESDATA6,	%xmm14
	.set	AESDATA7,	%xmm15

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movdqu		(GHASH_ACC_PTR), GHASH_ACC
	movdqu		(LE_CTR_PTR), LE_CTR

	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN
	lea		6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR
	// If there are at least 8*16 bytes of data, then continue into the main
	// loop, which processes 8*16 bytes of data per iteration.
	//
	// The main loop interleaves AES and GHASH to improve performance on
	// CPUs that can execute these instructions in parallel. When
	// decrypting, the GHASH input (the ciphertext) is immediately
	// available. When encrypting, we instead encrypt a set of 8 blocks
	// first and then GHASH those blocks while encrypting the next set of 8,
	// repeat that as needed, and finally GHASH the last set of 8 blocks.
	//
	// Code size optimization: Prefer adding or subtracting -8*16 over 8*16,
	// as this makes the immediate fit in a signed byte, saving 3 bytes.
	jl		.Lcrypt_loop_8x_done\@

	// Encrypt the first 8 plaintext blocks.
	cmp		%rsi, RNDKEYLAST_PTR

	// Don't increment DST until the ciphertext blocks have been hashed.
	jl		.Lghash_last_ciphertext_8x\@

	// Generate the next set of 8 counter blocks and start encrypting them.

	// Do a round of AES, and start the GHASH update of 8 ciphertext blocks
	// by doing the unreduced multiplication for the first ciphertext block.
	_ghash_update_begin_8x \enc

	// Do 7 more rounds of AES, and continue the GHASH update by doing the
	// unreduced multiplication for the remaining ciphertext blocks.
	_ghash_update_continue_8x \enc

	// Do the remaining AES rounds.
	cmp		%rsi, RNDKEYLAST_PTR

	// Do the GHASH reduction and the last round of AES.
	movdqa		(RNDKEYLAST_PTR), TMP0
	_ghash_update_end_8x_step 0
	_ghash_update_end_8x_step 1

	// XOR the data with the AES-CTR keystream blocks.
	jge		.Lcrypt_loop_8x\@

.Lghash_last_ciphertext_8x\@:
	// Update GHASH with the last set of 8 ciphertext blocks.
	_ghash_update_begin_8x \enc
	_ghash_update_continue_8x \enc
	_ghash_update_end_8x_step 0
	_ghash_update_end_8x_step 1

.Lcrypt_loop_8x_done\@:
	// Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep
	// things simple and keep the code size down by just going one block at
	// a time, again taking advantage of hardware loop unrolling. Since
	// there are enough key powers available for all remaining data, we do
	// the GHASH multiplications unreduced, and only reduce at the very end.

	.set	H_POW_XORED,	AESDATA1

	movq		.Lone(%rip), ONE

	// Start collecting the unreduced GHASH intermediate value LO, MI, HI.

	// Set up a block counter %rax to contain 8*(8-n), where n is the number
	// of blocks that remain, counting any partial block. This will be used
	// to access the key powers H^n through H^1.
	jl		.Lcrypt_loop_1x_done\@

	// Process the data one full block at a time.

	// Encrypt the next counter block.
	_vpshufb	BSWAP_MASK, LE_CTR, TMP0
	lea		-6*16(RNDKEYLAST_PTR), %rsi	// Reduce code size
	aesenc		-7*16(%rsi), TMP0
	aesenc		-6*16(%rsi), TMP0
	aesenc		-5*16(%rsi), TMP0
	aesenc		-4*16(%rsi), TMP0
.irp i, -3,-2,-1,0,1,2,3,4,5
	aesenc		\i*16(%rsi), TMP0
	aesenclast	(RNDKEYLAST_PTR), TMP0

	// Load the next key power H^i.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// XOR the keystream block that was just generated in TMP0 with the next
	// source data block and store the resulting en/decrypted data to DST.
	_xor_mem_to_reg	(SRC), TMP0, tmp=TMP1

	// Update GHASH with the ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	pshufb		BSWAP_MASK, TMP1
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0
	pxor		GHASH_ACC, GHASH_ACC

	jge		.Lcrypt_loop_1x\@
.Lcrypt_loop_1x_done\@:

	// Check whether there is a partial block at the end.

	// Process a partial block of length 1 <= DATALEN <= 15.

	// Encrypt a counter block for the last time.
	pshufb		BSWAP_MASK, LE_CTR
	aesenc		(%rsi), LE_CTR
	cmp		%rsi, RNDKEYLAST_PTR
	aesenclast	(RNDKEYLAST_PTR), LE_CTR

	// Load the lowest key power, H^1.
	movdqa		OFFSETOF_H_POWERS(KEY,%rax,2), H_POW
	movq		OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED

	// Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is
	// in %rcx, but _load_partial_block needs DATALEN in %rcx instead.
	// RNDKEYLAST_PTR is no longer needed, so reuse it for SRC.
	mov		SRC, RNDKEYLAST_PTR
	_load_partial_block	RNDKEYLAST_PTR, TMP0, %rsi, %esi

	// XOR the keystream block that was just generated in LE_CTR with the
	// source data block and store the resulting en/decrypted data to DST.
	_store_partial_block	LE_CTR, DST

	// If encrypting, zero-pad the final ciphertext block for GHASH. (If
	// decrypting, this was already done by _load_partial_block.)
	lea		.Lzeropad_mask+16(%rip), %rax
	_vpand		(%rax), LE_CTR, TMP0

	// Update GHASH with the final ciphertext block.
	pshufb		BSWAP_MASK, TMP0
	_ghash_mul_noreduce	H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0

	// Finally, do the GHASH reduction.
	_ghash_reduce	LO, MI, HI, GHASH_ACC, TMP0

	// Store the updated GHASH accumulator back to memory.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)
// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen);
// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key,
//				   const u32 le_ctr[4], const u8 ghash_acc[16],
//				   u64 total_aadlen, u64 total_datalen,
//				   const u8 tag[16], int taglen);
//
// This macro generates one of the above two functions (with \enc selecting
// which one). Both functions finish computing the GCM authentication tag by
// updating GHASH with the lengths block and encrypting the GHASH accumulator.
// |total_aadlen| and |total_datalen| must be the total length of the additional
// authenticated data and the en/decrypted data in bytes, respectively.
//
// The encryption function then stores the full-length (16-byte) computed
// authentication tag to |ghash_acc|. The decryption function instead loads the
// expected authentication tag (the one that was transmitted) from the 16-byte
// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the
// computed tag in constant time, and returns true if and only if they match.
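//
// For illustration only: a rough C sketch of how a caller might drive these
// functions for a single-segment message. Error handling, kernel FPU begin/end
// sections, and buffering of partial 16-byte chunks are omitted, and the local
// variable names (aad, aadlen, src, dst, datalen) are arbitrary:
//
//	struct aes_gcm_key_aesni key;	/* AES round keys already expanded */
//	u8 ghash_acc[16] = {0};
//	u32 le_ctr[4];			/* from the IV; le_ctr[0] starts at 2 */
//
//	aes_gcm_precompute_aesni(&key);		/* once per key */
//	aes_gcm_aad_update_aesni(&key, ghash_acc, aad, aadlen);
//	aes_gcm_enc_update_aesni(&key, le_ctr, ghash_acc, src, dst, datalen);
//	/* if more data followed, advance le_ctr[0] by datalen/16 and repeat */
//	aes_gcm_enc_final_aesni(&key, le_ctr, ghash_acc, aadlen, datalen);
//	/* ghash_acc now holds the 16-byte tag */
//
// Decryption is the same except that aes_gcm_dec_update_aesni and
// aes_gcm_dec_final_aesni are used, and the latter returns whether the
// transmitted tag matched.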
.macro _aes_gcm_final enc

	// Function arguments
	.set	LE_CTR_PTR,	%rsi
	.set	GHASH_ACC_PTR,	%rdx
	.set	TOTAL_AADLEN,	%rcx
	.set	TOTAL_DATALEN,	%r8
	.set	TAGLEN,		%r10d	// Originally at 8(%rsp)

	// Additional local variables.
	// %rax and %xmm0-%xmm2 are used as temporary registers.
	.set	AESKEYLEN,	%r11d
	.set	AESKEYLEN64,	%r11
	.set	BSWAP_MASK,	%xmm3
	.set	GHASH_ACC,	%xmm4
	.set	H_POW1,		%xmm5	// H^1
	.set	H_POW1_X64,	%xmm6	// H^1 * x^64

	movdqa		.Lbswap_mask(%rip), BSWAP_MASK
	movl		OFFSETOF_AESKEYLEN(KEY), AESKEYLEN

	// Set up a counter block with 1 in the low 32-bit word. This is the
	// counter that produces the ciphertext needed to encrypt the auth tag.
	movdqu		(LE_CTR_PTR), %xmm0
	pinsrd		$0, %eax, %xmm0

	// Build the lengths block and XOR it into the GHASH accumulator.
	movq		TOTAL_DATALEN, GHASH_ACC
	pinsrq		$1, TOTAL_AADLEN, GHASH_ACC
	psllq		$3, GHASH_ACC	// Bytes to bits
	_xor_mem_to_reg	(GHASH_ACC_PTR), GHASH_ACC, %xmm1

	movdqa		OFFSETOF_H_POWERS+7*16(KEY), H_POW1
	movdqa		OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64
	movq		.Lgfpoly(%rip), GFPOLY

	// Make %rax point to the 6th from last AES round key. (Using signed
	// byte offsets -7*16 through 6*16 decreases code size.)
	lea		(KEY,AESKEYLEN64,4), %rax

	// AES-encrypt the counter block and also multiply GHASH_ACC by H^1.
	// Interleave the AES and GHASH instructions to improve performance.
	pshufb		BSWAP_MASK, %xmm0
	aesenc		-7*16(%rax), %xmm0
	aesenc		-6*16(%rax), %xmm0
	aesenc		-5*16(%rax), %xmm0
	aesenc		-4*16(%rax), %xmm0
.irp i, 0,1,2,3,4,5,6,7,8
	aesenc		(\i-3)*16(%rax), %xmm0
	_ghash_mul_step	\i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2
	aesenclast	6*16(%rax), %xmm0
	_ghash_mul_step	9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2

	// Undo the byte reflection of the GHASH accumulator.
	pshufb		BSWAP_MASK, GHASH_ACC

	// Encrypt the GHASH accumulator.
	pxor		%xmm0, GHASH_ACC

	// Return the computed auth tag.
	movdqu		GHASH_ACC, (GHASH_ACC_PTR)

	.set	ZEROPAD_MASK_PTR, TOTAL_AADLEN	// Reusing TOTAL_AADLEN!

	// Verify the auth tag in constant time by XOR'ing the transmitted and
	// computed auth tags together and using the ptest instruction to check
	// whether the first TAGLEN bytes of the result are zero.
	_xor_mem_to_reg	(TAG), GHASH_ACC, tmp=%xmm0
	movl		8(%rsp), TAGLEN
	lea		.Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR
	sub		TAGLEN64, ZEROPAD_MASK_PTR
	_test_mem	(ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0
.set	USE_AVX, 0
SYM_FUNC_START(aes_gcm_precompute_aesni)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni)
SYM_FUNC_START(aes_gcm_aad_update_aesni)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni)
SYM_FUNC_START(aes_gcm_enc_update_aesni)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni)
SYM_FUNC_START(aes_gcm_dec_update_aesni)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni)
SYM_FUNC_START(aes_gcm_enc_final_aesni)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni)
SYM_FUNC_START(aes_gcm_dec_final_aesni)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni)

.set	USE_AVX, 1
SYM_FUNC_START(aes_gcm_precompute_aesni_avx)
	_aes_gcm_precompute
SYM_FUNC_END(aes_gcm_precompute_aesni_avx)
SYM_FUNC_START(aes_gcm_aad_update_aesni_avx)
	_aes_gcm_aad_update
SYM_FUNC_END(aes_gcm_aad_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_update_aesni_avx)
	_aes_gcm_update	1
SYM_FUNC_END(aes_gcm_enc_update_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_update_aesni_avx)
	_aes_gcm_update	0
SYM_FUNC_END(aes_gcm_dec_update_aesni_avx)
SYM_FUNC_START(aes_gcm_enc_final_aesni_avx)
	_aes_gcm_final	1
SYM_FUNC_END(aes_gcm_enc_final_aesni_avx)
SYM_FUNC_START(aes_gcm_dec_final_aesni_avx)
	_aes_gcm_final	0
SYM_FUNC_END(aes_gcm_dec_final_aesni_avx)