/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * AES-XTS for modern x86_64 CPUs
 *
 * Copyright 2024 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 *
 * This file implements AES-XTS for modern x86_64 CPUs. To handle the
 * complexities of coding for x86 SIMD, e.g. where every vector length needs
 * different code, it uses a macro to generate several implementations that
 * share similar source code but are targeted at different CPUs, listed below:
 *
 * AES-NI + AVX
 *    - 128-bit vectors (1 AES block per vector)
 *    - VEX-coded instructions
 *    - This is for older CPUs that lack VAES but do have AVX.
 *
 * VAES + VPCLMULQDQ + AVX2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - VEX-coded instructions
 *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
 *      e.g. Intel's Alder Lake and AMD's Zen 3.
 *
 * VAES + VPCLMULQDQ + AVX10/256 + BMI2
 *    - 256-bit vectors (2 AES blocks per vector)
 *    - EVEX-coded instructions
 *    - This is for CPUs that have AVX512 but where using zmm registers causes
 *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
 *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
 *      To avoid confusion with 512-bit, we just write AVX10/256.
 *
 * VAES + VPCLMULQDQ + AVX10/512 + BMI2
 *    - Same as the previous one, but upgrades to 512-bit vectors
 *      (4 AES blocks per vector) in zmm0-zmm31.
 *    - This is for CPUs that have good AVX512 or AVX10/512 support.
 *
 * This file doesn't have an implementation for AES-NI alone (without AVX), as
 * the lack of VEX would make all the assembly code different.
 *
 * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
 * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be
 * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might
 * need to start also providing an implementation using VAES alone.
 *
 * The AES-XTS implementations in this file support everything required by the
 * crypto API, including support for arbitrary input lengths and multi-part
 * processing. However, they are most heavily optimized for the common case of
 * power-of-2 length inputs that are processed in a single part (disk sectors).
 */
#include <linux/linkage.h>
#include <linux/cfi_types.h>

.Lgf_poly:
// The low 64 bits of this value represent the polynomial x^7 + x^2 + x
// + 1. It is the value that must be XOR'd into the low 64 bits of the
// tweak each time a 1 is carried out of the high 64 bits.
//
// The high 64 bits of this value are just the internal carry bit that
// exists when there's a carry out of the low 64 bits of the tweak.
.quad 0x87, 1
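
// For illustration only (this pseudocode is not part of the build): advancing
// a 128-bit XTS tweak t = (lo, hi) to the next tweak t*x using the polynomial
// above is roughly the following C:
//
//	carry = hi >> 63;
//	hi = (hi << 1) | (lo >> 63);
//	lo = (lo << 1) ^ (carry ? 0x87 : 0);
//
// The _next_tweak macro below computes the same thing branchlessly with SIMD
// instructions.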

// This table contains constants for vpshufb and vpblendvb, used to
// handle variable byte shifts and blending during ciphertext stealing
// on CPUs that don't support AVX10-style masking.
.Lcts_permute_table:
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
.byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80

// Function parameters
.set KEY, %rdi     // Initially points to crypto_aes_ctx, then is
                   // advanced to point to the 7th-from-last round key
.set SRC, %rsi     // Pointer to next source data
.set DST, %rdx     // Pointer to next destination data
.set LEN, %ecx     // Remaining length in bytes

.set TWEAK, %r8    // Pointer to next tweak

// %rax holds the AES key length in bytes.

// %r9-r11 are available as temporaries.

.error "Unsupported Vector Length (VL)"

.macro _define_aliases
// Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
// are available, that map to the xmm, ymm, or zmm registers according
// to the selected Vector Length (VL).

// V0-V3 hold the data blocks during the main loop, or temporary values
// otherwise. V4-V5 hold temporary values.

// V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak.
.set TWEAK0_XMM, %xmm6
.set TWEAK1_XMM, %xmm7

// V10-V13 are used for computing the next values of TWEAK[0-3].
.set NEXT_TWEAK0, V10
.set NEXT_TWEAK1, V11
.set NEXT_TWEAK2, V12
.set NEXT_TWEAK3, V13

// V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
.set GF_POLY_XMM, %xmm14

// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
.set KEY0_XMM, %xmm15

// If 32 SIMD registers are available, then V16-V29 hold the remaining
// AES round keys, copied to all 128-bit lanes.
//
// AES-128, AES-192, and AES-256 use different numbers of round keys.
// To allow handling all three variants efficiently, we align the round
// keys to the *end* of this register range. I.e., AES-128 uses
// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
// (All also use KEY0 for the XOR-only "round" at the beginning.)
.set KEY1_XMM, %xmm16
.set KEY2_XMM, %xmm17
.set KEY3_XMM, %xmm18
.set KEY4_XMM, %xmm19
.set KEY5_XMM, %xmm20
.set KEY6_XMM, %xmm21
.set KEY7_XMM, %xmm22
.set KEY8_XMM, %xmm23
.set KEY9_XMM, %xmm24
.set KEY10_XMM, %xmm25
.set KEY11_XMM, %xmm26
.set KEY12_XMM, %xmm27
.set KEY13_XMM, %xmm28
.set KEY14_XMM, %xmm29

// V30-V31 are currently unused.

// Move a vector between memory and a register.
.macro _vmovdqu src, dst

// Broadcast a 128-bit value into a vector.
.macro _vbroadcast128 src, dst
.if VL == 16 && !USE_AVX10
.elseif VL == 32 && !USE_AVX10
vbroadcasti128 \src, \dst
vbroadcasti32x4 \src, \dst

// XOR two vectors together.
.macro _vpxor src1, src2, dst
vpxord \src1, \src2, \dst
vpxor \src1, \src2, \dst

// XOR three vectors together.
.macro _xor3 src1, src2, src3_and_dst
// vpternlogd with immediate 0x96 is a three-argument XOR.
vpternlogd $0x96, \src1, \src2, \src3_and_dst
vpxor \src1, \src3_and_dst, \src3_and_dst
vpxor \src2, \src3_and_dst, \src3_and_dst

// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
// (by multiplying by the polynomial 'x') and write it to \dst.
.macro _next_tweak src, tmp, dst
vpshufd $0x13, \src, \tmp
vpaddq \src, \src, \dst
vpsrad $31, \tmp, \tmp
vpand GF_POLY_XMM, \tmp, \tmp
vpxor \tmp, \dst, \dst
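
// How the above works, for reference: vpaddq doubles each 64-bit half of the
// tweak independently, which is a left shift by 1 that drops both the carry
// out of bit 63 and the carry out of bit 127. vpshufd $0x13 and vpsrad $31
// turn those two carry bits into 32-bit all-ones/all-zeros masks positioned
// over the low and high halves, and vpand with GF_POLY_XMM (the .Lgf_poly
// constant) reduces the masks to the two needed correction terms: 0x87 for
// the low half (the reduction polynomial) and 1 for the high half (the
// dropped internal carry). The final vpxor applies both corrections at once,
// with no branches.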

// Given the XTS tweak(s) in the vector \src, compute the next vector of
// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
//
// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
// all tweaks in the vector in parallel. If VL=16, we just do the regular
// computation without vpclmulqdq, as it's the faster method for a single tweak.
.macro _next_tweakvec src, tmp1, tmp2, dst
_next_tweak \src, \tmp1, \dst
vpsrlq $64 - VL/16, \src, \tmp1
vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2
vpslldq $8, \tmp1, \tmp1
vpsllq $VL/16, \src, \dst
_xor3 \tmp1, \tmp2, \dst
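
// For reference, the vpclmulqdq path works as follows: vpsllq shifts each
// 64-bit half of every tweak left by VL/16 bits, vpsrlq captures the bits
// shifted out of each half, vpslldq $8 moves the bits that crossed from the
// low half into the high half of the same tweak, and vpclmulqdq multiplies
// the bits that overflowed past bit 127 by the reduction constant 0x87. The
// final _xor3 folds both correction terms back into the shifted tweaks.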

// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5.
.macro _compute_first_set_of_tweaks
vmovdqu (TWEAK), TWEAK0_XMM
_vbroadcast128 .Lgf_poly(%rip), GF_POLY

// With VL=16, multiplying by x serially is fastest.
_next_tweak TWEAK0, %xmm0, TWEAK1
_next_tweak TWEAK1, %xmm0, TWEAK2
_next_tweak TWEAK2, %xmm0, TWEAK3

// Compute the second block of TWEAK0.
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
vinserti128 $1, %xmm1, TWEAK0, TWEAK0

// Compute the remaining blocks of TWEAK0.
_next_tweak TWEAK0_XMM, %xmm0, %xmm1
_next_tweak %xmm1, %xmm0, %xmm2
_next_tweak %xmm2, %xmm0, %xmm3
vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0
vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0
vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0

// Compute TWEAK[1-3] from TWEAK0.
vpsrlq $64 - 1*VL/16, TWEAK0, V0
vpsrlq $64 - 2*VL/16, TWEAK0, V2
vpsrlq $64 - 3*VL/16, TWEAK0, V4
vpclmulqdq $0x01, GF_POLY, V0, V1
vpclmulqdq $0x01, GF_POLY, V2, V3
vpclmulqdq $0x01, GF_POLY, V4, V5

vpsllq $1*VL/16, TWEAK0, TWEAK1
vpsllq $2*VL/16, TWEAK0, TWEAK2
vpsllq $3*VL/16, TWEAK0, TWEAK3

vpternlogd $0x96, V0, V1, TWEAK1
vpternlogd $0x96, V2, V3, TWEAK2
vpternlogd $0x96, V4, V5, TWEAK3

vpxor V0, TWEAK1, TWEAK1
vpxor V2, TWEAK2, TWEAK2
vpxor V4, TWEAK3, TWEAK3
vpxor V1, TWEAK1, TWEAK1
vpxor V3, TWEAK2, TWEAK2
vpxor V5, TWEAK3, TWEAK3

// Do one step in computing the next set of tweaks using the method of just
// multiplying by x repeatedly (the same method _next_tweak uses).
.macro _tweak_step_mulx i
.set PREV_TWEAK, TWEAK3
.set NEXT_TWEAK, NEXT_TWEAK0
.set PREV_TWEAK, NEXT_TWEAK0
.set NEXT_TWEAK, NEXT_TWEAK1
.set PREV_TWEAK, NEXT_TWEAK1
.set NEXT_TWEAK, NEXT_TWEAK2
.set PREV_TWEAK, NEXT_TWEAK2
.set NEXT_TWEAK, NEXT_TWEAK3
.if \i >= 0 && \i < 20 && \i % 5 == 0
vpshufd $0x13, PREV_TWEAK, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 1
vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
.elseif \i >= 0 && \i < 20 && \i % 5 == 2
.elseif \i >= 0 && \i < 20 && \i % 5 == 3
vpand GF_POLY, V5, V5
.elseif \i >= 0 && \i < 20 && \i % 5 == 4
vpxor V5, NEXT_TWEAK, NEXT_TWEAK
vmovdqa NEXT_TWEAK0, TWEAK0
vmovdqa NEXT_TWEAK1, TWEAK1
vmovdqa NEXT_TWEAK2, TWEAK2
vmovdqa NEXT_TWEAK3, TWEAK3

// Do one step in computing the next set of tweaks using the VPCLMULQDQ method
// (the same method _next_tweakvec uses for VL > 16). This means multiplying
// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8
// when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
.macro _tweak_step_pclmul i
vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1
vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2
vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3
_vpxor NEXT_TWEAK0, TWEAK0, TWEAK0
_vpxor NEXT_TWEAK1, TWEAK1, TWEAK1
_vpxor NEXT_TWEAK2, TWEAK2, TWEAK2
_vpxor NEXT_TWEAK3, TWEAK3, TWEAK3
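
// For example, with VL=32 each tweak is multiplied by x^8: vpsrldq
// right-shifts by (128-8)/8 = 15 bytes to isolate the top 8 bits of each
// tweak, vpclmulqdq reduces those bits modulo the polynomial, and vpslldq
// left-shifts by 8/8 = 1 byte. With VL=64 the multiplier is x^16, giving
// 14-byte and 2-byte shifts respectively.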

// _tweak_step does one step of the computation of the next set of tweaks from
// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
// \i that include at least 0 through 19, then 1000 which signals the last step.
//
// This is used to interleave the computation of the next set of tweaks with the
// AES en/decryptions, which increases performance in some cases.
_tweak_step_pclmul \i

.macro _setup_round_keys enc

// Select either the encryption round keys or the decryption round keys.

// Load the round key for "round 0".
_vbroadcast128 OFFS(KEY), KEY0

// Increment KEY to make it so that 7*16(KEY) is the last round key.
// For AES-128, increment by 3*16, resulting in the 10 round keys (not
// counting the zero-th round key which was just loaded into KEY0) being
// -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
// 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
//
// This rebasing provides two benefits. First, it makes the offset to
// any round key be in the range [-96, 112], fitting in a signed byte.
// This shortens VEX-encoded instructions that access the later round
// keys which otherwise would need 4-byte offsets. Second, it makes it
// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
// beginning. Skipping rounds at the end doesn't work as well because
// the last round needs different instructions.
//
// An alternative approach would be to roll up all the round loops. We
// don't do that because it isn't compatible with caching the round keys
// in registers which we do when possible (see below), and also because
// it seems unwise to rely *too* heavily on the CPU's branch predictor.
lea OFFS-16(KEY, KEYLEN64, 4), KEY
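
// For example, with AES-256 KEYLEN64 is 32, so the lea above advances KEY by
// OFFS - 16 + 4*32 = OFFS + 112 bytes. Round key \i lives at byte offset
// 16*\i within the selected key schedule, so after this rebase it is found at
// (\i-7)*16(KEY): round key 1 at -6*16(KEY), ..., round key 14 at 7*16(KEY).
// This matches the (\i-7)*16(KEY) addressing used by _vaes_1x and _vaes_4x
// below.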

// If all 32 SIMD registers are available, cache all the round keys.
_vbroadcast128 -6*16(KEY), KEY1
_vbroadcast128 -5*16(KEY), KEY2
_vbroadcast128 -4*16(KEY), KEY3
_vbroadcast128 -3*16(KEY), KEY4
_vbroadcast128 -2*16(KEY), KEY5
_vbroadcast128 -1*16(KEY), KEY6
_vbroadcast128 0*16(KEY), KEY7
_vbroadcast128 1*16(KEY), KEY8
_vbroadcast128 2*16(KEY), KEY9
_vbroadcast128 3*16(KEY), KEY10
_vbroadcast128 4*16(KEY), KEY11
_vbroadcast128 5*16(KEY), KEY12
_vbroadcast128 6*16(KEY), KEY13
_vbroadcast128 7*16(KEY), KEY14

// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
// on the block(s) in \data using the round key(s) in \key. The register length
// determines the number of AES blocks en/decrypted.
.macro _vaes enc, last, key, data
vaesenclast \key, \data, \data
vaesenc \key, \data, \data
vaesdeclast \key, \data, \data
vaesdec \key, \data, \data

// Do a single round of AES en/decryption on the block(s) in \data, using the
// same key for all block(s). The round key is loaded from the appropriate
// register or memory location for round \i. May clobber V4.
.macro _vaes_1x enc, last, i, xmm_suffix, data
_vaes \enc, \last, KEY\i\xmm_suffix, \data
_vaes \enc, \last, (\i-7)*16(KEY), \data
_vbroadcast128 (\i-7)*16(KEY), V4
_vaes \enc, \last, V4, \data

// Do a single round of AES en/decryption on the blocks in registers V0-V3,
// using the same key for all blocks. The round key is loaded from the
// appropriate register or memory location for round \i. In addition, does two
// steps of the computation of the next set of tweaks. May clobber V4.
.macro _vaes_4x enc, last, i
_tweak_step (2*(\i-5))
_vaes \enc, \last, KEY\i, V0
_vaes \enc, \last, KEY\i, V1
_tweak_step (2*(\i-5) + 1)
_vaes \enc, \last, KEY\i, V2
_vaes \enc, \last, KEY\i, V3
_vbroadcast128 (\i-7)*16(KEY), V4
_tweak_step (2*(\i-5))
_vaes \enc, \last, V4, V0
_vaes \enc, \last, V4, V1
_tweak_step (2*(\i-5) + 1)
_vaes \enc, \last, V4, V2
_vaes \enc, \last, V4, V3
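
// Note that round \i contributes tweak steps 2*(\i-5) and 2*(\i-5) + 1, so
// rounds 5 through 14, which run for every AES key size, cover steps 0
// through 19 as required by _tweak_step. Steps with negative numbers, from
// the early rounds used only by AES-192/AES-256, are no-ops in _tweak_step.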

// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
// then XOR with \tweak again) of the block(s) in \data. To process a single
// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of
// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
.macro _aes_crypt enc, xmm_suffix, tweak, data
_xor3 KEY0\xmm_suffix, \tweak, \data
_vaes_1x \enc, 0, 1, \xmm_suffix, \data
_vaes_1x \enc, 0, 2, \xmm_suffix, \data
_vaes_1x \enc, 0, 3, \xmm_suffix, \data
_vaes_1x \enc, 0, 4, \xmm_suffix, \data
_vaes_1x \enc, 0, 5, \xmm_suffix, \data
_vaes_1x \enc, 0, 6, \xmm_suffix, \data
_vaes_1x \enc, 0, 7, \xmm_suffix, \data
_vaes_1x \enc, 0, 8, \xmm_suffix, \data
_vaes_1x \enc, 0, 9, \xmm_suffix, \data
_vaes_1x \enc, 0, 10, \xmm_suffix, \data
_vaes_1x \enc, 0, 11, \xmm_suffix, \data
_vaes_1x \enc, 0, 12, \xmm_suffix, \data
_vaes_1x \enc, 0, 13, \xmm_suffix, \data
_vaes_1x \enc, 1, 14, \xmm_suffix, \data
_vpxor \tweak, \data, \data

.macro _aes_xts_crypt enc

// When decrypting a message whose length isn't a multiple of the AES
// block length, exclude the last full block from the main loop by
// subtracting 16 from LEN. This is needed because ciphertext stealing
// decryption uses the last two tweaks in reverse order. We'll handle
// the last full block and the partial block specially at the end.

// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
movl 480(KEY), KEYLEN

// Set up the pointer to the round keys and cache as many as possible.
_setup_round_keys \enc

// Compute the first set of tweaks TWEAK[0-3].
_compute_first_set_of_tweaks

jl .Lhandle_remainder\@

// This is the main loop, en/decrypting 4*VL bytes per iteration.

// XOR each source block with its tweak and the zero-th round key.
vmovdqu8 0*VL(SRC), V0
vmovdqu8 1*VL(SRC), V1
vmovdqu8 2*VL(SRC), V2
vmovdqu8 3*VL(SRC), V3
vpternlogd $0x96, TWEAK0, KEY0, V0
vpternlogd $0x96, TWEAK1, KEY0, V1
vpternlogd $0x96, TWEAK2, KEY0, V2
vpternlogd $0x96, TWEAK3, KEY0, V3
vpxor 0*VL(SRC), KEY0, V0
vpxor 1*VL(SRC), KEY0, V1
vpxor 2*VL(SRC), KEY0, V2
vpxor 3*VL(SRC), KEY0, V3

// Do all the AES rounds on the data blocks, interleaved with
// the computation of the next set of tweaks.

// XOR in the tweaks again.
_vpxor TWEAK0, V0, V0
_vpxor TWEAK1, V1, V1
_vpxor TWEAK2, V2, V2
_vpxor TWEAK3, V3, V3

// Store the destination blocks.
_vmovdqu V0, 0*VL(DST)
_vmovdqu V1, 1*VL(DST)
_vmovdqu V2, 2*VL(DST)
_vmovdqu V3, 3*VL(DST)

// Finish computing the next set of tweaks.

// Check for the uncommon case where the data length isn't a multiple of
// 4*VL. Handle it out-of-line in order to optimize for the common
// case. In the common case, just fall through to the ret.
jnz .Lhandle_remainder\@

// Store the next tweak back to *TWEAK to support continuation calls.
vmovdqu TWEAK0_XMM, (TWEAK)

.Lhandle_remainder\@:

// En/decrypt any remaining full blocks, one vector at a time.
add $3*VL, LEN // Undo extra sub of 4*VL, then sub VL.
jl .Lvec_at_a_time_done\@
_aes_crypt \enc, , TWEAK0, V0
_next_tweakvec TWEAK0, V0, V1, TWEAK0
jge .Lvec_at_a_time\@
.Lvec_at_a_time_done\@:
add $VL-16, LEN // Undo extra sub of VL, then sub 16.
add $4*VL-16, LEN // Undo extra sub of 4*VL, then sub 16.

// En/decrypt any remaining full blocks, one at a time.
jl .Lblock_at_a_time_done\@
_aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
_next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM
jge .Lblock_at_a_time\@
.Lblock_at_a_time_done\@:
add $16, LEN // Undo the extra sub of 16.
// Now 0 <= LEN <= 15. If LEN is zero, we're done.

// Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
// Do ciphertext stealing to process the last 16 + LEN bytes.

// If encrypting, the main loop already encrypted the last full block to
// create the CTS intermediate ciphertext. Prepare for the rest of CTS
// by rewinding the pointers and loading the intermediate ciphertext.

// If decrypting, the main loop didn't decrypt the last full block
// because CTS decryption uses the last two tweaks in reverse order.
// Do it now by advancing the tweak and decrypting the last full block.
_next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM
_aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0

// Create a mask that has the first LEN bits set.

// Swap the first LEN bytes of the en/decryption of the last full block
// with the partial block. Note that to support in-place en/decryption,
// the load from the src partial block must happen before the store to
// the dst partial block.
vmovdqu8 16(SRC), %xmm0{%k1}
vmovdqu8 %xmm1, 16(DST){%k1}

lea .Lcts_permute_table(%rip), %r9

// Load the src partial block, left-aligned. Note that to support
// in-place en/decryption, this must happen before the store to the dst
// partial block.
vmovdqu (SRC, LEN64, 1), %xmm1

// Shift the first LEN bytes of the en/decryption of the last full block
// to the end of a register, then store it to DST+LEN. This stores the
// dst partial block. It also writes to the second part of the dst last
// full block, but that part is overwritten later.
vpshufb (%r9, LEN64, 1), %xmm0, %xmm2
vmovdqu %xmm2, (DST, LEN64, 1)

// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
sub LEN64, %r9
vmovdqu 32(%r9), %xmm3

// Shift the src partial block to the beginning of its register.
vpshufb %xmm3, %xmm1, %xmm1

// Do a blend to generate the src partial block followed by the second
// part of the en/decryption of the last full block.
vpblendvb %xmm3, %xmm0, %xmm1, %xmm0

// En/decrypt again and store the last full block.
_aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0
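
// To illustrate with LEN = 4: the 20 trailing bytes are one full 16-byte
// block followed by a 4-byte partial block. The en/decryption of the full
// block (in %xmm0 above) gives up its first 4 bytes, which become the 4-byte
// dst partial block; those 4 bytes are replaced by the 4 src partial-block
// bytes, and the resulting 16-byte block is en/decrypted with the remaining
// tweak and stored as the last full dst block.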

// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
//                         u8 iv[AES_BLOCK_SIZE]);
SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
vmovdqu (%rsi), %xmm0
vpxor (%rdi), %xmm0, %xmm0
movl 480(%rdi), %eax // AES key length
lea -16(%rdi, %rax, 4), %rdi
cmp $24, %eax
jl .Lencrypt_iv_aes128
je .Lencrypt_iv_aes192
vaesenc -6*16(%rdi), %xmm0, %xmm0
vaesenc -5*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes192:
vaesenc -4*16(%rdi), %xmm0, %xmm0
vaesenc -3*16(%rdi), %xmm0, %xmm0
.Lencrypt_iv_aes128:
vaesenc -2*16(%rdi), %xmm0, %xmm0
vaesenc -1*16(%rdi), %xmm0, %xmm0
vaesenc 0*16(%rdi), %xmm0, %xmm0
vaesenc 1*16(%rdi), %xmm0, %xmm0
vaesenc 2*16(%rdi), %xmm0, %xmm0
vaesenc 3*16(%rdi), %xmm0, %xmm0
vaesenc 4*16(%rdi), %xmm0, %xmm0
vaesenc 5*16(%rdi), %xmm0, %xmm0
vaesenc 6*16(%rdi), %xmm0, %xmm0
vaesenclast 7*16(%rdi), %xmm0, %xmm0
vmovdqu %xmm0, (%rsi)
SYM_FUNC_END(aes_xts_encrypt_iv)

// Below are the actual AES-XTS encryption and decryption functions,
// instantiated from the above macro. They all have the following prototype:
//
// void (*xts_asm_func)(const struct crypto_aes_ctx *key,
//                      const u8 *src, u8 *dst, unsigned int len,
//                      u8 tweak[AES_BLOCK_SIZE]);
//
// |key| is the data key. |tweak| contains the next tweak; the encryption of
// the original IV with the tweak key was already done. This function supports
// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
// |len| must be a multiple of 16 except on the last call. If |len| is a
// multiple of 16, then this function updates |tweak| to contain the next tweak.
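//
// A rough caller-side sketch for illustration only (the actual glue code lives
// in the corresponding C source and may differ):
//
//	u8 tweak[AES_BLOCK_SIZE];
//
//	memcpy(tweak, iv, AES_BLOCK_SIZE);
//	aes_xts_encrypt_iv(tweak_key, tweak);	// encrypt IV to get first tweak
//	aes_xts_encrypt_aesni_avx(data_key, src, dst, len, tweak);
//
// For multi-part processing, every call except the last must pass a |len| that
// is a multiple of 16 so that |tweak| is updated for the next call.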

SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
SYM_FUNC_END(aes_xts_decrypt_aesni_avx)

#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)

SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)

SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */