arch/x86/crypto/aes-xts-avx-x86_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * AES-XTS for modern x86_64 CPUs
   4  *
   5  * Copyright 2024 Google LLC
   6  *
   7  * Author: Eric Biggers <ebiggers@google.com>
   8  */
   9
  10 /*
  11  * This file implements AES-XTS for modern x86_64 CPUs.  To handle the
  12  * complexities of coding for x86 SIMD, e.g. where every vector length needs
  13  * different code, it uses a macro to generate several implementations that
  14  * share similar source code but are targeted at different CPUs, listed below:
  15  *
  16  * AES-NI + AVX
  17  *    - 128-bit vectors (1 AES block per vector)
  18  *    - VEX-coded instructions
  19  *    - xmm0-xmm15
  20  *    - This is for older CPUs that lack VAES but do have AVX.
  21  *
  22  * VAES + VPCLMULQDQ + AVX2
  23  *    - 256-bit vectors (2 AES blocks per vector)
  24  *    - VEX-coded instructions
  25  *    - ymm0-ymm15
  26  *    - This is for CPUs that have VAES but lack AVX512 or AVX10,
  27  *      e.g. Intel's Alder Lake and AMD's Zen 3.
  28  *
  29  * VAES + VPCLMULQDQ + AVX10/256 + BMI2
  30  *    - 256-bit vectors (2 AES blocks per vector)
  31  *    - EVEX-coded instructions
  32  *    - ymm0-ymm31
  33  *    - This is for CPUs that have AVX512 but where using zmm registers causes
  34  *      downclocking, and for CPUs that have AVX10/256 but not AVX10/512.
  35  *    - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256.
  36  *      To avoid confusion with 512-bit, we just write AVX10/256.
  37  *
  38  * VAES + VPCLMULQDQ + AVX10/512 + BMI2
  39  *    - Same as the previous one, but upgrades to 512-bit vectors
  40  *      (4 AES blocks per vector) in zmm0-zmm31.
  41  *    - This is for CPUs that have good AVX512 or AVX10/512 support.
  42  *
  43  * This file doesn't have an implementation for AES-NI alone (without AVX), as
  44  * the lack of VEX would make all the assembly code different.
  45  *
  46  * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of
  47  * the XTS tweaks.  This avoids a bottleneck.  Currently there don't seem to be
  48  * any CPUs that support VAES but not VPCLMULQDQ.  If that changes, we might
  49  * need to start also providing an implementation using VAES alone.
  50  *
  51  * The AES-XTS implementations in this file support everything required by the
  52  * crypto API, including support for arbitrary input lengths and multi-part
  53  * processing.  However, they are most heavily optimized for the common case of
  54  * power-of-2 length inputs that are processed in a single part (disk sectors).
  55  */
  56
  57 #include <linux/linkage.h>
  58 #include <linux/cfi_types.h>
  59
  60 .section .rodata
     // Align the constants below to 2^4 = 16 bytes.
  61 .p2align 4
  62 .Lgf_poly:
  63         // The low 64 bits of this value represent the polynomial x^7 + x^2 + x
  64         // + 1.  It is the value that must be XOR'd into the low 64 bits of the
  65         // tweak each time a 1 is carried out of the high 64 bits.
  66         //
  67         // The high 64 bits of this value are just the internal carry bit that
  68         // exists when there's a carry out of the low 64 bits of the tweak.
             //
             // The tweak macros use this constant both as an AND mask (vpand in
             // _next_tweak) and as a carryless-multiplication operand (vpclmulqdq).
  69         .quad   0x87, 1
  70
  71         // This table contains constants for vpshufb and vpblendvb, used to
  72         // handle variable byte shifts and blending during ciphertext stealing
  73         // on CPUs that don't support AVX10-style masking.
             //
             // Layout: 16 bytes of 0x80, then the byte indices 0..15, then 16 more
             // bytes of 0x80.  With 1 <= LEN <= 15, the ciphertext stealing code
             // reads 16 bytes at two offsets:
             //   offset LEN:      [0x80 x (16-LEN), 0, 1, ..., LEN-1]
             //   offset 32 - LEN: [16-LEN, ..., 15, 0x80 x (16-LEN)]
             // A control byte with its top bit set makes vpshufb produce a zero
             // byte, and the top bit of each mask byte is what vpblendvb uses to
             // choose between its two sources.
  74 .Lcts_permute_table:
  75         .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  76         .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  77         .byte   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
  78         .byte   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
  79         .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  80         .byte   0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
  81 .text
  82
  83 // Function parameters
     // These are the first five integer argument registers of the System V AMD64
     // calling convention, in order.
  84 .set    KEY,            %rdi    // Initially points to crypto_aes_ctx, then is
  85                                 // advanced to point to 7th-from-last round key
  86 .set    SRC,            %rsi    // Pointer to next source data
  87 .set    DST,            %rdx    // Pointer to next destination data
  88 .set    LEN,            %ecx    // Remaining length in bytes
  89 .set    LEN8,           %cl     // Low 8 bits of LEN
  90 .set    LEN64,          %rcx    // 64-bit view of LEN, used in address operands
  91 .set    TWEAK,          %r8     // Pointer to next tweak
  92
  93 // %rax holds the AES key length in bytes.
     // It is loaded by _aes_xts_crypt.  Before that load, the decryption path
     // briefly uses %eax as a scratch register.  KEYLEN64 is the 64-bit view of
     // the same register, used as an index in address computations.
  94 .set    KEYLEN,         %eax
  95 .set    KEYLEN64,       %rax
  96
  97 // %r9-r11 are available as temporaries.
  98
  99 .macro  _define_Vi      i
             // Define the assembler symbol V\i as SIMD register number \i at the
             // width selected by VL: %xmm\i for VL=16, %ymm\i for VL=32, or
             // %zmm\i for VL=64.  Any other value of VL aborts assembly via .error.
             // VL must already be defined when this macro is expanded; its
             // definition is not in this part of the file.
 100 .if VL == 16
 101         .set    V\i,            %xmm\i
 102 .elseif VL == 32
 103         .set    V\i,            %ymm\i
 104 .elseif VL == 64
 105         .set    V\i,            %zmm\i
 106 .else
 107         .error "Unsupported Vector Length (VL)"
 108 .endif
 109 .endm
 110
 111 .macro _define_aliases
 112         // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers
 113         // are available, that map to the xmm, ymm, or zmm registers according
 114         // to the selected Vector Length (VL).
             //
             // Both VL and USE_AVX10 must already be defined when this macro is
             // expanded.  Every alias is created with .set, so expanding the macro
             // again with different VL / USE_AVX10 values redefines the aliases.
 115         _define_Vi      0
 116         _define_Vi      1
 117         _define_Vi      2
 118         _define_Vi      3
 119         _define_Vi      4
 120         _define_Vi      5
 121         _define_Vi      6
 122         _define_Vi      7
 123         _define_Vi      8
 124         _define_Vi      9
 125         _define_Vi      10
 126         _define_Vi      11
 127         _define_Vi      12
 128         _define_Vi      13
 129         _define_Vi      14
 130         _define_Vi      15
 131 .if USE_AVX10
 132         _define_Vi      16
 133         _define_Vi      17
 134         _define_Vi      18
 135         _define_Vi      19
 136         _define_Vi      20
 137         _define_Vi      21
 138         _define_Vi      22
 139         _define_Vi      23
 140         _define_Vi      24
 141         _define_Vi      25
 142         _define_Vi      26
 143         _define_Vi      27
 144         _define_Vi      28
 145         _define_Vi      29
 146         _define_Vi      30
 147         _define_Vi      31
 148 .endif
 149
 150         // V0-V3 hold the data blocks during the main loop, or temporary values
 151         // otherwise.  V4-V5 hold temporary values.
 152
 153         // V6-V9 hold XTS tweaks.  Each 128-bit lane holds one tweak.
             // The *_XMM aliases below name the xmm register with the same number
             // as the corresponding V register, i.e. its lowest 128-bit lane.
 154         .set    TWEAK0_XMM,     %xmm6
 155         .set    TWEAK0,         V6
 156         .set    TWEAK1_XMM,     %xmm7
 157         .set    TWEAK1,         V7
 158         .set    TWEAK2,         V8
 159         .set    TWEAK3,         V9
 160
 161         // V10-V13 are used for computing the next values of TWEAK[0-3].
 162         .set    NEXT_TWEAK0,    V10
 163         .set    NEXT_TWEAK1,    V11
 164         .set    NEXT_TWEAK2,    V12
 165         .set    NEXT_TWEAK3,    V13
 166
 167         // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes.
 168         .set    GF_POLY_XMM,    %xmm14
 169         .set    GF_POLY,        V14
 170
 171         // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
 172         .set    KEY0_XMM,       %xmm15
 173         .set    KEY0,           V15
 174
 175         // If 32 SIMD registers are available, then V16-V29 hold the remaining
 176         // AES round keys, copied to all 128-bit lanes.
 177         //
 178         // AES-128, AES-192, and AES-256 use different numbers of round keys.
 179         // To allow handling all three variants efficiently, we align the round
 180         // keys to the *end* of this register range.  I.e., AES-128 uses
 181         // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
 182         // (All also use KEY0 for the XOR-only "round" at the beginning.)
             //
             // Without USE_AVX10 the symbols KEY1-KEY14 are not defined here, and
             // the AES round macros read those round keys from memory instead.
 183 .if USE_AVX10
 184         .set    KEY1_XMM,       %xmm16
 185         .set    KEY1,           V16
 186         .set    KEY2_XMM,       %xmm17
 187         .set    KEY2,           V17
 188         .set    KEY3_XMM,       %xmm18
 189         .set    KEY3,           V18
 190         .set    KEY4_XMM,       %xmm19
 191         .set    KEY4,           V19
 192         .set    KEY5_XMM,       %xmm20
 193         .set    KEY5,           V20
 194         .set    KEY6_XMM,       %xmm21
 195         .set    KEY6,           V21
 196         .set    KEY7_XMM,       %xmm22
 197         .set    KEY7,           V22
 198         .set    KEY8_XMM,       %xmm23
 199         .set    KEY8,           V23
 200         .set    KEY9_XMM,       %xmm24
 201         .set    KEY9,           V24
 202         .set    KEY10_XMM,      %xmm25
 203         .set    KEY10,          V25
 204         .set    KEY11_XMM,      %xmm26
 205         .set    KEY11,          V26
 206         .set    KEY12_XMM,      %xmm27
 207         .set    KEY12,          V27
 208         .set    KEY13_XMM,      %xmm28
 209         .set    KEY13,          V28
 210         .set    KEY14_XMM,      %xmm29
 211         .set    KEY14,          V29
 212 .endif
 213         // V30-V31 are currently unused.
 214 .endm
 215
 216 // Move a vector between memory and a register.
     //
     // Operands are in AT&T order: \src first, then \dst.  No alignment is
     // required.  For 64-byte vectors (VL=64) the EVEX-coded vmovdqu8 is used, as
     // plain vmovdqu has no zmm form.
 217 .macro  _vmovdqu        src, dst
 218 .if VL < 64
 219         vmovdqu         \src, \dst
 220 .else
 221         vmovdqu8        \src, \dst
 222 .endif
 223 .endm
 224
 225 // Broadcast a 128-bit value into a vector.
     //
     // \src is the 128-bit source (a memory operand at every use visible in this
     // file) and \dst is the destination vector register.  Three cases:
     //   - VL=16 without AVX10: the vector is a single 128-bit lane, so a plain
     //     unaligned load is sufficient.
     //   - VL=32 without AVX10: vbroadcasti128 copies the value to both lanes.
     //   - Otherwise: the EVEX-coded vbroadcasti32x4 copies it to every lane.
 226 .macro  _vbroadcast128  src, dst
 227 .if VL == 16 && !USE_AVX10
 228         vmovdqu         \src, \dst
 229 .elseif VL == 32 && !USE_AVX10
 230         vbroadcasti128  \src, \dst
 231 .else
 232         vbroadcasti32x4 \src, \dst
 233 .endif
 234 .endm
 235
 236 // XOR two vectors together.
     //
     // Computes \dst = \src1 ^ \src2.  With USE_AVX10 the EVEX-coded vpxord is
     // emitted (vpxor has no zmm form); otherwise the VEX-coded vpxor is used.
 237 .macro  _vpxor  src1, src2, dst
 238 .if USE_AVX10
 239         vpxord          \src1, \src2, \dst
 240 .else
 241         vpxor           \src1, \src2, \dst
 242 .endif
 243 .endm
 244
 245 // XOR three vectors together.
     //
     // Computes \src3_and_dst = \src1 ^ \src2 ^ \src3_and_dst.  The third operand
     // is both an input and the output.  With USE_AVX10 this is one instruction;
     // otherwise it takes two dependent vpxor instructions.
 246 .macro  _xor3   src1, src2, src3_and_dst
 247 .if USE_AVX10
 248         // vpternlogd with immediate 0x96 is a three-argument XOR.
 249         vpternlogd      $0x96, \src1, \src2, \src3_and_dst
 250 .else
 251         vpxor           \src1, \src3_and_dst, \src3_and_dst
 252         vpxor           \src2, \src3_and_dst, \src3_and_dst
 253 .endif
 254 .endm
 255
 256 // Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak
 257 // (by multiplying by the polynomial 'x') and write it to \dst.
     //
     // \tmp is clobbered and must be a different register from both \src and
     // \dst.  \dst may be the same register as \src.  GF_POLY_XMM must already
     // hold the constant from .Lgf_poly.
 258 .macro  _next_tweak     src, tmp, dst
             // Dword 0 of \tmp = dword 3 of \src (its sign bit is tweak bit 127);
             // dword 2 of \tmp = dword 1 of \src (its sign bit is tweak bit 63).
 259         vpshufd         $0x13, \src, \tmp
             // Shift each 64-bit half of the tweak left by one bit.
 260         vpaddq          \src, \src, \dst
             // Turn every dword of \tmp into all-ones or all-zeroes per its sign bit.
 261         vpsrad          $31, \tmp, \tmp
             // Keep 0x87 in the low half if bit 127 was set, and 1 in the high half
             // if bit 63 was set.  All other bits become zero.
 262         vpand           GF_POLY_XMM, \tmp, \tmp
             // Apply the reduction and the carry between the two halves.
 263         vpxor           \tmp, \dst, \dst
 264 .endm
 265
 266 // Given the XTS tweak(s) in the vector \src, compute the next vector of
 267 // tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst.
 268 //
 269 // If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute
 270 // all tweaks in the vector in parallel.  If VL=16, we just do the regular
 271 // computation without vpclmulqdq, as it's the faster method for a single tweak.
     //
     // \tmp1 and \tmp2 are clobbered (only \tmp1 when VL=16).  \dst may be the
     // same register as \src.  GF_POLY must already hold the .Lgf_poly constant
     // in every 128-bit lane.
 272 .macro  _next_tweakvec  src, tmp1, tmp2, dst
 273 .if VL == 16
 274         _next_tweak     \src, \tmp1, \dst
 275 .else
             // \tmp1 = the VL/16 bits shifted out of the top of each 64-bit half.
 276         vpsrlq          $64 - VL/16, \src, \tmp1
             // \tmp2 = carryless product of the bits shifted out of the high half
             // and the low qword of GF_POLY (0x87), i.e. the reduction term.
 277         vpclmulqdq      $0x01, GF_POLY, \tmp1, \tmp2
             // Move the bits shifted out of the low half up into the high half.
 278         vpslldq         $8, \tmp1, \tmp1
             // Shift each 64-bit half of each tweak left by VL/16 bits.
 279         vpsllq          $VL/16, \src, \dst
             // \dst ^= \tmp1 ^ \tmp2
 280         _xor3           \tmp1, \tmp2, \dst
 281 .endif
 282 .endm
 283
 284 // Given the first XTS tweak at (TWEAK), compute the first set of tweaks and
 285 // store them in the vector registers TWEAK0-TWEAK3.  Clobbers V0-V5.
     //
     // This macro also loads GF_POLY from .Lgf_poly, which the other tweak macros
     // rely on.  On completion, 128-bit lane j of TWEAKn holds the first tweak
     // multiplied by x^(n*VL/16 + j), so TWEAK0-TWEAK3 together hold the tweaks
     // for 4*VL/16 consecutive blocks.
 286 .macro  _compute_first_set_of_tweaks
 287         vmovdqu         (TWEAK), TWEAK0_XMM
 288         _vbroadcast128  .Lgf_poly(%rip), GF_POLY
 289 .if VL == 16
 290         // With VL=16, multiplying by x serially is fastest.
 291         _next_tweak     TWEAK0, %xmm0, TWEAK1
 292         _next_tweak     TWEAK1, %xmm0, TWEAK2
 293         _next_tweak     TWEAK2, %xmm0, TWEAK3
 294 .else
 295 .if VL == 32
 296         // Compute the second block of TWEAK0.
 297         _next_tweak     TWEAK0_XMM, %xmm0, %xmm1
 298         vinserti128     $1, %xmm1, TWEAK0, TWEAK0
 299 .elseif VL == 64
 300         // Compute the remaining blocks of TWEAK0.
 301         _next_tweak     TWEAK0_XMM, %xmm0, %xmm1
 302         _next_tweak     %xmm1, %xmm0, %xmm2
 303         _next_tweak     %xmm2, %xmm0, %xmm3
 304         vinserti32x4    $1, %xmm1, TWEAK0, TWEAK0
 305         vinserti32x4    $2, %xmm2, TWEAK0, TWEAK0
 306         vinserti32x4    $3, %xmm3, TWEAK0, TWEAK0
 307 .endif
 308         // Compute TWEAK[1-3] from TWEAK0.
             // This is the same computation as the VL > 16 case of _next_tweakvec,
             // done for the shift amounts 1*VL/16, 2*VL/16 and 3*VL/16 with the
             // three instruction streams interleaved.
 309         vpsrlq          $64 - 1*VL/16, TWEAK0, V0
 310         vpsrlq          $64 - 2*VL/16, TWEAK0, V2
 311         vpsrlq          $64 - 3*VL/16, TWEAK0, V4
 312         vpclmulqdq      $0x01, GF_POLY, V0, V1
 313         vpclmulqdq      $0x01, GF_POLY, V2, V3
 314         vpclmulqdq      $0x01, GF_POLY, V4, V5
 315         vpslldq         $8, V0, V0
 316         vpslldq         $8, V2, V2
 317         vpslldq         $8, V4, V4
 318         vpsllq          $1*VL/16, TWEAK0, TWEAK1
 319         vpsllq          $2*VL/16, TWEAK0, TWEAK2
 320         vpsllq          $3*VL/16, TWEAK0, TWEAK3
 321 .if USE_AVX10
 322         vpternlogd      $0x96, V0, V1, TWEAK1
 323         vpternlogd      $0x96, V2, V3, TWEAK2
 324         vpternlogd      $0x96, V4, V5, TWEAK3
 325 .else
 326         vpxor           V0, TWEAK1, TWEAK1
 327         vpxor           V2, TWEAK2, TWEAK2
 328         vpxor           V4, TWEAK3, TWEAK3
 329         vpxor           V1, TWEAK1, TWEAK1
 330         vpxor           V3, TWEAK2, TWEAK2
 331         vpxor           V5, TWEAK3, TWEAK3
 332 .endif
 333 .endif
 334 .endm
 335
 336 // Do one step in computing the next set of tweaks using the method of just
 337 // multiplying by x repeatedly (the same method _next_tweak uses).
     //
     // Steps 0-19 form four groups of five instructions.  Group n computes
     // NEXT_TWEAKn from the previous tweak in the chain (TWEAK3 for group 0, then
     // NEXT_TWEAK0, NEXT_TWEAK1, NEXT_TWEAK2), using V5 as the temporary.  Step
     // 1000 copies NEXT_TWEAK[0-3] into TWEAK[0-3].  Any other value of \i emits
     // nothing.
     //
     // PREV_TWEAK and NEXT_TWEAK are assembler symbols that are reassigned at
     // steps 0, 5, 10 and 15 and read by the steps that follow, so the steps must
     // be expanded in increasing order of \i.
 338 .macro  _tweak_step_mulx        i
 339 .if \i == 0
 340         .set PREV_TWEAK, TWEAK3
 341         .set NEXT_TWEAK, NEXT_TWEAK0
 342 .elseif \i == 5
 343         .set PREV_TWEAK, NEXT_TWEAK0
 344         .set NEXT_TWEAK, NEXT_TWEAK1
 345 .elseif \i == 10
 346         .set PREV_TWEAK, NEXT_TWEAK1
 347         .set NEXT_TWEAK, NEXT_TWEAK2
 348 .elseif \i == 15
 349         .set PREV_TWEAK, NEXT_TWEAK2
 350         .set NEXT_TWEAK, NEXT_TWEAK3
 351 .endif
             // Emit the (\i % 5)-th instruction of the _next_tweak sequence.
 352 .if \i >= 0 && \i < 20 && \i % 5 == 0
 353         vpshufd         $0x13, PREV_TWEAK, V5
 354 .elseif \i >= 0 && \i < 20 && \i % 5 == 1
 355         vpaddq          PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
 356 .elseif \i >= 0 && \i < 20 && \i % 5 == 2
 357         vpsrad          $31, V5, V5
 358 .elseif \i >= 0 && \i < 20 && \i % 5 == 3
 359         vpand           GF_POLY, V5, V5
 360 .elseif \i >= 0 && \i < 20 && \i % 5 == 4
 361         vpxor           V5, NEXT_TWEAK, NEXT_TWEAK
 362 .elseif \i == 1000
 363         vmovdqa         NEXT_TWEAK0, TWEAK0
 364         vmovdqa         NEXT_TWEAK1, TWEAK1
 365         vmovdqa         NEXT_TWEAK2, TWEAK2
 366         vmovdqa         NEXT_TWEAK3, TWEAK3
 367 .endif
 368 .endm
 369
 370 // Do one step in computing the next set of tweaks using the VPCLMULQDQ method
 371 // (the same method _next_tweakvec uses for VL > 16).  This means multiplying
 372 // each tweak by x^(4*VL/16) independently.  Since 4*VL/16 is a multiple of 8
 373 // when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
 374 // which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
     //
     // Only steps 0, 2, ..., 14 and 1000 emit instructions; every other value of
     // \i expands to nothing.
     //   - Steps 0-6:   NEXT_TWEAKn = the top 4*VL/16 bits of each tweak in
     //                  TWEAKn, moved to the bottom of its 128-bit lane.
     //   - Steps 8-14:  NEXT_TWEAKn = carryless product of those bits and the
     //                  low qword of GF_POLY, i.e. the reduction term.
     //   - Step 1000:   shift each tweak left by 4*VL/16 bits and XOR in the
     //                  reduction term, updating TWEAK[0-3] in place.
 375 .macro  _tweak_step_pclmul      i
 376 .if \i == 0
 377         vpsrldq         $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
 378 .elseif \i == 2
 379         vpsrldq         $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
 380 .elseif \i == 4
 381         vpsrldq         $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
 382 .elseif \i == 6
 383         vpsrldq         $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
 384 .elseif \i == 8
 385         vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
 386 .elseif \i == 10
 387         vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
 388 .elseif \i == 12
 389         vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
 390 .elseif \i == 14
 391         vpclmulqdq      $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
 392 .elseif \i == 1000
 393         vpslldq         $(4*VL/16) / 8, TWEAK0, TWEAK0
 394         vpslldq         $(4*VL/16) / 8, TWEAK1, TWEAK1
 395         vpslldq         $(4*VL/16) / 8, TWEAK2, TWEAK2
 396         vpslldq         $(4*VL/16) / 8, TWEAK3, TWEAK3
 397         _vpxor          NEXT_TWEAK0, TWEAK0, TWEAK0
 398         _vpxor          NEXT_TWEAK1, TWEAK1, TWEAK1
 399         _vpxor          NEXT_TWEAK2, TWEAK2, TWEAK2
 400         _vpxor          NEXT_TWEAK3, TWEAK3, TWEAK3
 401 .endif
 402 .endm
 403
 404 // _tweak_step does one step of the computation of the next set of tweaks from
 405 // TWEAK[0-3].  To complete all steps, this is invoked with increasing values of
 406 // \i that include at least 0 through 19, then 1000 which signals the last step.
 407 //
 408 // This is used to interleave the computation of the next set of tweaks with the
 409 // AES en/decryptions, which increases performance in some cases.
     //
     // Until step 1000 has been expanded, TWEAK[0-3] still hold the current set of
     // tweaks.  Values of \i that neither method recognizes expand to nothing.
     // With VL=16 the serial multiply-by-x method is used; otherwise the
     // VPCLMULQDQ method is used.
 410 .macro  _tweak_step     i
 411 .if VL == 16
 412         _tweak_step_mulx        \i
 413 .else
 414         _tweak_step_pclmul      \i
 415 .endif
 416 .endm
 417
 418 .macro  _setup_round_keys       enc
             // Load the "round 0" key into KEY0, rebase KEY so that the last round
             // key is at 7*16(KEY), and, if USE_AVX10, cache the remaining round
             // keys that the key length uses in KEY1-KEY14.
             //
             // \enc selects the encryption (1) or decryption (0) round keys.
             // KEYLEN must already hold the AES key length in bytes.  KEY is
             // modified, and the flags are clobbered when USE_AVX10 is set.
 419
 420         // Select either the encryption round keys or the decryption round keys.
 421 .if \enc
 422         .set    OFFS, 0
 423 .else
 424         .set    OFFS, 240
 425 .endif
 426
 427         // Load the round key for "round 0".
 428         _vbroadcast128  OFFS(KEY), KEY0
 429
 430         // Increment KEY to make it so that 7*16(KEY) is the last round key.
 431         // For AES-128, increment by 3*16, resulting in the 10 round keys (not
 432         // counting the zero-th round key which was just loaded into KEY0) being
 433         // -2*16(KEY) through 7*16(KEY).  For AES-192, increment by 5*16 and use
 434         // 12 round keys -4*16(KEY) through 7*16(KEY).  For AES-256, increment
 435         // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
 436         //
 437         // This rebasing provides two benefits.  First, it makes the offset to
 438         // any round key be in the range [-96, 112], fitting in a signed byte.
 439         // This shortens VEX-encoded instructions that access the later round
 440         // keys which otherwise would need 4-byte offsets.  Second, it makes it
 441         // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
 442         // beginning.  Skipping rounds at the end doesn't work as well because
 443         // the last round needs different instructions.
 444         //
 445         // An alternative approach would be to roll up all the round loops.  We
 446         // don't do that because it isn't compatible with caching the round keys
 447         // in registers which we do when possible (see below), and also because
 448         // it seems unwise to rely *too* heavily on the CPU's branch predictor.
             //
             // The increment is 4*KEYLEN - 16 (plus OFFS): 48, 80 or 112 bytes for
             // key lengths of 16, 24 or 32 bytes.
 449         lea             OFFS-16(KEY, KEYLEN64, 4), KEY
 450
 451         // If all 32 SIMD registers are available, cache all the round keys.
             // KEYLEN < 24 means AES-128 and KEYLEN == 24 means AES-192; both skip
             // the loads of the round keys they don't use.  Otherwise (AES-256)
             // all 14 remaining round keys are loaded.
 452 .if USE_AVX10
 453         cmp             $24, KEYLEN
 454         jl              .Laes128\@
 455         je              .Laes192\@
 456         _vbroadcast128  -6*16(KEY), KEY1
 457         _vbroadcast128  -5*16(KEY), KEY2
 458 .Laes192\@:
 459         _vbroadcast128  -4*16(KEY), KEY3
 460         _vbroadcast128  -3*16(KEY), KEY4
 461 .Laes128\@:
 462         _vbroadcast128  -2*16(KEY), KEY5
 463         _vbroadcast128  -1*16(KEY), KEY6
 464         _vbroadcast128  0*16(KEY), KEY7
 465         _vbroadcast128  1*16(KEY), KEY8
 466         _vbroadcast128  2*16(KEY), KEY9
 467         _vbroadcast128  3*16(KEY), KEY10
 468         _vbroadcast128  4*16(KEY), KEY11
 469         _vbroadcast128  5*16(KEY), KEY12
 470         _vbroadcast128  6*16(KEY), KEY13
 471         _vbroadcast128  7*16(KEY), KEY14
 472 .endif
 473 .endm
 474
 475 // Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0)
 476 // on the block(s) in \data using the round key(s) in \key.  The register length
 477 // determines the number of AES blocks en/decrypted.
     //
     // \last selects the final-round instruction (vaesenclast / vaesdeclast)
     // instead of the regular round instruction (vaesenc / vaesdec).  \data is
     // updated in place.  \key is passed through as the instruction's first
     // operand, so callers may pass either a register or a memory operand.
 478 .macro  _vaes   enc, last, key, data
 479 .if \enc
 480 .if \last
 481         vaesenclast     \key, \data, \data
 482 .else
 483         vaesenc         \key, \data, \data
 484 .endif
 485 .else
 486 .if \last
 487         vaesdeclast     \key, \data, \data
 488 .else
 489         vaesdec         \key, \data, \data
 490 .endif
 491 .endif
 492 .endm
 493
 494 // Do a single round of AES en/decryption on the block(s) in \data, using the
 495 // same key for all block(s).  The round key is loaded from the appropriate
 496 // register or memory location for round \i.  May clobber V4.
     //
     // \xmm_suffix is either _XMM (a single block in an xmm register) or empty (a
     // full vector of length VL).  The round key comes from:
     //   - USE_AVX10:            the cached register KEY\i or KEY\i_XMM.
     //   - otherwise, with _XMM: the memory operand (\i-7)*16(KEY) directly.
     //   - otherwise, no suffix: (\i-7)*16(KEY) broadcast into V4 first.  This
     //                           is the only case that clobbers V4.
 497 .macro _vaes_1x         enc, last, i, xmm_suffix, data
 498 .if USE_AVX10
 499         _vaes           \enc, \last, KEY\i\xmm_suffix, \data
 500 .else
 501 .ifnb \xmm_suffix
 502         _vaes           \enc, \last, (\i-7)*16(KEY), \data
 503 .else
 504         _vbroadcast128  (\i-7)*16(KEY), V4
 505         _vaes           \enc, \last, V4, \data
 506 .endif
 507 .endif
 508 .endm
 509
 510 // Do a single round of AES en/decryption on the blocks in registers V0-V3,
 511 // using the same key for all blocks.  The round key is loaded from the
 512 // appropriate register or memory location for round \i.  In addition, does two
 513 // steps of the computation of the next set of tweaks.  May clobber V4.
     //
     // The tweak step indices are 2*(\i-5) and 2*(\i-5) + 1, so rounds 5-14 cover
     // tweak steps 0-19.  Rounds 1-4 pass negative indices, which _tweak_step
     // expands to nothing; the tweak computation is therefore interleaved only
     // with the rounds that all key lengths execute.  Without USE_AVX10 the round
     // key is broadcast from (\i-7)*16(KEY) into V4.
 514 .macro  _vaes_4x        enc, last, i
 515 .if USE_AVX10
 516         _tweak_step     (2*(\i-5))
 517         _vaes           \enc, \last, KEY\i, V0
 518         _vaes           \enc, \last, KEY\i, V1
 519         _tweak_step     (2*(\i-5) + 1)
 520         _vaes           \enc, \last, KEY\i, V2
 521         _vaes           \enc, \last, KEY\i, V3
 522 .else
 523         _vbroadcast128  (\i-7)*16(KEY), V4
 524         _tweak_step     (2*(\i-5))
 525         _vaes           \enc, \last, V4, V0
 526         _vaes           \enc, \last, V4, V1
 527         _tweak_step     (2*(\i-5) + 1)
 528         _vaes           \enc, \last, V4, V2
 529         _vaes           \enc, \last, V4, V3
 530 .endif
 531 .endm
 532
 533 // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt,
 534 // then XOR with \tweak again) of the block(s) in \data.  To process a single
 535 // block, use xmm registers and set \xmm_suffix=_XMM.  To process a vector of
 536 // length VL, use V* registers and leave \xmm_suffix empty.  May clobber V4.
     //
     // Requires KEYLEN, KEY0 and the rebased KEY pointer as left by
     // _setup_round_keys.  \data is updated in place and \tweak is left
     // unchanged.  The flags are clobbered by the key length comparison.  The
     // local labels carry the \@ suffix so that each expansion gets its own.
 537 .macro  _aes_crypt      enc, xmm_suffix, tweak, data
             // \data ^= \tweak ^ (round 0 key)
 538         _xor3           KEY0\xmm_suffix, \tweak, \data
             // AES-128 (KEYLEN < 24) starts at round 5 and AES-192 (KEYLEN == 24)
             // at round 3; AES-256 falls through and runs all 14 rounds.
 539         cmp             $24, KEYLEN
 540         jl              .Laes128\@
 541         je              .Laes192\@
 542         _vaes_1x        \enc, 0, 1, \xmm_suffix, \data
 543         _vaes_1x        \enc, 0, 2, \xmm_suffix, \data
 544 .Laes192\@:
 545         _vaes_1x        \enc, 0, 3, \xmm_suffix, \data
 546         _vaes_1x        \enc, 0, 4, \xmm_suffix, \data
 547 .Laes128\@:
 548         _vaes_1x        \enc, 0, 5, \xmm_suffix, \data
 549         _vaes_1x        \enc, 0, 6, \xmm_suffix, \data
 550         _vaes_1x        \enc, 0, 7, \xmm_suffix, \data
 551         _vaes_1x        \enc, 0, 8, \xmm_suffix, \data
 552         _vaes_1x        \enc, 0, 9, \xmm_suffix, \data
 553         _vaes_1x        \enc, 0, 10, \xmm_suffix, \data
 554         _vaes_1x        \enc, 0, 11, \xmm_suffix, \data
 555         _vaes_1x        \enc, 0, 12, \xmm_suffix, \data
 556         _vaes_1x        \enc, 0, 13, \xmm_suffix, \data
 557         _vaes_1x        \enc, 1, 14, \xmm_suffix, \data
 558         _vpxor          \tweak, \data, \data
 559 .endm
 560
 561 .macro  _aes_xts_crypt  enc
             // Expand to the complete body of an AES-XTS encryption (\enc=1) or
             // decryption (\enc=0) routine for the implementation selected by VL
             // and USE_AVX10, both of which must be defined before expansion.
             //
             // Inputs are the registers aliased as KEY, SRC, DST, LEN and TWEAK.
             // The en/decrypted data is written to DST, and the tweak for any
             // continuation call is written back to (TWEAK).  The expansion ends
             // in RET on every path.  %rax, %r9, the flags and the SIMD registers
             // named by _define_aliases are used as working state.
 562         _define_aliases
 563
 564 .if !\enc
 565         // When decrypting a message whose length isn't a multiple of the AES
 566         // block length, exclude the last full block from the main loop by
 567         // subtracting 16 from LEN.  This is needed because ciphertext stealing
 568         // decryption uses the last two tweaks in reverse order.  We'll handle
 569         // the last full block and the partial block specially at the end.
             //
             // %eax is free to use as scratch here because KEYLEN has not been
             // loaded yet.
 570         lea             -16(LEN), %eax
 571         test            $15, LEN8
 572         cmovnz          %eax, LEN
 573 .endif
 574
 575         // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
             // NOTE(review): 480 is presumably the offset of the key length field
             // in struct crypto_aes_ctx; the struct is not visible here - confirm.
 576         movl            480(KEY), KEYLEN
 577
 578         // Setup the pointer to the round keys and cache as many as possible.
 579         _setup_round_keys       \enc
 580
 581         // Compute the first set of tweaks TWEAK[0-3].
 582         _compute_first_set_of_tweaks
 583
             // From here until .Lhandle_remainder, LEN is biased by -4*VL: it is
             // non-negative exactly when at least 4*VL more bytes remain.  The
             // branches on it are signed.
 584         sub             $4*VL, LEN
 585         jl              .Lhandle_remainder\@
 586
 587 .Lmain_loop\@:
 588         // This is the main loop, en/decrypting 4*VL bytes per iteration.
 589
 590         // XOR each source block with its tweak and the zero-th round key.
 591 .if USE_AVX10
 592         vmovdqu8        0*VL(SRC), V0
 593         vmovdqu8        1*VL(SRC), V1
 594         vmovdqu8        2*VL(SRC), V2
 595         vmovdqu8        3*VL(SRC), V3
 596         vpternlogd      $0x96, TWEAK0, KEY0, V0
 597         vpternlogd      $0x96, TWEAK1, KEY0, V1
 598         vpternlogd      $0x96, TWEAK2, KEY0, V2
 599         vpternlogd      $0x96, TWEAK3, KEY0, V3
 600 .else
 601         vpxor           0*VL(SRC), KEY0, V0
 602         vpxor           1*VL(SRC), KEY0, V1
 603         vpxor           2*VL(SRC), KEY0, V2
 604         vpxor           3*VL(SRC), KEY0, V3
 605         vpxor           TWEAK0, V0, V0
 606         vpxor           TWEAK1, V1, V1
 607         vpxor           TWEAK2, V2, V2
 608         vpxor           TWEAK3, V3, V3
 609 .endif
 610         cmp             $24, KEYLEN
 611         jl              .Laes128\@
 612         je              .Laes192\@
 613         // Do all the AES rounds on the data blocks, interleaved with
 614         // the computation of the next set of tweaks.
 615         _vaes_4x        \enc, 0, 1
 616         _vaes_4x        \enc, 0, 2
 617 .Laes192\@:
 618         _vaes_4x        \enc, 0, 3
 619         _vaes_4x        \enc, 0, 4
 620 .Laes128\@:
 621         _vaes_4x        \enc, 0, 5
 622         _vaes_4x        \enc, 0, 6
 623         _vaes_4x        \enc, 0, 7
 624         _vaes_4x        \enc, 0, 8
 625         _vaes_4x        \enc, 0, 9
 626         _vaes_4x        \enc, 0, 10
 627         _vaes_4x        \enc, 0, 11
 628         _vaes_4x        \enc, 0, 12
 629         _vaes_4x        \enc, 0, 13
 630         _vaes_4x        \enc, 1, 14
 631
 632         // XOR in the tweaks again.
             // TWEAK[0-3] still hold the current tweaks here: the interleaved
             // tweak steps only write NEXT_TWEAK[0-3] and temporaries until the
             // final step (1000) below.
 633         _vpxor          TWEAK0, V0, V0
 634         _vpxor          TWEAK1, V1, V1
 635         _vpxor          TWEAK2, V2, V2
 636         _vpxor          TWEAK3, V3, V3
 637
 638         // Store the destination blocks.
 639         _vmovdqu        V0, 0*VL(DST)
 640         _vmovdqu        V1, 1*VL(DST)
 641         _vmovdqu        V2, 2*VL(DST)
 642         _vmovdqu        V3, 3*VL(DST)
 643
 644         // Finish computing the next set of tweaks.
 645         _tweak_step     1000
 646
 647         add             $4*VL, SRC
 648         add             $4*VL, DST
 649         sub             $4*VL, LEN
 650         jge             .Lmain_loop\@
 651
 652         // Check for the uncommon case where the data length isn't a multiple of
 653         // 4*VL.  Handle it out-of-line in order to optimize for the common
 654         // case.  In the common case, just fall through to the ret.
             //
             // Testing only the low byte of the biased LEN is sufficient: 4*VL is
             // a power of two no larger than 256, so the bias doesn't change LEN
             // modulo 4*VL and the mask 4*VL-1 fits in 8 bits.
 655         test            $4*VL-1, LEN8
 656         jnz             .Lhandle_remainder\@
 657 .Ldone\@:
 658         // Store the next tweak back to *TWEAK to support continuation calls.
 659         vmovdqu         TWEAK0_XMM, (TWEAK)
             // Clear the upper bits of the vector registers before returning if
             // registers wider than 128 bits were used.
 660 .if VL > 16
 661         vzeroupper
 662 .endif
 663         RET
 664
 665 .Lhandle_remainder\@:
 666
 667         // En/decrypt any remaining full blocks, one vector at a time.
 668 .if VL > 16
 669         add             $3*VL, LEN      // Undo extra sub of 4*VL, then sub VL.
 670         jl              .Lvec_at_a_time_done\@
 671 .Lvec_at_a_time\@:
 672         _vmovdqu        (SRC), V0
 673         _aes_crypt      \enc, , TWEAK0, V0
 674         _vmovdqu        V0, (DST)
 675         _next_tweakvec  TWEAK0, V0, V1, TWEAK0
 676         add             $VL, SRC
 677         add             $VL, DST
 678         sub             $VL, LEN
 679         jge             .Lvec_at_a_time\@
 680 .Lvec_at_a_time_done\@:
 681         add             $VL-16, LEN     // Undo extra sub of VL, then sub 16.
 682 .else
 683         add             $4*VL-16, LEN   // Undo extra sub of 4*VL, then sub 16.
 684 .endif
 685
 686         // En/decrypt any remaining full blocks, one at a time.
             // The jl below consumes the flags set by the 'add ..., LEN' just
             // above, in whichever branch of the .if was assembled.
 687         jl              .Lblock_at_a_time_done\@
 688 .Lblock_at_a_time\@:
 689         vmovdqu         (SRC), %xmm0
 690         _aes_crypt      \enc, _XMM, TWEAK0_XMM, %xmm0
 691         vmovdqu         %xmm0, (DST)
 692         _next_tweak     TWEAK0_XMM, %xmm0, TWEAK0_XMM
 693         add             $16, SRC
 694         add             $16, DST
 695         sub             $16, LEN
 696         jge             .Lblock_at_a_time\@
 697 .Lblock_at_a_time_done\@:
 698         add             $16, LEN        // Undo the extra sub of 16.
 699         // Now 0 <= LEN <= 15.  If LEN is zero, we're done.
 700         jz              .Ldone\@
 701
 702         // Otherwise 1 <= LEN <= 15, but the real remaining length is 16 + LEN.
 703         // Do ciphertext stealing to process the last 16 + LEN bytes.
 704
 705 .if \enc
 706         // If encrypting, the main loop already encrypted the last full block to
 707         // create the CTS intermediate ciphertext.  Prepare for the rest of CTS
 708         // by rewinding the pointers and loading the intermediate ciphertext.
 709         sub             $16, SRC
 710         sub             $16, DST
 711         vmovdqu         (DST), %xmm0
 712 .else
 713         // If decrypting, the main loop didn't decrypt the last full block
 714         // because CTS decryption uses the last two tweaks in reverse order.
 715         // Do it now by advancing the tweak and decrypting the last full block.
             // The advanced tweak goes into TWEAK1_XMM; TWEAK0_XMM is kept for the
             // second decryption below.
 716         _next_tweak     TWEAK0_XMM, %xmm0, TWEAK1_XMM
 717         vmovdqu         (SRC), %xmm0
 718         _aes_crypt      \enc, _XMM, TWEAK1_XMM, %xmm0
 719 .endif
 720
             // In both cases SRC and DST now point to the last full block, the
             // partial block starts 16 bytes after them, and %xmm0 holds the
             // en/decryption of the last full block.
 721 .if USE_AVX10
 722         // Create a mask that has the first LEN bits set.
 723         mov             $-1, %r9d
 724         bzhi            LEN, %r9d, %r9d
 725         kmovd           %r9d, %k1
 726
 727         // Swap the first LEN bytes of the en/decryption of the last full block
 728         // with the partial block.  Note that to support in-place en/decryption,
 729         // the load from the src partial block must happen before the store to
 730         // the dst partial block.
 731         vmovdqa         %xmm0, %xmm1
 732         vmovdqu8        16(SRC), %xmm0{%k1}
 733         vmovdqu8        %xmm1, 16(DST){%k1}
 734 .else
 735         lea             .Lcts_permute_table(%rip), %r9
 736
 737         // Load the src partial block, left-aligned.  Note that to support
 738         // in-place en/decryption, this must happen before the store to the dst
 739         // partial block.
 740         vmovdqu         (SRC, LEN64, 1), %xmm1
 741
 742         // Shift the first LEN bytes of the en/decryption of the last full block
 743         // to the end of a register, then store it to DST+LEN.  This stores the
 744         // dst partial block.  It also writes to the second part of the dst last
 745         // full block, but that part is overwritten later.
 746         vpshufb         (%r9, LEN64, 1), %xmm0, %xmm2
 747         vmovdqu         %xmm2, (DST, LEN64, 1)
 748
 749         // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
 750         sub             LEN64, %r9
 751         vmovdqu         32(%r9), %xmm3
 752
 753         // Shift the src partial block to the beginning of its register.
 754         vpshufb         %xmm3, %xmm1, %xmm1
 755
 756         // Do a blend to generate the src partial block followed by the second
 757         // part of the en/decryption of the last full block.
 758         vpblendvb       %xmm3, %xmm0, %xmm1, %xmm0
 759 .endif
 760         // En/decrypt again and store the last full block.
 761         _aes_crypt      \enc, _XMM, TWEAK0_XMM, %xmm0
 762         vmovdqu         %xmm0, (DST)
 763         jmp             .Ldone\@
 764 .endm
 765
 766 // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 767 //                         u8 iv[AES_BLOCK_SIZE]);
     //
     // Encrypt the 16-byte block at |iv| in place with AES, using the round keys
     // stored at 16-byte stride starting at offset 0 of |tweak_key|.  Depending
     // on the key length read from the context, 10, 12, or 14 AES rounds are
     // executed.
     //
     // In:       %rdi = tweak_key, %rsi = iv
     // Out:      the 16 bytes at |iv| are overwritten with their encryption
     // Clobbers: %rax, %rdi, %xmm0, flags
 768 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
             // xmm0 = iv ^ round key 0 (the initial whitening step).
 769         vmovdqu         (%rsi), %xmm0
 770         vpxor           (%rdi), %xmm0, %xmm0
 771         movl            480(%rdi), %eax         // AES key length (presumably 16, 24, or 32 bytes); the 32-bit write also zero-extends into %rax
             // %rdi = tweak_key + 4*keylen - 16.  For key lengths 16, 24, and 32 this
             // makes the first vaesenc executed below read tweak_key + 16 and puts
             // the last round key at 7*16(%rdi), so all key sizes share one tail.
 772         lea             -16(%rdi, %rax, 4), %rdi
             // Both conditional jumps consume the flags of this single compare:
             // keylen < 24 enters at the AES-128 label, keylen == 24 enters at the
             // AES-192 label, and anything else falls through to the AES-256 rounds.
 773         cmp             $24, %eax
 774         jl              .Lencrypt_iv_aes128
 775         je              .Lencrypt_iv_aes192
             // Two rounds that are executed only on the fall-through (AES-256) path.
 776         vaesenc         -6*16(%rdi), %xmm0, %xmm0
 777         vaesenc         -5*16(%rdi), %xmm0, %xmm0
 778 .Lencrypt_iv_aes192:
             // Two rounds executed on the AES-192 and AES-256 paths.
 779         vaesenc         -4*16(%rdi), %xmm0, %xmm0
 780         vaesenc         -3*16(%rdi), %xmm0, %xmm0
 781 .Lencrypt_iv_aes128:
             // Rounds common to every key size: nine vaesenc plus the final round.
 782         vaesenc         -2*16(%rdi), %xmm0, %xmm0
 783         vaesenc         -1*16(%rdi), %xmm0, %xmm0
 784         vaesenc         0*16(%rdi), %xmm0, %xmm0
 785         vaesenc         1*16(%rdi), %xmm0, %xmm0
 786         vaesenc         2*16(%rdi), %xmm0, %xmm0
 787         vaesenc         3*16(%rdi), %xmm0, %xmm0
 788         vaesenc         4*16(%rdi), %xmm0, %xmm0
 789         vaesenc         5*16(%rdi), %xmm0, %xmm0
 790         vaesenc         6*16(%rdi), %xmm0, %xmm0
 791         vaesenclast     7*16(%rdi), %xmm0, %xmm0        // final round uses the last round key
 792         vmovdqu         %xmm0, (%rsi)                   // store the result back over |iv|
 793         RET
 794 SYM_FUNC_END(aes_xts_encrypt_iv)
 795
 796 // Below are the actual AES-XTS encryption and decryption functions,
 797 // instantiated from the above macro.  They all have the following prototype:
 798 //
 799 // void (*xts_asm_func)(const struct crypto_aes_ctx *key,
 800 //                      const u8 *src, u8 *dst, unsigned int len,
 801 //                      u8 tweak[AES_BLOCK_SIZE]);
 802 //
 803 // |key| is the data key.  |tweak| contains the next tweak; the encryption of
 804 // the original IV with the tweak key was already done.  This function supports
 805 // incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
 806 // |len| must be a multiple of 16 except on the last call.  If |len| is a
 807 // multiple of 16, then this function updates |tweak| to contain the next tweak.
 808
 809 .set    VL, 16                  // 128-bit vectors (1 AES block per vector)
 810 .set    USE_AVX10, 0            // expand the non-AVX10 (.else) branches of the macro
     // AES-NI + AVX implementation.  The argument passed to _aes_xts_crypt is the
     // macro's \enc flag: 1 generates the encryption function and 0 generates
     // the decryption function.  VL and USE_AVX10 are read when the macro is
     // expanded, so they must be set before each pair of functions.
 811 SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx)
 812         _aes_xts_crypt  1
 813 SYM_FUNC_END(aes_xts_encrypt_aesni_avx)
 814 SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx)
 815         _aes_xts_crypt  0
 816 SYM_FUNC_END(aes_xts_decrypt_aesni_avx)
 817
 818 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
     // The three implementations below are only assembled when both
     // CONFIG_AS_VAES and CONFIG_AS_VPCLMULQDQ are defined.  Each pair of
     // functions is an expansion of _aes_xts_crypt (1 = encrypt, 0 = decrypt)
     // using the VL and USE_AVX10 values assigned immediately before it.

     // VAES + VPCLMULQDQ + AVX2.
 819 .set    VL, 32                  // 256-bit vectors (2 AES blocks per vector)
 820 .set    USE_AVX10, 0            // expand the non-AVX10 (.else) branches of the macro
 821 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2)
 822         _aes_xts_crypt  1
 823 SYM_FUNC_END(aes_xts_encrypt_vaes_avx2)
 824 SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2)
 825         _aes_xts_crypt  0
 826 SYM_FUNC_END(aes_xts_decrypt_vaes_avx2)
 827
     // VAES + VPCLMULQDQ + AVX10/256.
 828 .set    VL, 32                  // 256-bit vectors (2 AES blocks per vector)
 829 .set    USE_AVX10, 1            // expand the AVX10 branches (e.g. masked vmovdqu8 in the CTS path)
 830 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256)
 831         _aes_xts_crypt  1
 832 SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
 833 SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
 834         _aes_xts_crypt  0
 835 SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
 836
     // VAES + VPCLMULQDQ + AVX10/512.
 837 .set    VL, 64                  // 512-bit vectors (4 AES blocks per vector)
 838 .set    USE_AVX10, 1            // expand the AVX10 branches (e.g. masked vmovdqu8 in the CTS path)
 839 SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
 840         _aes_xts_crypt  1
 841 SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
 842 SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
 843         _aes_xts_crypt  0
 844 SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
 845 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */