/* SPDX-License-Identifier: GPL-2.0-only */
 * aes-ce-core.S - AES in ECB/CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
#include <linux/linkage.h>
#include <asm/assembler.h>
	.fpu		crypto-neon-fp-armv8
	.macro		enc_round, state, key
	aesmc.8		\state, \state
	.macro		dec_round, state, key
	aesimc.8	\state, \state
	.macro		enc_dround, key1, key2
	.macro		dec_dround, key1, key2
	.macro		enc_fround, key1, key2, key3
	.macro		dec_fround, key1, key2, key3
	.macro		enc_dround_4x, key1, key2
	.macro		dec_dround_4x, key1, key2
	.macro		enc_fround_4x, key1, key2, key3
	.macro		dec_fround_4x, key1, key2, key3
	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.32		{q10-q11}, [ip]!
	vld1.32		{q12-q13}, [ip]!
	vld1.32		{q10-q11}, [ip]!
	vld1.32		{q12-q13}, [ip]!
	blo		0f			@ AES-128: 10 rounds
	vld1.32		{q10-q11}, [ip]!
	beq		1f			@ AES-192: 12 rounds
	vld1.32		{q12-q13}, [ip]
0:	\fround		q12, q13, q14
1:	\fround		q10, q11, q14
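	@ A reminder on the dispatch above: r3 carries the round count, which
	@ is 10, 12 or 14 for 128, 192 or 256 bit keys respectively, so the
	@ comparison with #12 decides how many of the remaining round keys
	@ still need to be loaded before the final-round macro is invoked.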
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_4x version only)
	 *   q2        : third in/output block (_4x version only)
	 *   q3        : fourth in/output block (_4x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   q14       : final round key
	 *   r2        : address of round key array
	 *   r3        : number of rounds
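	/*
	 * For reference, the single-block transform performed here is roughly
	 * equivalent to the following C sketch using the ACLE crypto
	 * intrinsics from <arm_neon.h> (illustrative only, not the code that
	 * is assembled; rk points to rounds + 1 round keys):
	 *
	 *	uint8x16_t aes_encrypt_sketch(uint8x16_t block,
	 *				      const uint8x16_t *rk, int rounds)
	 *	{
	 *		int i;
	 *
	 *		for (i = 0; i < rounds - 1; i++)
	 *			block = vaesmcq_u8(vaeseq_u8(block, rk[i]));
	 *		block = vaeseq_u8(block, rk[rounds - 1]);
	 *		return veorq_u8(block, rk[rounds]);
	 *	}
	 */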
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround, enc_fround
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_4x, enc_fround_4x
ENDPROC(aes_encrypt_4x)
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_4x, dec_fround_4x
ENDPROC(aes_decrypt_4x)
	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.32		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.32		{q14}, [ip]		@ load last round key
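	@ The key schedule holds (\rounds + 1) round keys of 16 bytes each, so
	@ \rk + (\rounds << 4) computed above is the address of the last one.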
	 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
	 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks)
ENTRY(ce_aes_ecb_encrypt)
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
ENDPROC(ce_aes_ecb_encrypt)
ENTRY(ce_aes_ecb_decrypt)
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
ENDPROC(ce_aes_ecb_decrypt)
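	/*
	 * These routines use the NEON register file, so the C glue code is
	 * expected to call them inside a kernel_neon_begin()/kernel_neon_end()
	 * section, along the lines of the sketch below (illustrative only;
	 * 'ctx' and num_rounds() are stand-ins for whatever the caller uses
	 * to hold the expanded key):
	 *
	 *	kernel_neon_begin();
	 *	ce_aes_ecb_encrypt(dst, src, ctx->key_enc, num_rounds(ctx), blocks);
	 *	kernel_neon_end();
	 */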
	 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
	 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 iv[])
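	/*
	 * CBC chains every plaintext block through the previous ciphertext
	 * block (or the iv for the first one). Rough C sketch of what the
	 * encrypt path computes, one 16-byte block per iteration, using
	 * hypothetical helper names:
	 *
	 *	for (i = 0; i < blocks; i++) {
	 *		xor_block(tmp, in + 16 * i, iv);
	 *		aes_encrypt_block(out + 16 * i, tmp, rk, rounds);
	 *		memcpy(iv, out + 16 * i, 16);
	 *	}
	 */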
ENTRY(ce_aes_cbc_encrypt)
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q1}, [r1]!		@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
ENDPROC(ce_aes_cbc_encrypt)
ENTRY(ce_aes_cbc_decrypt)
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q15}, [r5]		@ keep iv in q15
	vld1.8		{q0-q1}, [r1]!
	vld1.8		{q2-q3}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	vmov		q6, q14			@ preserve last round key
	vld1.8		{q0}, [r1]!		@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vst1.8		{q15}, [r5]		@ return iv
ENDPROC(ce_aes_cbc_decrypt)
	 * ce_aes_cbc_cts_encrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
	 * ce_aes_cbc_cts_decrypt(u8 out[], u8 const in[], u32 const rk[],
	 *			  int rounds, int bytes, u8 const iv[])
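	/*
	 * Ciphertext stealing (the CS3 variant used by the kernel's "cts"
	 * template) lets CBC handle a message whose length is not a multiple
	 * of 16: the final short block is zero padded and encrypted as one
	 * more CBC block, and the last two ciphertext blocks are then emitted
	 * in swapped order, with the penultimate one truncated to the length
	 * of the partial block. Rough sketch of the encrypt case, assuming
	 * 16 < bytes <= 32 and a hypothetical single block helper:
	 *
	 *	u8 e[16], c[16], pad[16] = { };
	 *
	 *	cbc_encrypt_block(e, in, iv, rk, rounds);	// E(P0 ^ iv)
	 *	memcpy(pad, in + 16, bytes - 16);		// zero padded P1
	 *	cbc_encrypt_block(c, pad, e, rk, rounds);	// E(pad ^ e)
	 *	memcpy(out, c, 16);				// full block first
	 *	memcpy(out + 16, e, bytes - 16);		// truncated block
	 */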
ENTRY(ce_aes_cbc_cts_encrypt)
	ldrd		r4, r5, [sp, #16]
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q1}, [r5]		@ get iv
	veor		q0, q0, q1		@ xor with iv
	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbl.8		d2, {d6-d7}, d12
	vtbl.8		d3, {d6-d7}, d13
	vst1.8		{q2}, [r4]		@ overlapping stores
ENDPROC(ce_aes_cbc_cts_encrypt)
ENTRY(ce_aes_cbc_cts_decrypt)
	ldrd		r4, r5, [sp, #16]
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	vld1.8		{q0}, [r1]		@ overlapping loads
	vld1.8		{q3}, [r5]		@ get iv
	vtbl.8		d4, {d0-d1}, d10
	vtbl.8		d5, {d0-d1}, d11
	vtbx.8		d0, {d2-d3}, d12
	vtbx.8		d1, {d2-d3}, d13
	veor		q0, q0, q3		@ xor with iv
	vst1.8		{q1}, [r4]		@ overlapping stores
ENDPROC(ce_aes_cbc_cts_decrypt)
	 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u32 const rk[], int rounds,
	 *		      int blocks, u8 ctr[])
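	/*
	 * CTR mode turns the block cipher into a stream cipher: each 16 byte
	 * slice of keystream is the encryption of the current counter value,
	 * which is then incremented as a 128-bit big-endian integer. Rough
	 * per-block sketch with hypothetical helper names (the code below
	 * actually interleaves four blocks at a time):
	 *
	 *	for (i = 0; i < blocks; i++) {
	 *		aes_encrypt_block(ks, ctr, rk, rounds);
	 *		xor_block(out + 16 * i, in + 16 * i, ks);
	 *		be128_inc(ctr);
	 *	}
	 */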
ENTRY(ce_aes_ctr_encrypt)
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q7}, [r5]		@ load ctr
	vmov		r6, s31			@ keep swabbed ctr in r6
	cmn		r6, r4			@ 32 bit overflow?
	 * NOTE: the sequence below has been carefully tweaked to avoid
	 * a silicon erratum that exists in Cortex-A57 (#1742098) and
	 * Cortex-A72 (#1655431) cores, where AESE/AESMC instruction pairs
	 * may produce an incorrect result if they take their input from a
	 * register of which a single 32-bit lane has been updated the last
	 * time it was modified. To work around this, the lanes of registers
	 * q0-q3 below are not manipulated individually, and the different
	 * counter values are prepared by successive manipulations of q7.
	vmov		s31, ip			@ set lane 3 of q1 via q7
	vmov		s31, lr			@ set lane 3 of q2 via q7
	vmov		s31, ip			@ set lane 3 of q3 via q7
	vld1.8		{q4-q5}, [r1]!
	vst1.8		{q0-q1}, [r0]!
	vst1.8		{q2-q3}, [r0]!
	adds		r6, r6, #1		@ increment BE ctr
	bmi		.Lctrtailblock		@ blocks < 0 means tail block
	vst1.8		{q7}, [r5]		@ return next CTR value
	vst1.8		{q0}, [r0, :64]		@ return the key stream
	.irp		sreg, s30, s29, s28
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
ENDPROC(ce_aes_ctr_encrypt)
	 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
	 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u32 const rk1[], int rounds,
	 *		      int bytes, u8 iv[], u32 const rk2[], int first)
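	/*
	 * XTS processes each block as E_K1(P ^ T) ^ T, where the tweak T is
	 * E_K2(iv) for the first block and is multiplied by x in GF(2^128)
	 * for every subsequent block. Rough sketch of the encrypt direction
	 * with hypothetical helper names:
	 *
	 *	aes_encrypt_block(t, iv, rk2, rounds);	// initial tweak
	 *	for (i = 0; i < blocks; i++) {
	 *		xor_block(buf, in + 16 * i, t);
	 *		aes_encrypt_block(buf, buf, rk1, rounds);
	 *		xor_block(out + 16 * i, buf, t);
	 *		next_tweak(t);			// multiply by x
	 *	}
	 */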
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
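	/*
	 * The macro above multiplies the tweak by x in GF(2^128), reduced by
	 * x^128 + x^7 + x^2 + x + 1: shift the 128-bit value left by one bit
	 * and xor 0x87 into the low byte if a bit was shifted out. A C sketch
	 * on two 64-bit halves (little-endian block layout, hypothetical name):
	 *
	 *	void next_tweak(u64 t[2])
	 *	{
	 *		u64 carry = (u64)((s64)t[1] >> 63);
	 *
	 *		t[1] = (t[1] << 1) | (t[0] >> 63);
	 *		t[0] = (t[0] << 1) ^ (carry & 0x87);
	 *	}
	 */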
	vmov.i32	d30, #0x87		@ compose tweak mask vector
	vshr.u64	d30, d31, #7
	ldrd		r4, r5, [sp, #16]	@ load args
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
ENTRY(ce_aes_xts_encrypt)
	bl		ce_aes_xts_init		@ run shared prologue
	teq		r6, #0			@ start of a block?
	next_tweak	q4, q4, q15, q10
	vld1.8		{q0-q1}, [r1]!		@ get 4 pt blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	next_tweak	q6, q5, q15, q10
	next_tweak	q7, q6, q15, q10
	vst1.8		{q0-q1}, [r0]!		@ write 4 ct blocks
	vst1.8		{q2-q3}, [r0]!
	next_tweak	q4, q4, q15, q6
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		r4, r0, r4		@ output address of final block
	vld1.8		{q1}, [r1]		@ load final partial block
	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7
	vst1.8		{q2}, [r4]		@ overlapping stores
ENDPROC(ce_aes_xts_encrypt)
ENTRY(ce_aes_xts_decrypt)
	bl		ce_aes_xts_init		@ run shared prologue
	/* subtract 16 bytes if we are doing CTS */
	teq		r6, #0			@ start of a block?
	next_tweak	q4, q4, q15, q10
	vld1.8		{q0-q1}, [r1]!		@ get 4 ct blocks
	vld1.8		{q2-q3}, [r1]!
	next_tweak	q5, q4, q15, q10
	next_tweak	q6, q5, q15, q10
	next_tweak	q7, q6, q15, q10
	vst1.8		{q0-q1}, [r0]!		@ write 4 pt blocks
	vst1.8		{q2-q3}, [r0]!
	next_tweak	q4, q4, q15, q6
	movw		ip, :lower16:.Lcts_permute_table
	movt		ip, :upper16:.Lcts_permute_table
	add		r1, r1, r4		@ rewind input pointer
	add		r4, r4, #16		@ # bytes in final block
	add		r4, r0, r4		@ output address of final block
	next_tweak	q5, q4, q15, q6
	vld1.8		{q1}, [r1]		@ load final partial block
	vtbl.8		d4, {d0-d1}, d4
	vtbl.8		d5, {d0-d1}, d5
	vtbx.8		d0, {d2-d3}, d6
	vtbx.8		d1, {d2-d3}, d7
	vst1.8		{q2}, [r4]		@ overlapping stores
ENDPROC(ce_aes_xts_decrypt)
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
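	/*
	 * aese with an all-zero round key computes ShiftRows(SubBytes(state)),
	 * and broadcasting the input word to all four lanes makes the
	 * ShiftRows permutation invisible in lane 0, so lane 0 of the result
	 * is simply the input with the sbox applied to each byte. Intrinsics
	 * sketch (<arm_neon.h>, illustrative only):
	 *
	 *	u32 ce_aes_sub_sketch(u32 input)
	 *	{
	 *		uint8x16_t v = vreinterpretq_u8_u32(vdupq_n_u32(input));
	 *
	 *		v = vaeseq_u8(v, vdupq_n_u8(0));
	 *		return vgetq_lane_u32(vreinterpretq_u32_u8(v), 0);
	 *	}
	 */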
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
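	/*
	 * This is the aesimc instruction applied to one round key, as used
	 * when converting an encryption key schedule into the decryption
	 * schedule for the equivalent inverse cipher. Intrinsics sketch
	 * (<arm_neon.h>, illustrative only):
	 *
	 *	void ce_aes_invert_sketch(u8 *dst, const u8 *src)
	 *	{
	 *		vst1q_u8(dst, vaesimcq_u8(vld1q_u8(src)));
	 *	}
	 */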
ENDPROC(ce_aes_invert)
	.section	".rodata", "a"
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte		0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
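	/*
	 * The 0xff entries yield zero bytes when used as vtbl indices (an out
	 * of range index produces 0, while vtbx leaves the destination byte
	 * unchanged), so loading a 16 byte window from this table at an
	 * offset derived from the residual length produces the byte-select
	 * masks used for the overlapping loads and stores above.
	 */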