arch/arm/crypto/chacha-neon-core.S

   1 /*
   2  * ChaCha/XChaCha NEON helper functions
   3  *
   4  * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 as
   8  * published by the Free Software Foundation.
   9  *
  10  * Based on:
  11  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
  12  *
  13  * Copyright (C) 2015 Martin Willi
  14  *
  15  * This program is free software; you can redistribute it and/or modify
  16  * it under the terms of the GNU General Public License as published by
  17  * the Free Software Foundation; either version 2 of the License, or
  18  * (at your option) any later version.
  19  */
  20
  21  /*
  22   * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
  23   *
  24   * (a)  vshl.u32 + vsri.u32            (needs temporary register)
  25   * (b)  vshl.u32 + vshr.u32 + vorr     (needs temporary register)
  26   * (c)  vrev32.16                      (16-bit rotations only)
  27   * (d)  vtbl.8 + vtbl.8                (multiple of 8 bits rotations only,
  28   *                                      needs index vector)
  29   *
  30   * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
  31   * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
  32   * cycles of (b) on both Cortex-A7 and Cortex-A53.
  33   *
  34   * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
  35   * and doesn't need a temporary register.
  36   *
  37   * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
  38   * is twice as fast as (a), even when doing (a) on multiple registers
  39   * simultaneously to eliminate the stall between vshl and vsri.  Also, it
  40   * parallelizes better when temporary registers are scarce.
  41   *
  42   * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
  43   * (a), so the need to load the rotation table actually makes the vtbl method
  44   * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
  45   * seems to be a good compromise to get a more significant speed boost on some
  46   * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
  47   */
  48
  49 #include <linux/linkage.h>
  50
  51         .text
  52         .fpu            neon
  53         .align          5
  54
  55 /*
  56  * chacha_permute - permute one block
  57  *
  58  * Permute one 64-byte block where the state matrix is stored in the four NEON
  59  * registers q0-q3.  It performs matrix operations on four words in parallel,
  60  * but requires shuffling to rearrange the words after each round.
  61  *
  62  * The round count is given in r3.
  63  *
  64  * Clobbers: r3, ip, q4-q5
  65  */
  66 chacha_permute:
  67
  68         adr             ip, .Lrol8_table
  69         vld1.8          {d10}, [ip, :64]
  70
  71 .Ldoubleround:
  72         // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
  73         vadd.i32        q0, q0, q1
  74         veor            q3, q3, q0
  75         vrev32.16       q3, q3
  76
  77         // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
  78         vadd.i32        q2, q2, q3
  79         veor            q4, q1, q2
  80         vshl.u32        q1, q4, #12
  81         vsri.u32        q1, q4, #20
  82
  83         // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
  84         vadd.i32        q0, q0, q1
  85         veor            q3, q3, q0
  86         vtbl.8          d6, {d6}, d10
  87         vtbl.8          d7, {d7}, d10
  88
  89         // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
  90         vadd.i32        q2, q2, q3
  91         veor            q4, q1, q2
  92         vshl.u32        q1, q4, #7
  93         vsri.u32        q1, q4, #25
  94
  95         // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
  96         vext.8          q1, q1, q1, #4
  97         // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
  98         vext.8          q2, q2, q2, #8
  99         // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
 100         vext.8          q3, q3, q3, #12
 101
 102         // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 103         vadd.i32        q0, q0, q1
 104         veor            q3, q3, q0
 105         vrev32.16       q3, q3
 106
 107         // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
 108         vadd.i32        q2, q2, q3
 109         veor            q4, q1, q2
 110         vshl.u32        q1, q4, #12
 111         vsri.u32        q1, q4, #20
 112
 113         // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
 114         vadd.i32        q0, q0, q1
 115         veor            q3, q3, q0
 116         vtbl.8          d6, {d6}, d10
 117         vtbl.8          d7, {d7}, d10
 118
 119         // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
 120         vadd.i32        q2, q2, q3
 121         veor            q4, q1, q2
 122         vshl.u32        q1, q4, #7
 123         vsri.u32        q1, q4, #25
 124
 125         // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
 126         vext.8          q1, q1, q1, #12
 127         // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
 128         vext.8          q2, q2, q2, #8
 129         // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
 130         vext.8          q3, q3, q3, #4
 131
 132         subs            r3, r3, #2
 133         bne             .Ldoubleround
 134
 135         bx              lr
 136 ENDPROC(chacha_permute)
 137
 138 ENTRY(chacha_block_xor_neon)
 139         // r0: Input state matrix, s
 140         // r1: 1 data block output, o
 141         // r2: 1 data block input, i
 142         // r3: nrounds
 143         push            {lr}
 144
 145         // x0..3 = s0..3
 146         add             ip, r0, #0x20
 147         vld1.32         {q0-q1}, [r0]
 148         vld1.32         {q2-q3}, [ip]
 149
 150         vmov            q8, q0
 151         vmov            q9, q1
 152         vmov            q10, q2
 153         vmov            q11, q3
 154
 155         bl              chacha_permute
 156
 157         add             ip, r2, #0x20
 158         vld1.8          {q4-q5}, [r2]
 159         vld1.8          {q6-q7}, [ip]
 160
 161         // o0 = i0 ^ (x0 + s0)
 162         vadd.i32        q0, q0, q8
 163         veor            q0, q0, q4
 164
 165         // o1 = i1 ^ (x1 + s1)
 166         vadd.i32        q1, q1, q9
 167         veor            q1, q1, q5
 168
 169         // o2 = i2 ^ (x2 + s2)
 170         vadd.i32        q2, q2, q10
 171         veor            q2, q2, q6
 172
 173         // o3 = i3 ^ (x3 + s3)
 174         vadd.i32        q3, q3, q11
 175         veor            q3, q3, q7
 176
 177         add             ip, r1, #0x20
 178         vst1.8          {q0-q1}, [r1]
 179         vst1.8          {q2-q3}, [ip]
 180
 181         pop             {pc}
 182 ENDPROC(chacha_block_xor_neon)
 183
 184 ENTRY(hchacha_block_neon)
 185         // r0: Input state matrix, s
 186         // r1: output (8 32-bit words)
 187         // r2: nrounds
 188         push            {lr}
 189
 190         vld1.32         {q0-q1}, [r0]!
 191         vld1.32         {q2-q3}, [r0]
 192
 193         mov             r3, r2
 194         bl              chacha_permute
 195
 196         vst1.32         {q0}, [r1]!
 197         vst1.32         {q3}, [r1]
 198
 199         pop             {pc}
 200 ENDPROC(hchacha_block_neon)
 201
 202         .align          4
 203 .Lctrinc:       .word   0, 1, 2, 3
 204 .Lrol8_table:   .byte   3, 0, 1, 2, 7, 4, 5, 6
 205
 206         .align          5
 207 ENTRY(chacha_4block_xor_neon)
 208         push            {r4-r5}
 209         mov             r4, sp                  // preserve the stack pointer
 210         sub             ip, sp, #0x20           // allocate a 32 byte buffer
 211         bic             ip, ip, #0x1f           // aligned to 32 bytes
 212         mov             sp, ip
 213
 214         // r0: Input state matrix, s
 215         // r1: 4 data blocks output, o
 216         // r2: 4 data blocks input, i
 217         // r3: nrounds
 218
 219         //
 220         // This function encrypts four consecutive ChaCha blocks by loading
 221         // the state matrix in NEON registers four times. The algorithm performs
 222         // each operation on the corresponding word of each state matrix, hence
 223         // requires no word shuffling. The words are re-interleaved before the
 224         // final addition of the original state and the XORing step.
 225         //
 226
 227         // x0..15[0-3] = s0..15[0-3]
 228         add             ip, r0, #0x20
 229         vld1.32         {q0-q1}, [r0]
 230         vld1.32         {q2-q3}, [ip]
 231
 232         adr             r5, .Lctrinc
 233         vdup.32         q15, d7[1]
 234         vdup.32         q14, d7[0]
 235         vld1.32         {q4}, [r5, :128]
 236         vdup.32         q13, d6[1]
 237         vdup.32         q12, d6[0]
 238         vdup.32         q11, d5[1]
 239         vdup.32         q10, d5[0]
 240         vadd.u32        q12, q12, q4            // x12 += counter values 0-3
 241         vdup.32         q9, d4[1]
 242         vdup.32         q8, d4[0]
 243         vdup.32         q7, d3[1]
 244         vdup.32         q6, d3[0]
 245         vdup.32         q5, d2[1]
 246         vdup.32         q4, d2[0]
 247         vdup.32         q3, d1[1]
 248         vdup.32         q2, d1[0]
 249         vdup.32         q1, d0[1]
 250         vdup.32         q0, d0[0]
 251
 252         adr             ip, .Lrol8_table
 253         b               1f
 254
 255 .Ldoubleround4:
 256         vld1.32         {q8-q9}, [sp, :256]
 257 1:
 258         // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
 259         // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
 260         // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
 261         // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
 262         vadd.i32        q0, q0, q4
 263         vadd.i32        q1, q1, q5
 264         vadd.i32        q2, q2, q6
 265         vadd.i32        q3, q3, q7
 266
 267         veor            q12, q12, q0
 268         veor            q13, q13, q1
 269         veor            q14, q14, q2
 270         veor            q15, q15, q3
 271
 272         vrev32.16       q12, q12
 273         vrev32.16       q13, q13
 274         vrev32.16       q14, q14
 275         vrev32.16       q15, q15
 276
 277         // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
 278         // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
 279         // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
 280         // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
 281         vadd.i32        q8, q8, q12
 282         vadd.i32        q9, q9, q13
 283         vadd.i32        q10, q10, q14
 284         vadd.i32        q11, q11, q15
 285
 286         vst1.32         {q8-q9}, [sp, :256]
 287
 288         veor            q8, q4, q8
 289         veor            q9, q5, q9
 290         vshl.u32        q4, q8, #12
 291         vshl.u32        q5, q9, #12
 292         vsri.u32        q4, q8, #20
 293         vsri.u32        q5, q9, #20
 294
 295         veor            q8, q6, q10
 296         veor            q9, q7, q11
 297         vshl.u32        q6, q8, #12
 298         vshl.u32        q7, q9, #12
 299         vsri.u32        q6, q8, #20
 300         vsri.u32        q7, q9, #20
 301
 302         // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
 303         // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
 304         // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
 305         // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
 306         vld1.8          {d16}, [ip, :64]
 307         vadd.i32        q0, q0, q4
 308         vadd.i32        q1, q1, q5
 309         vadd.i32        q2, q2, q6
 310         vadd.i32        q3, q3, q7
 311
 312         veor            q12, q12, q0
 313         veor            q13, q13, q1
 314         veor            q14, q14, q2
 315         veor            q15, q15, q3
 316
 317         vtbl.8          d24, {d24}, d16
 318         vtbl.8          d25, {d25}, d16
 319         vtbl.8          d26, {d26}, d16
 320         vtbl.8          d27, {d27}, d16
 321         vtbl.8          d28, {d28}, d16
 322         vtbl.8          d29, {d29}, d16
 323         vtbl.8          d30, {d30}, d16
 324         vtbl.8          d31, {d31}, d16
 325
 326         vld1.32         {q8-q9}, [sp, :256]
 327
 328         // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
 329         // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
 330         // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
 331         // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
 332         vadd.i32        q8, q8, q12
 333         vadd.i32        q9, q9, q13
 334         vadd.i32        q10, q10, q14
 335         vadd.i32        q11, q11, q15
 336
 337         vst1.32         {q8-q9}, [sp, :256]
 338
 339         veor            q8, q4, q8
 340         veor            q9, q5, q9
 341         vshl.u32        q4, q8, #7
 342         vshl.u32        q5, q9, #7
 343         vsri.u32        q4, q8, #25
 344         vsri.u32        q5, q9, #25
 345
 346         veor            q8, q6, q10
 347         veor            q9, q7, q11
 348         vshl.u32        q6, q8, #7
 349         vshl.u32        q7, q9, #7
 350         vsri.u32        q6, q8, #25
 351         vsri.u32        q7, q9, #25
 352
 353         vld1.32         {q8-q9}, [sp, :256]
 354
 355         // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
 356         // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
 357         // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
 358         // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
 359         vadd.i32        q0, q0, q5
 360         vadd.i32        q1, q1, q6
 361         vadd.i32        q2, q2, q7
 362         vadd.i32        q3, q3, q4
 363
 364         veor            q15, q15, q0
 365         veor            q12, q12, q1
 366         veor            q13, q13, q2
 367         veor            q14, q14, q3
 368
 369         vrev32.16       q15, q15
 370         vrev32.16       q12, q12
 371         vrev32.16       q13, q13
 372         vrev32.16       q14, q14
 373
 374         // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
 375         // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
 376         // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
 377         // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
 378         vadd.i32        q10, q10, q15
 379         vadd.i32        q11, q11, q12
 380         vadd.i32        q8, q8, q13
 381         vadd.i32        q9, q9, q14
 382
 383         vst1.32         {q8-q9}, [sp, :256]
 384
 385         veor            q8, q7, q8
 386         veor            q9, q4, q9
 387         vshl.u32        q7, q8, #12
 388         vshl.u32        q4, q9, #12
 389         vsri.u32        q7, q8, #20
 390         vsri.u32        q4, q9, #20
 391
 392         veor            q8, q5, q10
 393         veor            q9, q6, q11
 394         vshl.u32        q5, q8, #12
 395         vshl.u32        q6, q9, #12
 396         vsri.u32        q5, q8, #20
 397         vsri.u32        q6, q9, #20
 398
 399         // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
 400         // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
 401         // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
 402         // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
 403         vld1.8          {d16}, [ip, :64]
 404         vadd.i32        q0, q0, q5
 405         vadd.i32        q1, q1, q6
 406         vadd.i32        q2, q2, q7
 407         vadd.i32        q3, q3, q4
 408
 409         veor            q15, q15, q0
 410         veor            q12, q12, q1
 411         veor            q13, q13, q2
 412         veor            q14, q14, q3
 413
 414         vtbl.8          d30, {d30}, d16
 415         vtbl.8          d31, {d31}, d16
 416         vtbl.8          d24, {d24}, d16
 417         vtbl.8          d25, {d25}, d16
 418         vtbl.8          d26, {d26}, d16
 419         vtbl.8          d27, {d27}, d16
 420         vtbl.8          d28, {d28}, d16
 421         vtbl.8          d29, {d29}, d16
 422
 423         vld1.32         {q8-q9}, [sp, :256]
 424
 425         // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
 426         // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
 427         // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
 428         // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
 429         vadd.i32        q10, q10, q15
 430         vadd.i32        q11, q11, q12
 431         vadd.i32        q8, q8, q13
 432         vadd.i32        q9, q9, q14
 433
 434         vst1.32         {q8-q9}, [sp, :256]
 435
 436         veor            q8, q7, q8
 437         veor            q9, q4, q9
 438         vshl.u32        q7, q8, #7
 439         vshl.u32        q4, q9, #7
 440         vsri.u32        q7, q8, #25
 441         vsri.u32        q4, q9, #25
 442
 443         veor            q8, q5, q10
 444         veor            q9, q6, q11
 445         vshl.u32        q5, q8, #7
 446         vshl.u32        q6, q9, #7
 447         vsri.u32        q5, q8, #25
 448         vsri.u32        q6, q9, #25
 449
 450         subs            r3, r3, #2
 451         bne             .Ldoubleround4
 452
 453         // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
 454         // x8..9[0-3] are on the stack.
 455
 456         // Re-interleave the words in the first two rows of each block (x0..7).
 457         // Also add the counter values 0-3 to x12[0-3].
 458           vld1.32       {q8}, [r5, :128]        // load counter values 0-3
 459         vzip.32         q0, q1                  // => (0 1 0 1) (0 1 0 1)
 460         vzip.32         q2, q3                  // => (2 3 2 3) (2 3 2 3)
 461         vzip.32         q4, q5                  // => (4 5 4 5) (4 5 4 5)
 462         vzip.32         q6, q7                  // => (6 7 6 7) (6 7 6 7)
 463           vadd.u32      q12, q8                 // x12 += counter values 0-3
 464         vswp            d1, d4
 465         vswp            d3, d6
 466           vld1.32       {q8-q9}, [r0]!          // load s0..7
 467         vswp            d9, d12
 468         vswp            d11, d14
 469
 470         // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
 471         // after XORing the first 32 bytes.
 472         vswp            q1, q4
 473
 474         // First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)
 475
 476         // x0..3[0-3] += s0..3[0-3]     (add orig state to 1st row of each block)
 477         vadd.u32        q0, q0, q8
 478         vadd.u32        q2, q2, q8
 479         vadd.u32        q4, q4, q8
 480         vadd.u32        q3, q3, q8
 481
 482         // x4..7[0-3] += s4..7[0-3]     (add orig state to 2nd row of each block)
 483         vadd.u32        q1, q1, q9
 484         vadd.u32        q6, q6, q9
 485         vadd.u32        q5, q5, q9
 486         vadd.u32        q7, q7, q9
 487
 488         // XOR first 32 bytes using keystream from first two rows of first block
 489         vld1.8          {q8-q9}, [r2]!
 490         veor            q8, q8, q0
 491         veor            q9, q9, q1
 492         vst1.8          {q8-q9}, [r1]!
 493
 494         // Re-interleave the words in the last two rows of each block (x8..15).
 495         vld1.32         {q8-q9}, [sp, :256]
 496         vzip.32         q12, q13        // => (12 13 12 13) (12 13 12 13)
 497         vzip.32         q14, q15        // => (14 15 14 15) (14 15 14 15)
 498         vzip.32         q8, q9          // => (8 9 8 9) (8 9 8 9)
 499         vzip.32         q10, q11        // => (10 11 10 11) (10 11 10 11)
 500           vld1.32       {q0-q1}, [r0]   // load s8..15
 501         vswp            d25, d28
 502         vswp            d27, d30
 503         vswp            d17, d20
 504         vswp            d19, d22
 505
 506         // Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)
 507
 508         // x8..11[0-3] += s8..11[0-3]   (add orig state to 3rd row of each block)
 509         vadd.u32        q8,  q8,  q0
 510         vadd.u32        q10, q10, q0
 511         vadd.u32        q9,  q9,  q0
 512         vadd.u32        q11, q11, q0
 513
 514         // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
 515         vadd.u32        q12, q12, q1
 516         vadd.u32        q14, q14, q1
 517         vadd.u32        q13, q13, q1
 518         vadd.u32        q15, q15, q1
 519
 520         // XOR the rest of the data with the keystream
 521
 522         vld1.8          {q0-q1}, [r2]!
 523         veor            q0, q0, q8
 524         veor            q1, q1, q12
 525         vst1.8          {q0-q1}, [r1]!
 526
 527         vld1.8          {q0-q1}, [r2]!
 528         veor            q0, q0, q2
 529         veor            q1, q1, q6
 530         vst1.8          {q0-q1}, [r1]!
 531
 532         vld1.8          {q0-q1}, [r2]!
 533         veor            q0, q0, q10
 534         veor            q1, q1, q14
 535         vst1.8          {q0-q1}, [r1]!
 536
 537         vld1.8          {q0-q1}, [r2]!
 538         veor            q0, q0, q4
 539         veor            q1, q1, q5
 540         vst1.8          {q0-q1}, [r1]!
 541
 542         vld1.8          {q0-q1}, [r2]!
 543         veor            q0, q0, q9
 544         veor            q1, q1, q13
 545         vst1.8          {q0-q1}, [r1]!
 546
 547         vld1.8          {q0-q1}, [r2]!
 548         veor            q0, q0, q3
 549         veor            q1, q1, q7
 550         vst1.8          {q0-q1}, [r1]!
 551
 552         vld1.8          {q0-q1}, [r2]
 553           mov           sp, r4          // restore original stack pointer
 554         veor            q0, q0, q11
 555         veor            q1, q1, q15
 556         vst1.8          {q0-q1}, [r1]
 557
 558         pop             {r4-r5}
 559         bx              lr
 560 ENDPROC(chacha_4block_xor_neon)