/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction. The alternatives are, more or less:
 *
 * (a)  vshl.u32 + vsri.u32		(needs temporary register)
 * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 * (c)  vrev32.16			(16-bit rotations only)
 * (d)  vtbl.8 + vtbl.8			(rotations by multiples of 8 bits only,
 *					 needs an index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
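 *
 * For example, rotating each 32-bit lane of q1 left by 12 with option (a)
 * looks roughly like this (illustrative only; the register allocation used
 * in the rounds below differs):
 *
 *	vshl.u32	q4, q1, #12	// q4 = q1 << 12
 *	vsri.u32	q4, q1, #20	// q4 |= q1 >> 20, i.e. rotl32(q1, 12)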
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
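 *
 * That is, a single instruction such as
 *
 *	vrev32.16	q3, q3		// swap the 16-bit halves of each word
 *
 * is exactly rotl32(x, 16) on every 32-bit lane.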
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri. Also, it
 * parallelizes better when temporary registers are scarce.
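 *
 * Roughly, with d10 holding the index vector from .Lrol8_table below, the
 * sequence looks like this (illustrative only):
 *
 *	vtbl.8		d6, {d6}, d10	// rotate both words in d6 left by 8
 *	vtbl.8		d7, {d7}, d10	// rotate both words in d7 left by 8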
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */

#include <linux/linkage.h>
#include <asm/cache.h>

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
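
	// For reference, each loop iteration below applies the ChaCha
	// quarter-round
	//
	//	a += b; d = rotl32(d ^ a, 16);
	//	c += d; b = rotl32(b ^ c, 12);
	//	a += b; d = rotl32(d ^ a, 8);
	//	c += d; b = rotl32(b ^ c, 7);
	//
	// to all four columns at once (q0-q3 holding a, b, c, d), then shuffles
	// the words so the same code covers the diagonals.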
	vld1.8		{d10}, [ip, :64]

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
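	//
	// In C-like terms (illustrative only), one 64-byte block is produced
	// as
	//
	//	x = s; chacha_permute(x);
	//	for (n = 0; n < 16; n++)
	//		o[n] = i[n] ^ (x[n] + s[n]);
	//
	// with i and o viewed as arrays of 16 little-endian words.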
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	// o0 = i0 ^ (x0 + s0)
	// o1 = i1 ^ (x1 + s1)
	// o2 = i2 ^ (x2 + s2)
	// o3 = i3 ^ (x3 + s3)

ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
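	//
	// HChaCha runs the same permutation as a full block but omits the
	// final addition of the original state; the eight words written to r1
	// are words 0-3 and 12-15 of the permuted state.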
	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

ENDPROC(hchacha_block_neon)

.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
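
	// .Lrol8_table above is the vtbl.8 index vector for rotating each
	// 32-bit lane left by 8 bits: destination byte n is taken from source
	// byte table[n], so { 3, 0, 1, 2 } maps bytes (b0 b1 b2 b3) of a
	// little-endian word to (b3 b0 b1 b2), i.e. rotl32(x, 8).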

ENTRY(chacha_4block_xor_neon)
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. The words are
	// re-interleaved before the final addition of the original state and
	// the XORing step.
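	//
	// Illustrative C-like sketch of that layout (not literal code):
	//
	//	u32 x[16][4];			// x[n][b] = word n of block b
	//	for (n = 0; n < 16; n++)
	//		for (b = 0; b < 4; b++)
	//			x[n][b] = s[n];
	//	for (b = 0; b < 4; b++)
	//		x[12][b] += b;		// per-block counter values 0-3
	//
	// Each x[n][0..3] is kept in one NEON q register (with x8/x9 spilled
	// to the 32-byte stack buffer during the rounds), so every vector
	// instruction advances all four blocks at once.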

	// x0..15[0-3] = s0..15[0-3]
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	vld1.32		{q4}, [lr, :128]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12

	vst1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12

	vst1.32		{q8-q9}, [sp, :256]

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	vld1.32		{q8}, [lr, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	vadd.u32	q12, q8			// x12 += counter values 0-3
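	//
	// After each vzip.32 above, the register pair holds two words from
	// two blocks, e.g. q0 = { x0, x1 of block 0, x0, x1 of block 1 } and
	// q1 = { x0, x1 of block 2, x0, x1 of block 3 }, which is what the
	// "(0 1 0 1)" annotations mean.  Re-adding the counter values to x12
	// here makes the final feed-forward addition of s12 below include the
	// per-block counters.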

	vld1.32		{q8-q9}, [r0]!		// load s0..7

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32		{q8-q9}, [sp, :256]
	mov		sp, r4			// restore original stack pointer
	ldr		r4, [r4, #8]		// load number of bytes
	vzip.32		q12, q13		// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15		// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9			// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11		// => (10 11 10 11) (10 11 10 11)
	vld1.32		{q0-q1}, [r0]		// load s8..15

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q10, q10, q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream
	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	// Process the final block if processing less than 4 full blocks.
	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
	// previous 32 byte output block that still needs to be written at

	adr		lr, .Lpermute + 32

	vtbl.8		d4, {q4-q5}, d4
	vtbl.8		d5, {q4-q5}, d5
	vtbl.8		d6, {q4-q5}, d6
	vtbl.8		d7, {q4-q5}, d7

	vst1.8		{q6-q7}, [r4]		// overlapping stores

ENDPROC(chacha_4block_xor_neon)

	.align		L1_CACHE_SHIFT
.Lpermute:
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
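
	// .Lpermute holds two consecutive copies of the byte indices
	// 0x00-0x1f, so a 32-byte window loaded at some offset into the table
	// forms a vtbl.8 index vector that byte-rotates a 32-byte source.
	// The partial-block path above uses such a window (see the
	// "adr lr, .Lpermute + 32" and the vtbl.8 lookups on q4-q5) so that a
	// pair of overlapping 32-byte stores can cover the remaining bytes
	// without a byte-at-a-time loop.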