/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
ENTRY(chacha20_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i

	// This function encrypts one ChaCha20 block by loading the state
	// matrix into four NEON registers. It performs the matrix operations
	// on four words in parallel, but requires shuffling to rearrange the
	// words after each round.

	adr		x3, ROT8
	ld1		{v12.4s}, [x3]
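	// For reference, the RFC7539 quarter-round on 32-bit words is:
	//
	//	a += b; d ^= a; d = rotl32(d, 16);
	//	c += d; b ^= c; b = rotl32(b, 12);
	//	a += b; d ^= a; d = rotl32(d, 8);
	//	c += d; b ^= c; b = rotl32(b, 7);
	//
	// Below, a/b/c/d are whole state rows (v0-v3), so four quarter-rounds
	// run in parallel. Each rotation amount uses the cheapest NEON idiom:
	// rev32 for 16, tbl with the ROT8 table (in v12) for 8, and a shl/sri
	// pair for 12 and 7.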
	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]
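	// v8-v11 keep an untouched copy of the initial state; it is added back
	// into the working state after the rounds, as the ChaCha20 block
	// function requires.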
	mov		x3, #10

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12
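	// The ext shuffles above rotate rows x1-x3 so that the next four
	// quarter-rounds operate on the diagonals of the state matrix.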
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		x3, x3, #1
	b.ne		.Ldoubleround
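	// Ten double rounds (20 ChaCha20 rounds) are now complete. Generate
	// the output block: add the saved initial state, then XOR with the
	// input.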
	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ret
ENDPROC(chacha20_block_xor_neon)
ENTRY(chacha20_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i

	// This function encrypts four consecutive ChaCha20 blocks by loading
	// the state matrix into NEON registers four times. It performs each
	// operation on the corresponding word of each state matrix in
	// parallel, hence no word shuffling is required. For the final XOR
	// step we transpose the matrix by interleaving 32-bit and then 64-bit
	// words, which allows the XOR to be done in NEON registers.
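	// Register layout during the rounds, one register per state word and
	// one lane per block:
	//
	//	v0.s[n] = word 0 of block n, ..., v15.s[n] = word 15 of block n
	//
	// so the scalar quarter-round sequence maps directly onto vector
	// instructions with no lane shuffling.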
	adr		x3, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x3]
	// x0..15[0-3] = s0..3[0..3]
	mov		x4, x0
	ld4r		{ v0.4s- v3.4s}, [x4], #16
	ld4r		{ v4.4s- v7.4s}, [x4], #16
	ld4r		{ v8.4s-v11.4s}, [x4], #16
	ld4r		{v12.4s-v15.4s}, [x4]
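	// ld4r loads four consecutive 32-bit words and replicates each one
	// across all lanes of its destination register, so v0-v15 now hold
	// s0-s15 broadcast into every lane.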
	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s
	mov		x3, #10

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h
	rev32		v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	sri		v5.4s, v17.4s, #20
	sri		v6.4s, v18.4s, #20
	sri		v7.4s, v19.4s, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b
	tbl		v15.16b, {v15.16b}, v31.16b
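	// v31 still holds the ROT8 byte-permutation table loaded at entry, so
	// each tbl above rotates every 32-bit lane left by 8 bits.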
	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	sri		v5.4s, v17.4s, #25
	sri		v6.4s, v18.4s, #25
	sri		v7.4s, v19.4s, #25
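	// Second half of the double round: the same quarter-round, now applied
	// to the diagonals (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13)
	// and (x3,x4,x9,x14).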
	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	rev32		v15.8h, v15.8h
	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	sri		v6.4s, v17.4s, #20
	sri		v7.4s, v18.4s, #20
	sri		v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	tbl		v15.16b, {v15.16b}, v31.16b
	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	sri		v6.4s, v17.4s, #25
	sri		v7.4s, v18.4s, #25
	sri		v4.4s, v19.4s, #25

	subs		x3, x3, #1
	b.ne		.Ldoubleround4
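	// The 20 rounds are done. Reload the initial state, again broadcast
	// with ld4r, and add it to the working state before transposing and
	// XORing with the input blocks.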
	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s
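	// CTRINC is added a second time here because the per-block initial
	// state that must be added back differs only in its counter word:
	// block n started from counter s12 + n.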
	// x0..3[0-3] += s0[0..3]
	add		v0.4s, v0.4s, v16.4s
	add		v1.4s, v1.4s, v17.4s
	add		v2.4s, v2.4s, v18.4s
	add		v3.4s, v3.4s, v19.4s

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4..7[0-3] += s1[0..3]
	add		v4.4s, v4.4s, v20.4s
	add		v5.4s, v5.4s, v21.4s
	add		v6.4s, v6.4s, v22.4s
	add		v7.4s, v7.4s, v23.4s

	// x8..11[0-3] += s2[0..3]
	add		v8.4s, v8.4s, v24.4s
	add		v9.4s, v9.4s, v25.4s
	add		v10.4s, v10.4s, v26.4s
	add		v11.4s, v11.4s, v27.4s

	// x12..15[0-3] += s3[0..3]
	add		v12.4s, v12.4s, v28.4s
	add		v13.4s, v13.4s, v29.4s
	add		v14.4s, v14.4s, v30.4s
	add		v15.4s, v15.4s, v31.4s
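	// The state is currently word-sliced: register n holds word n of all
	// four blocks. The zip1/zip2 sequences below transpose it so that each
	// register ends up holding 16 contiguous bytes of one block, matching
	// the layout of the input and output buffers.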
	// interleave 32-bit words in state n, n+1
	zip1		v16.4s, v0.4s, v1.4s
	zip2		v17.4s, v0.4s, v1.4s
	zip1		v18.4s, v2.4s, v3.4s
	zip2		v19.4s, v2.4s, v3.4s
	zip1		v20.4s, v4.4s, v5.4s
	zip2		v21.4s, v4.4s, v5.4s
	zip1		v22.4s, v6.4s, v7.4s
	zip2		v23.4s, v6.4s, v7.4s
	zip1		v24.4s, v8.4s, v9.4s
	zip2		v25.4s, v8.4s, v9.4s
	zip1		v26.4s, v10.4s, v11.4s
	zip2		v27.4s, v10.4s, v11.4s
	zip1		v28.4s, v12.4s, v13.4s
	zip2		v29.4s, v12.4s, v13.4s
	zip1		v30.4s, v14.4s, v15.4s
	zip2		v31.4s, v14.4s, v15.4s

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	ld1		{v16.16b-v19.16b}, [x2], #64

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	ld1		{v20.16b-v23.16b}, [x2], #64

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	ld1		{v24.16b-v27.16b}, [x2], #64

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	ld1		{v28.16b-v31.16b}, [x2]
	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	eor		v28.16b, v28.16b, v12.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

	ret
ENDPROC(chacha20_4block_xor_neon)
CTRINC:	.word		0, 1, 2, 3
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
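	// CTRINC supplies the per-block counter offsets 0-3. ROT8 is a tbl
	// index vector: within each 32-bit lane it selects bytes (3, 0, 1, 2),
	// which on little-endian data is a rotate left by 8 bits.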