// SPDX-License-Identifier: GPL-2.0

#include <linux/linkage.h>
#include <asm/assembler.h>
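
	.text

/*
 * Register aliases. The only hard constraint (see the comment below) is to
 * stay out of d8-d15, which are callee-saved in userspace; the concrete
 * assignment here is one valid choice under that constraint.
 */
#define state0		v0
#define state1		v1
#define state2		v2
#define state3		v3
#define copy0		v4
#define copy1		v5
#define copy2		v6
#define copy3		v7
#define copy3_d		d7
#define one_d		d16
#define one_v		v16
#define tmp		v17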

/*
 * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and 8-byte
 * counter. Importantly does not spill to the stack.
 *
 * This implementation avoids d8-d15 because they are callee-saved in user
 * space.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *				       const uint8_t *key,
 *				       uint32_t *counter,
 *				       size_t nblocks)
 *
 *	x0: output bytes
 *	x1: 32-byte key input
 *	x2: 8-byte counter input/output
 *	x3: number of 64-byte blocks to write to output
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

	/* copy0 = "expand 32-byte k" */
	mov_q		x8, 0x3320646e61707865
	mov_q		x9, 0x6b20657479622d32
	mov		copy0.d[0], x8
	mov		copy0.d[1], x9
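	/*
	 * The two immediates are the ChaCha constant "expand 32-byte k" read
	 * as little-endian 64-bit words ("expand 3" and "2-byte k").
	 */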

	/* copy1,copy2 = key */
	ld1		{ copy1.4s, copy2.4s }, [x1]
	/* copy3 = counter || zero nonce */
	ld1		{ copy3.2s }, [x2]

	movi		one_v.2s, #1
	uzp1		one_v.4s, one_v.4s, one_v.4s
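	/*
	 * one_v is now { 1, 0, 1, 0 }: movi set the two low 32-bit lanes to 1
	 * (clearing the upper half of the register), and uzp1 interleaved the
	 * even lanes, so one_d holds the 64-bit value 1 for the counter
	 * increment at the bottom of the block loop.
	 */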

.Lblock:
	/* copy state to auxiliary vectors for the final add after the permute. */
	mov		state0.16b, copy0.16b
	mov		state1.16b, copy1.16b
	mov		state2.16b, copy2.16b
	mov		state3.16b, copy3.16b

	/* 20 rounds, two per pass of the loop below (any scratch reg works; w4 here). */
	mov		w4, 20
.Lpermute:
	/*
	 * Permute one 64-byte block where the state matrix is stored in the
	 * four NEON registers state0-state3. It performs matrix operations
	 * on four words in parallel, but requires shuffling to rearrange the
	 * words after each round.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h
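	/*
	 * rev32 on 16-bit lanes swaps the two halves of every 32-bit word,
	 * which is exactly rotl32(x, 16) and needs no temporary register.
	 */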

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20
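	/*
	 * The shl/sri pair builds the rotate from two shifts: shl writes
	 * tmp << 12 with zeroed low bits, then sri (shift right and insert)
	 * merges in tmp >> 20, giving rotl32(tmp, 12). The 8- and 7-bit
	 * rotations below use the same idiom.
	 */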

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	ext		state1.16b, state1.16b, state1.16b, #4
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	ext		state3.16b, state3.16b, state3.16b, #12
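	/*
	 * ext with the same register as both sources is a lane rotation (it
	 * extracts 16 bytes from the doubled register at the given byte
	 * offset). Rotating rows 1, 2 and 3 by one, two and three lanes lines
	 * the diagonals of the state matrix up in columns, so the next four
	 * quarter-rounds mix diagonals instead of columns.
	 */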

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	ext		state1.16b, state1.16b, state1.16b, #12
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	ext		state3.16b, state3.16b, state3.16b, #4
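
	/* Two rounds per pass; loop until all 20 are done. */
	subs		w4, w4, #2
	b.ne		.Lpermute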

	/* output0 = state0 + copy0 */
	add		state0.4s, state0.4s, copy0.4s
	/* output1 = state1 + copy1 */
	add		state1.4s, state1.4s, copy1.4s
	/* output2 = state2 + copy2 */
	add		state2.4s, state2.4s, copy2.4s
	/* output3 = state3 + copy3 */
	add		state3.4s, state3.4s, copy3.4s
	st1		{ state0.16b - state3.16b }, [x0]

	/*
	 * ++copy3.counter, the 'add' clears the upper half of the SIMD
	 * register which is the expected behaviour here.
	 */
	add		copy3_d, copy3_d, one_d
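	/*
	 * Since one_d reads as the 64-bit value 1, this is a single 64-bit
	 * increment of the counter, and the scalar write to copy3_d re-zeroes
	 * the nonce lanes in the upper half of copy3.
	 */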

	/* output += 64, --nblocks */
	add		x0, x0, 64
	subs		x3, x3, #1
	b.ne		.Lblock

	/* counter = copy3.counter */
	st1		{ copy3.2s }, [x2]

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. copy0 holds only public constants and copy3 only the
	 * counter, which is written back above, so neither needs clearing.
	 */
	movi		state0.16b, #0
	movi		state1.16b, #0
	movi		state2.16b, #0
	movi		state3.16b, #0
	movi		copy1.16b, #0
	movi		copy2.16b, #0
	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)

emit_aarch64_feature_1_and