1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * BLAKE2s digest algorithm, ARM scalar implementation
5 * Copyright 2020 Google LLC
7 * Author: Eric Biggers <ebiggers@google.com>
10 #include <linux/linkage.h>
11 #include <asm/assembler.h>
13 // Registers used to hold message words temporarily. There aren't
14 // enough ARM registers to hold the whole message block, so we have to
15 // load the words on-demand.
19 // The BLAKE2s initialization vector
// Eight 32-bit words IV[0..7].  blake2s_compress addresses this table through
// the .Lblake2s_IV label to build v[8..15].
// NOTE(review): the .Lblake2s_IV label definition itself is not visible in
// this view of the file - confirm it sits directly above the first .word.
21 .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A // IV[0..3]
22 .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 // IV[4..7]
// __ldrd a, b, src, offset
//
// Load two consecutive 32-bit words into registers a and b:
//	a = word at [src + offset]
//	b = word at [src + offset + 4]
//
// When __LINUX_ARM_ARCH__ >= 6 this is a single ldrd; the two plain ldr
// instructions perform the same pair of loads one word at a time.
//
// NOTE(review): no #else / #endif / .endm is visible in this view of the
// file, so the two ldr lines are presumably the fallback branch for older
// architectures - confirm against the full source.
24 .macro __ldrd a, b, src, offset
25 #if __LINUX_ARM_ARCH__ >= 6
26 ldrd \a, \b, [\src, #\offset]
28 ldr \a, [\src, #\offset]
29 ldr \b, [\src, #\offset + 4]
// __strd a, b, dst, offset
//
// Store registers a and b to two consecutive 32-bit words:
//	word at [dst + offset]     = a
//	word at [dst + offset + 4] = b
//
// When __LINUX_ARM_ARCH__ >= 6 this is a single strd; the two plain str
// instructions perform the same pair of stores one word at a time.
//
// NOTE(review): no #else / #endif / .endm is visible in this view of the
// file, so the two str lines are presumably the fallback branch for older
// architectures - confirm against the full source.
33 .macro __strd a, b, dst, offset
34 #if __LINUX_ARM_ARCH__ >= 6
35 strd \a, \b, [\dst, #\offset]
37 str \a, [\dst, #\offset]
38 str \b, [\dst, #\offset + 4]
// _le32_bswap a, tmp
//
// NOTE(review): only the macro header is visible in this view.  Judging by
// the name, it presumably converts the little-endian 32-bit word in register
// a to CPU byte order, with tmp available as a scratch register - confirm
// against the full file.
42 .macro _le32_bswap a, tmp
// _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
//
// Takes eight registers plus one scratch register.  blake2s_compress invokes
// it twice on r2-r9 (tmp = r14) while copying the message block to the stack.
//
// NOTE(review): only the macro header is visible in this view; presumably it
// applies _le32_bswap to each of the eight registers in turn - confirm
// against the full file.
48 .macro _le32_bswap_8x a, b, c, d, e, f, g, h, tmp
59 // Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
60 // (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
61 // columns/diagonals. s0-s1 are the word offsets to the message words the first
62 // column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
63 // M_0 and M_1 are free to use, and the message block can be found at sp + 32.
65 // Note that to save instructions, the rotations don't happen when the
66 // pseudocode says they should, but rather they are delayed until the values are
67 // used. See the comment above _blake2s_round().
//
// The two columns/diagonals are processed in lock-step, one instruction for
// each at every step.  On entry the b registers still need a rotate right by
// 'brot' bits and the d registers by 'drot' bits.  On exit the b registers
// still need a rotate right by 7 and the d registers by 8.
//
// NOTE(review): M_0 and M_1 are loaded below, but no instruction that adds
// them into a0/a1 is visible in this view, although the pseudocode comments
// include the m[...] term - confirm against the full file.
68 .macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
// Fetch the first message word of each column/diagonal (word index s0 / s2).
70 ldr M_0, [sp, #32 + 4 * \s0]
71 ldr M_1, [sp, #32 + 4 * \s2]
73 // a += b + m[blake2s_sigma[r][2*i + 0]];
// The pending rotation of b is applied by the shifted operand.
74 add \a0, \a0, \b0, ror #brot
75 add \a1, \a1, \b1, ror #brot
79 // d = ror32(d ^ a, 16);
// Only the xor is done here; d is left needing a rotate right by 16.
80 eor \d0, \a0, \d0, ror #drot
81 eor \d1, \a1, \d1, ror #drot
// c += d;  (applies the rotate by 16 that d still needs)
84 add \c0, \c0, \d0, ror #16
85 add \c1, \c1, \d1, ror #16
87 // b = ror32(b ^ c, 12);
// Only the xor is done here; b is left needing a rotate right by 12.
88 eor \b0, \c0, \b0, ror #brot
89 eor \b1, \c1, \b1, ror #brot
// Fetch the second message word of each column/diagonal (word index s1 / s3).
91 ldr M_0, [sp, #32 + 4 * \s1]
92 ldr M_1, [sp, #32 + 4 * \s3]
94 // a += b + m[blake2s_sigma[r][2*i + 1]];
95 add \a0, \a0, \b0, ror #12
96 add \a1, \a1, \b1, ror #12
100 // d = ror32(d ^ a, 8);
// The shifted operand applies the rotate by 16 still owed from the first
// half; the new d is left needing a rotate right by 8.
101 eor \d0, \a0, \d0, ror#16
102 eor \d1, \a1, \d1, ror#16
// c += d;  (applies the rotate by 8 that d still needs)
105 add \c0, \c0, \d0, ror#8
106 add \c1, \c1, \d1, ror#8
108 // b = ror32(b ^ c, 7);
// The shifted operand applies the rotate by 12 still owed; the new b is left
// needing a rotate right by 7.
109 eor \b0, \c0, \b0, ror#12
110 eor \b1, \c1, \b1, ror#12
113 // Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
114 // are in r0..r9. The stack pointer points to 8 bytes of scratch space for
115 // spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and
116 // r14 are free to use. The macro arguments s0-s15 give the order in which the
117 // message words are used in this round.
//
// Stack layout assumed by the sp-relative offsets used below:
//	sp +  0: scratch slot for v[8], v[9]
//	sp +  8: v[10], v[11]
//	sp + 16: v[12], v[13]
//	sp + 24: v[14], v[15]
//	sp + 32: message block (read by _blake2s_quarterround)
//
119 // All rotates are performed using the implicit rotate operand accepted by the
120 // 'add' and 'eor' instructions. This is faster than using explicit rotate
121 // instructions. To make this work, we allow the values in the second and last
122 // rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
123 // wrong rotation amount. The rotation amount is then fixed up just in time
124 // when the values are used. 'brot' is the number of bits the values in row 'b'
125 // need to be rotated right to arrive at the correct values, and 'drot'
126 // similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
127 // that they end up as (7, 8) after every round.
//
// NOTE(review): in this view the lines following the '\' continuations of the
// first three _blake2s_quarterround invocations (their message-index
// arguments) are not visible, nor are any updates of brot/drot or a closing
// .endm; only the last invocation shows its arguments (s12..s15).  Confirm
// against the full file.
128 .macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
129 s8, s9, s10, s11, s12, s13, s14, s15
131 // Mix first two columns:
132 // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
133 __ldrd r10, r11, sp, 16 // load v[12] and v[13]
134 _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
137 __strd r10, r11, sp, 16 // store v[12] and v[13]
139 // Mix second two columns:
140 // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
141 __ldrd r8, r9, sp, 8 // load v[10] and v[11]
142 __ldrd r10, r11, sp, 24 // load v[14] and v[15]
143 _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
145 str r10, [sp, #24] // store v[14]
146 // v[10], v[11], and v[15] are used below, so no need to store them yet.
151 // Mix first two diagonals:
152 // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
// v[10], v[11] and v[15] are still live in r8, r9 and r11 from the column
// step above; only v[12] has to be reloaded.
153 ldr r10, [sp, #16] // load v[12]
154 _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
160 // Mix second two diagonals:
161 // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
162 __ldrd r8, r9, sp, 0 // load v[8] and v[9]
163 __ldrd r10, r11, sp, 20 // load v[13] and v[14]
164 _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
165 \s12, \s13, \s14, \s15
166 __strd r10, r11, sp, 20 // store v[13] and v[14]
170 // void blake2s_compress(struct blake2s_state *state,
171 // const u8 *block, size_t nblocks, u32 inc);
173 // Only the first three fields of struct blake2s_state are used:
//	h[0..7] at byte offset 0  (loaded, updated and stored back)
//	t[0..1] at byte offset 32 (loaded and stored back)
//	f[0..1] at byte offset 40 (only loaded)
//
// NOTE(review): this view of the file omits a number of lines (for example
// the .Lnext_block label targeted by the bne near the end is not visible).
// The comments below describe only the instructions that are shown.
179 ENTRY(blake2s_compress)
// 12 registers = 48 bytes, so sp keeps the same alignment modulo 8 that it had
// on entry.  r0-r2 (state, block, nblocks) are saved so that they can be
// reloaded from the stack between blocks.
180 push {r0-r2,r4-r11,lr} // keep this an even number
187 // Load and increment the counter t[0..1].
// NOTE(review): the instructions that add 'inc' to r10:r11 between the load
// and the store are not visible in this view.
188 __ldrd r10, r11, r0, 32
191 __strd r10, r11, r0, 32
193 // _blake2s_round is very short on registers, so copy the message block
194 // to the stack to save a register during the rounds. This also has the
195 // advantage that misalignment only needs to be dealt with in one place.
// Take the slow path when the block pointer is not 4-byte aligned (the
// flag-setting instruction for this branch is not visible in this view).
199 bne .Lcopy_block_misaligned
// The message words pass through r2-r9 eight at a time, with r14 as scratch.
201 _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
204 _le32_bswap_8x r2, r3, r4, r5, r6, r7, r8, r9, r14
// Presumably overwrites the copy of r1 ('block') saved by the entry push,
// located just above the 64-byte block copy; it is reloaded by the
// 'ldm sp, {r0, r1, r2}' near the end - confirm against the full file.
207 str r1, [sp, #68] // Update message pointer
209 // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space
210 // for spilling v[8..9]. Leave v[8..9] in r8-r9.
211 mov r14, r0 // r14 = state
212 adr r12, .Lblake2s_IV
213 ldmia r12!, {r8-r9} // load IV[0..1]; r12 now points to IV[2]
214 __ldrd r0, r1, r14, 40 // load f[0..1]
215 ldm r12, {r2-r7} // load IV[2..7]
216 eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
217 eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
218 eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
219 eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
220 push {r2-r7} // push v[10..15]
221 sub sp, sp, #8 // leave space for v[8..9]
223 // Load h[0..7] == v[0..7].
// NOTE(review): the load instruction itself is not visible in this view.
226 // Execute the rounds. Each round is provided the order in which it
227 // needs to use the message words.
// Ten rounds; each argument list is a permutation of the message word
// indices 0..15.
230 _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
231 _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
232 _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
233 _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
234 _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
235 _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
236 _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
237 _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
238 _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
239 _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
241 // Fold the final state matrix into the hash chaining value:
243 // for (i = 0; i < 8; i++)
244 // h[i] ^= v[i] ^ v[i + 8];
// 96 = 8 (v[8..9] scratch) + 24 (v[10..15]) + 64 (message block copy), i.e.
// the slot holding the r0 ('state') value pushed at entry.
246 ldr r14, [sp, #96] // r14 = &h[0]
247 add sp, sp, #8 // v[8..9] are already loaded.
248 pop {r10-r11} // load v[10..11]
// NOTE(review): the eor instructions combining v[0..3], v[8..11] and
// h[0..3] are not visible in this view.
253 ldm r14, {r8-r11} // load h[0..3]
258 stmia r14!, {r0-r3} // store new h[0..3]
259 ldm r14, {r0-r3} // load old h[4..7]
260 pop {r8-r11} // load v[12..15]
// v[4..7] are row 'b' and v[12..15] are row 'd', so the rotations still
// pending after the last round are applied here by the shifted operands.
261 eor r0, r0, r4, ror #brot
262 eor r1, r1, r5, ror #brot
263 eor r2, r2, r6, ror #brot
264 eor r3, r3, r7, ror #brot
265 eor r0, r0, r8, ror #drot
266 eor r1, r1, r9, ror #drot
267 eor r2, r2, r10, ror #drot
268 eor r3, r3, r11, ror #drot
269 add sp, sp, #64 // skip copy of message block
270 stm r14, {r0-r3} // store new h[4..7]
272 // Advance to the next block, if there is one. Note that if there are
273 // multiple blocks, then 'inc' (the counter increment amount) must be
274 // 64. So we can simply set it to 64 without re-loading it.
// sp is back at the registers pushed on entry; the saved 'block' slot was
// updated earlier by the 'Update message pointer' store.
275 ldm sp, {r0, r1, r2} // load (state, block, nblocks)
276 mov r3, #64 // set 'inc'
277 subs r2, r2, #1 // nblocks--
279 bne .Lnext_block // nblocks != 0?
// Restore the callee-saved registers and return by popping the saved lr
// into pc.
281 pop {r0-r2,r4-r11,pc}
283 // The next message block (pointed to by r1) isn't 4-byte aligned, so it
284 // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
285 // by r12) using an alternative method. r2-r9 are free to use.
286 .Lcopy_block_misaligned:
289 #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
// Merge r4, r5 and r6 into bits 8-15, 16-23 and 24-31 of r3.  Presumably
// r3-r6 each hold one byte of the message here, so this assembles a
// little-endian 32-bit word; the loads are not visible in this view -
// confirm against the full file.
298 orr r3, r3, r4, lsl #8
299 orr r3, r3, r5, lsl #16
300 orr r3, r3, r6, lsl #24
306 ENDPROC(blake2s_compress)