1 /* SPDX-License-Identifier: GPL-2.0 OR MIT */
3 * Copyright (C) 2016-2018 René van Dorst <opensource@vdorst.com>. All Rights Reserved.
4 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
/* Block size in bytes: the step applied to IN, OUT and BYTES for every block processed below. */
8 #define CHACHA20_BLOCK_SIZE 64
27 /* Use regs which are overwritten on exit for Tx so we don't leak clear data. */
40 /* NONCE[0] is kept in a register and not in memory.
41 * We don't want to touch the original value in memory.
42 * It must be incremented on every loop iteration.
46 /* SAVED_X and SAVED_CA are set in the jump table.
47 * Use regs which are overwritten on exit so that we don't leak clear data.
48 * They are used for handling the last bytes, which are not a multiple of 4.
/* Holds (IN | OUT) & 0x3 in chacha_crypt_arch; a non-zero value selects the unaligned store path. */
53 #define IS_UNALIGNED $s7
/* Byte-order dependent helpers: what follows this test is used when the target is big-endian. */
55 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
59 #define ROTR(n) rotr n, 24
/*
 * CPU_TO_LE32(n) is applied to every output word by the STORE_* macros.
 * It is defined twice; the second definition is empty, so it expands to
 * nothing in that configuration.
 * NOTE(review): presumably a CPU to little-endian conversion of register n -
 * confirm against the macro body.
 */
60 #define CPU_TO_LE32(n) \
67 #define CPU_TO_LE32(n)
/*
 * FOR_EACH_WORD(x) and FOR_EACH_WORD_REV(x) take the name of another macro
 * as x. Below they are used as FOR_EACH_WORD(JMPTBL_*) and
 * FOR_EACH_WORD_REV(STORE_*).
 * NOTE(review): presumably x is expanded once per word index, in ascending
 * and descending order respectively - confirm against the macro bodies.
 */
71 #define FOR_EACH_WORD(x) \
89 #define FOR_EACH_WORD_REV(x) \
116 #define PLUS_ONE_9 10
117 #define PLUS_ONE_10 11
118 #define PLUS_ONE_11 12
119 #define PLUS_ONE_12 13
120 #define PLUS_ONE_13 14
121 #define PLUS_ONE_14 15
122 #define PLUS_ONE_15 16
/* PLUS_ONE(x) pastes x onto PLUS_ONE_, selecting the table entry whose value is x + 1. */
123 #define PLUS_ONE(x) PLUS_ONE_ ## x
/*
 * CONCAT3 pastes three tokens into one. Going through _CONCAT3 lets
 * arguments such as PLUS_ONE(x) expand before ## is applied.
 */
124 #define _CONCAT3(a,b,c) a ## b ## c
125 #define CONCAT3(a,b,c) _CONCAT3(a,b,c)
/*
 * STORE_UNALIGNED(x): output step for word x when IN or OUT is not word
 * aligned.
 *  - Emits the label .Lchacha_mips_xor_unaligned_<x+1>_b in front of the
 *    code for word x; the JMPTBL_UNALIGNED entries branch to these labels.
 *  - Loads state word x into T0 and input word x into T1. The lwl/lwr pair
 *    assembles the 32-bit input word from an address that need not be
 *    aligned.
 *  - Adds into X<x>, applies CPU_TO_LE32 and writes X<x> to OUT with the
 *    matching swl/swr pair.
 * MSB and LSB are the byte offsets used by the left/right accesses.
 * NOTE(review): the add shown uses NONCE_0 while T0 is loaded just above;
 * confirm which word indices add NONCE_0 and which add T0.
 */
127 #define STORE_UNALIGNED(x) \
128 CONCAT3(.Lchacha_mips_xor_unaligned_, PLUS_ONE(x), _b: ;) \
130 lw T0, (x*4)(STATE); \
132 lwl T1, (x*4)+MSB ## (IN); \
133 lwr T1, (x*4)+LSB ## (IN); \
135 addu X ## x, NONCE_0; \
139 CPU_TO_LE32(X ## x); \
141 swl X ## x, (x*4)+MSB ## (OUT); \
142 swr X ## x, (x*4)+LSB ## (OUT);
/*
 * STORE_ALIGNED(x): output step for word x when both IN and OUT are word
 * aligned.
 *  - Emits the label .Lchacha_mips_xor_aligned_<x+1>_b in front of the code
 *    for word x; the JMPTBL_ALIGNED entries branch to these labels.
 *  - Loads state word x into T0 and input word x into T1 with plain lw.
 *  - Adds into X<x>, applies CPU_TO_LE32 and writes X<x> to OUT with sw.
 * NOTE(review): the add shown uses NONCE_0 while T0 is loaded just above;
 * confirm which word indices add NONCE_0 and which add T0.
 */
144 #define STORE_ALIGNED(x) \
145 CONCAT3(.Lchacha_mips_xor_aligned_, PLUS_ONE(x), _b: ;) \
147 lw T0, (x*4)(STATE); \
149 lw T1, (x*4) ## (IN); \
151 addu X ## x, NONCE_0; \
155 CPU_TO_LE32(X ## x); \
157 sw X ## x, (x*4) ## (OUT);
160 * Used for setup and for handling the last bytes, which are not a multiple of 4.
161 * X15 is free to store Xn.
162 * Every jump table entry must be equal in size.
/*
 * JMPTBL_ALIGNED(x): jump table entry x of the aligned path.
 * Defines .Lchacha_mips_jmptbl_aligned_<x> and branches to
 * .Lchacha_mips_xor_aligned_<x>_b, which is the label emitted by
 * STORE_ALIGNED(x - 1), or the explicit ..._0_b label in chacha_crypt_arch
 * when x is 0. SAVED_X is set from X<x> on the way.
 * NOTE(review): two addu forms are shown, adding NONCE_0 or SAVED_CA;
 * confirm which word indices use which form.
 */
164 #define JMPTBL_ALIGNED(x) \
165 .Lchacha_mips_jmptbl_aligned_ ## x: ; \
167 b .Lchacha_mips_xor_aligned_ ## x ## _b; \
169 addu SAVED_X, X ## x, NONCE_0; \
171 addu SAVED_X, X ## x, SAVED_CA; \
/*
 * JMPTBL_UNALIGNED(x): jump table entry x of the unaligned path.
 * Defines .Lchacha_mips_jmptbl_unaligned_<x> and branches to
 * .Lchacha_mips_xor_unaligned_<x>_b, which is the label emitted by
 * STORE_UNALIGNED(x - 1), or the explicit ..._0_b label in chacha_crypt_arch
 * when x is 0. SAVED_X is set from X<x> on the way.
 * NOTE(review): two addu forms are shown, adding NONCE_0 or SAVED_CA;
 * confirm which word indices use which form.
 */
175 #define JMPTBL_UNALIGNED(x) \
176 .Lchacha_mips_jmptbl_unaligned_ ## x: ; \
178 b .Lchacha_mips_xor_unaligned_ ## x ## _b; \
180 addu SAVED_X, X ## x, NONCE_0; \
182 addu SAVED_X, X ## x, SAVED_CA; \
/*
 * AXR: building block of the round loops. Both functions below invoke it
 * eight times per loop iteration, passing numbers in the range 0..15 for
 * A..D, K..N and V..Z, and one of 16, 12, 8 or 7 for S.
 * NOTE(review): S is presumably the rotate amount of an add/xor/rotate step
 * applied to four word pairs at once - confirm against the macro body.
 */
186 #define AXR(A, B, C, D, K, L, M, N, V, W, Y, Z, S) \
/*
 * chacha_crypt_arch
 *
 * Register roles come from the STATE, IN, OUT, BYTES and NONCE_0 macros.
 * Flow, as far as it can be read from the code below:
 *  - branch to .Lchacha_mips_end straight away when BYTES is zero;
 *  - keep state word 12 (byte offset 48 of STATE) in NONCE_0 and store it
 *    back to the same offset on the way out;
 *  - per block: run the AXR round loop, subtract one block from BYTES, then
 *    write the block through the STORE_ALIGNED or STORE_UNALIGNED sequence
 *    depending on IS_UNALIGNED;
 *  - a trailing partial block goes through the jump table emitted by
 *    FOR_EACH_WORD(JMPTBL_*), and the last bytes that do not fill a word
 *    are handled at .Lchacha_mips_xor_bytes.
 */
203 .globl chacha_crypt_arch
204 .ent chacha_crypt_arch
206 .frame $sp, STACK_SIZE, $ra
208 /* Load number of rounds */
211 addiu $sp, -STACK_SIZE
213 /* Return early if BYTES == 0. */
214 beqz BYTES, .Lchacha_mips_end
/* Keep state word 12 (byte offset 48) in NONCE_0; it is stored back to the same offset below. */
216 lw NONCE_0, 48(STATE)
228 /* Test whether IN or OUT is unaligned.
229 * IS_UNALIGNED = ( IN | OUT ) & 0x00000003
231 or IS_UNALIGNED, IN, OUT
232 andi IS_UNALIGNED, 0x3
234 b .Lchacha_rounds_start
/* Step IN and OUT forward by one block. */
238 addiu IN, CHACHA20_BLOCK_SIZE
239 addiu OUT, CHACHA20_BLOCK_SIZE
242 .Lchacha_rounds_start:
/* Round loop: the eight AXR steps below repeat while $at is non-zero. */
262 .Loop_chacha_xor_rounds:
264 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
265 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
266 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
267 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
268 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
269 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
270 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
271 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
272 bnez $at, .Loop_chacha_xor_rounds
/* Account for one block; BYTES is negative afterwards when less than a full block was left. */
274 addiu BYTES, -(CHACHA20_BLOCK_SIZE)
276 /* Is data src/dst unaligned? Jump */
277 bnez IS_UNALIGNED, .Loop_chacha_unaligned
279 /* Set number of rounds here to fill the delay slot. */
280 lw $at, (STACK_SIZE+16)($sp)
282 /* BYTES < 0, it has no full block. */
283 bltz BYTES, .Lchacha_mips_no_full_block_aligned
/* Full block: expand STORE_ALIGNED for the words of the block. */
285 FOR_EACH_WORD_REV(STORE_ALIGNED)
287 /* BYTES > 0? Loop again. */
288 bgtz BYTES, .Loop_chacha_rounds
290 /* Place this here to fill delay slot */
293 /* BYTES < 0? Handle last bytes */
294 bltz BYTES, .Lchacha_mips_xor_bytes
296 .Lchacha_mips_xor_done:
297 /* Restore used registers */
307 /* Write NONCE_0 back to right location in state */
308 sw NONCE_0, 48(STATE)
311 addiu $sp, STACK_SIZE
314 .Lchacha_mips_no_full_block_aligned:
315 /* Restore the offset on BYTES */
316 addiu BYTES, CHACHA20_BLOCK_SIZE
318 /* Get number of full WORDS */
319 andi $at, BYTES, MASK_U32
321 /* Load upper half of jump table addr */
322 lui T0, %hi(.Lchacha_mips_jmptbl_aligned_0)
324 /* Calculate lower half jump table offset */
327 /* Add offset to STATE */
330 /* Add lower half jump table addr */
331 addiu T0, %lo(.Lchacha_mips_jmptbl_aligned_0)
333 /* Read value from STATE */
336 /* Store remaining bytecounter as negative value */
337 subu BYTES, $at, BYTES
/* Jump table of the aligned path, one JMPTBL_ALIGNED entry per expansion. */
342 FOR_EACH_WORD(JMPTBL_ALIGNED)
345 .Loop_chacha_unaligned:
346 /* Set number of rounds here to fill the delay slot. */
347 lw $at, (STACK_SIZE+16)($sp)
349 /* BYTES < 0, it has no full block. */
350 bltz BYTES, .Lchacha_mips_no_full_block_unaligned
/* Full block: expand STORE_UNALIGNED for the words of the block. */
352 FOR_EACH_WORD_REV(STORE_UNALIGNED)
354 /* BYTES > 0? Loop again. */
355 bgtz BYTES, .Loop_chacha_rounds
357 /* Write NONCE_0 back to right location in state */
358 sw NONCE_0, 48(STATE)
361 /* Fall through to byte handling */
362 bgez BYTES, .Lchacha_mips_xor_done
/* Branch targets of JMPTBL_UNALIGNED(0) and JMPTBL_ALIGNED(0); they sit after all STORE_* labels. */
363 .Lchacha_mips_xor_unaligned_0_b:
364 .Lchacha_mips_xor_aligned_0_b:
365 /* Place this here to fill delay slot */
/* Handle the last bytes, which do not fill a complete word. */
369 .Lchacha_mips_xor_bytes:
379 beqz $at, .Lchacha_mips_xor_done
386 beqz $at, .Lchacha_mips_xor_done
392 b .Lchacha_mips_xor_done
394 .Lchacha_mips_no_full_block_unaligned:
395 /* Restore the offset on BYTES */
396 addiu BYTES, CHACHA20_BLOCK_SIZE
398 /* Get number of full WORDS */
399 andi $at, BYTES, MASK_U32
401 /* Load upper half of jump table addr */
402 lui T0, %hi(.Lchacha_mips_jmptbl_unaligned_0)
404 /* Calculate lower half jump table offset */
407 /* Add offset to STATE */
410 /* Add lower half jump table addr */
411 addiu T0, %lo(.Lchacha_mips_jmptbl_unaligned_0)
413 /* Read value from STATE */
416 /* Store remaining bytecounter as negative value */
417 subu BYTES, $at, BYTES
/* Jump table of the unaligned path, one JMPTBL_UNALIGNED entry per expansion. */
422 FOR_EACH_WORD(JMPTBL_UNALIGNED)
423 .end chacha_crypt_arch
/*
 * hchacha_block_arch
 *
 * Reserves a STACK_SIZE frame, runs the same sequence of eight AXR steps as
 * chacha_crypt_arch in a loop that repeats while $a2 is non-zero, then
 * restores what it saved and releases the frame.
 * NOTE(review): $a2 is presumably the remaining round count - confirm where
 * it is loaded and decremented.
 */
443 .globl hchacha_block_arch
444 .ent hchacha_block_arch
446 .frame $sp, STACK_SIZE, $ra
448 addiu $sp, -STACK_SIZE
/* Round loop: the eight AXR steps below repeat while $a2 is non-zero. */
470 .Loop_hchacha_xor_rounds:
472 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 16);
473 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 12);
474 AXR( 0, 1, 2, 3, 4, 5, 6, 7, 12,13,14,15, 8);
475 AXR( 8, 9,10,11, 12,13,14,15, 4, 5, 6, 7, 7);
476 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 16);
477 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 12);
478 AXR( 0, 1, 2, 3, 5, 6, 7, 4, 15,12,13,14, 8);
479 AXR(10,11, 8, 9, 15,12,13,14, 5, 6, 7, 4, 7);
480 bnez $a2, .Loop_hchacha_xor_rounds
482 /* Restore used register */
494 addiu $sp, STACK_SIZE
496 .end hchacha_block_arch