1 // Monocypher version 4.0.2
2 //
3 // This file is dual-licensed. Choose whichever licence you want from
4 // the two licences listed below.
5 //
6 // The first licence is a regular 2-clause BSD licence. The second licence
7 // is the CC-0 from Creative Commons. It is intended to release Monocypher
8 // to the public domain. The BSD licence serves as a fallback option.
9 //
10 // SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
12 // ------------------------------------------------------------------------
14 // Copyright (c) 2017-2020, Loup Vaillant
15 // All rights reserved.
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
22 // 1. Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
25 // 2. Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the
28 // distribution.
30 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 // ------------------------------------------------------------------------
44 // Written in 2017-2020 by Loup Vaillant
46 // To the extent possible under law, the author(s) have dedicated all copyright
47 // and related neighboring rights to this software to the public domain
48 // worldwide. This software is distributed without any warranty.
50 // You should have received a copy of the CC0 Public Domain Dedication along
51 // with this software. If not, see
52 // <https://creativecommons.org/publicdomain/zero/1.0/>
54 #include "monocypher.h"
56 #ifdef MONOCYPHER_CPP_NAMESPACE
57 namespace MONOCYPHER_CPP_NAMESPACE {
58 #endif
60 /////////////////
61 /// Utilities ///
62 /////////////////
63 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
64 #define FOR(i, start, end) FOR_T(size_t, i, start, end)
65 #define COPY(dst, src, size) FOR(_i_, 0, size) (dst)[_i_] = (src)[_i_]
66 #define ZERO(buf, size) FOR(_i_, 0, size) (buf)[_i_] = 0
67 #define WIPE_CTX(ctx) crypto_wipe(ctx , sizeof(*(ctx)))
68 #define WIPE_BUFFER(buffer) crypto_wipe(buffer, sizeof(buffer))
69 #define MC_MIN(a, b) ((a) <= (b) ? (a) : (b))
70 #define MC_MAX(a, b) ((a) >= (b) ? (a) : (b))
72 typedef int8_t i8;
73 typedef uint8_t u8;
74 typedef int16_t i16;
75 typedef uint32_t u32;
76 typedef int32_t i32;
77 typedef int64_t i64;
78 typedef uint64_t u64;
80 static const u8 zero[128] = {0};
82 // returns the smallest non-negative integer y such that
83 // (x + y) % pow_2 == 0
84 // Basically, y is the "gap" missing to align x.
85 // Only works when pow_2 is a power of 2.
86 // Note: we use ~x+1 instead of -x to avoid compiler warnings
87 static size_t gap(size_t x, size_t pow_2)
89 return (~x + 1) & (pow_2 - 1);
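// Example: gap(13, 16) == 3, since 13 + 3 is the next multiple of 16;
// gap(16, 16) == 0, because 16 is already aligned.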
92 static u32 load24_le(const u8 s[3])
94 return
95 ((u32)s[0] << 0) |
96 ((u32)s[1] << 8) |
97 ((u32)s[2] << 16);
100 static u32 load32_le(const u8 s[4])
102 return
103 ((u32)s[0] << 0) |
104 ((u32)s[1] << 8) |
105 ((u32)s[2] << 16) |
106 ((u32)s[3] << 24);
109 static u64 load64_le(const u8 s[8])
111 return load32_le(s) | ((u64)load32_le(s+4) << 32);
114 static void store32_le(u8 out[4], u32 in)
116 out[0] = in & 0xff;
117 out[1] = (in >> 8) & 0xff;
118 out[2] = (in >> 16) & 0xff;
119 out[3] = (in >> 24) & 0xff;
122 static void store64_le(u8 out[8], u64 in)
124 store32_le(out , (u32)in );
125 store32_le(out + 4, in >> 32);
128 static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
129 FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
131 static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
132 FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
134 static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
135 FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
137 static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
138 FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
141 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
142 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
144 static int neq0(u64 diff)
146 // constant time comparison to zero
147 // return diff != 0 ? -1 : 0
148 u64 half = (diff >> 32) | ((u32)diff);
149 return (1 & ((half - 1) >> 32)) - 1;
152 static u64 x16(const u8 a[16], const u8 b[16])
154 return (load64_le(a + 0) ^ load64_le(b + 0))
155 | (load64_le(a + 8) ^ load64_le(b + 8));
157 static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
158 static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
159 int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
160 int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
161 int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }
163 void crypto_wipe(void *secret, size_t size)
165 volatile u8 *v_secret = (u8*)secret;
166 ZERO(v_secret, size);
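// Illustrative usage sketch (not part of Monocypher): checking a 16-byte
// authentication tag in constant time, then wiping the locally computed
// copy. The buffer names are hypothetical.
//
//     u8 expected[16];   /* tag we computed ourselves       */
//     u8 received[16];   /* tag that came with the message  */
//     if (crypto_verify16(expected, received) != 0) {
//         /* mismatch: reject the message */
//     }
//     crypto_wipe(expected, 16);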
169 /////////////////
170 /// Chacha 20 ///
171 /////////////////
172 #define QUARTERROUND(a, b, c, d) \
173 a += b; d = rotl32(d ^ a, 16); \
174 c += d; b = rotl32(b ^ c, 12); \
175 a += b; d = rotl32(d ^ a, 8); \
176 c += d; b = rotl32(b ^ c, 7)
178 static void chacha20_rounds(u32 out[16], const u32 in[16])
180 // The temporary variables make Chacha20 10% faster.
181 u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3];
182 u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7];
183 u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11];
184 u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15];
186 FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
187 QUARTERROUND(t0, t4, t8 , t12); // column 0
188 QUARTERROUND(t1, t5, t9 , t13); // column 1
189 QUARTERROUND(t2, t6, t10, t14); // column 2
190 QUARTERROUND(t3, t7, t11, t15); // column 3
191 QUARTERROUND(t0, t5, t10, t15); // diagonal 0
192 QUARTERROUND(t1, t6, t11, t12); // diagonal 1
193 QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
194 QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
196 out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3;
197 out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7;
198 out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11;
199 out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15;
202 static const u8 *chacha20_constant = (const u8*)"expand 32-byte k"; // 16 bytes
204 void crypto_chacha20_h(u8 out[32], const u8 key[32], const u8 in [16])
206 u32 block[16];
207 load32_le_buf(block , chacha20_constant, 4);
208 load32_le_buf(block + 4, key , 8);
209 load32_le_buf(block + 12, in , 4);
211 chacha20_rounds(block, block);
213 // prevent reversal of the rounds by revealing only half of the buffer.
214 store32_le_buf(out , block , 4); // constant
215 store32_le_buf(out+16, block+12, 4); // counter and nonce
216 WIPE_BUFFER(block);
219 u64 crypto_chacha20_djb(u8 *cipher_text, const u8 *plain_text,
220 size_t text_size, const u8 key[32], const u8 nonce[8],
221 u64 ctr)
223 u32 input[16];
224 load32_le_buf(input , chacha20_constant, 4);
225 load32_le_buf(input + 4, key , 8);
226 load32_le_buf(input + 14, nonce , 2);
227 input[12] = (u32) ctr;
228 input[13] = (u32)(ctr >> 32);
230 // Whole blocks
231 u32 pool[16];
232 size_t nb_blocks = text_size >> 6;
233 FOR (i, 0, nb_blocks) {
234 chacha20_rounds(pool, input);
235 if (plain_text != 0) {
236 FOR (j, 0, 16) {
237 u32 p = pool[j] + input[j];
238 store32_le(cipher_text, p ^ load32_le(plain_text));
239 cipher_text += 4;
240 plain_text += 4;
242 } else {
243 FOR (j, 0, 16) {
244 u32 p = pool[j] + input[j];
245 store32_le(cipher_text, p);
246 cipher_text += 4;
249 input[12]++;
250 if (input[12] == 0) {
251 input[13]++;
254 text_size &= 63;
256 // Last (incomplete) block
257 if (text_size > 0) {
258 if (plain_text == 0) {
259 plain_text = zero;
261 chacha20_rounds(pool, input);
262 u8 tmp[64];
263 FOR (i, 0, 16) {
264 store32_le(tmp + i*4, pool[i] + input[i]);
266 FOR (i, 0, text_size) {
267 cipher_text[i] = tmp[i] ^ plain_text[i];
269 WIPE_BUFFER(tmp);
271 ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);
273 WIPE_BUFFER(pool);
274 WIPE_BUFFER(input);
275 return ctr;
278 u32 crypto_chacha20_ietf(u8 *cipher_text, const u8 *plain_text,
279 size_t text_size,
280 const u8 key[32], const u8 nonce[12], u32 ctr)
282 u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
283 return (u32)crypto_chacha20_djb(cipher_text, plain_text, text_size,
284 key, nonce + 4, big_ctr);
287 u64 crypto_chacha20_x(u8 *cipher_text, const u8 *plain_text,
288 size_t text_size,
289 const u8 key[32], const u8 nonce[24], u64 ctr)
291 u8 sub_key[32];
292 crypto_chacha20_h(sub_key, key, nonce);
293 ctr = crypto_chacha20_djb(cipher_text, plain_text, text_size,
294 sub_key, nonce + 16, ctr);
295 WIPE_BUFFER(sub_key);
296 return ctr;
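// Illustrative usage sketch (not part of Monocypher): encrypting a buffer
// with XChaCha20. Decryption is the same call with the cipher text as
// input. Variable names are hypothetical; the nonce must never repeat for
// a given key, and XChaCha20 alone does not authenticate the message.
//
//     u8 key   [32];        /* secret, uniformly random          */
//     u8 nonce [24];        /* unique per message, may be public */
//     u8 plain [128] = {0};
//     u8 cipher[128];
//     crypto_chacha20_x(cipher, plain, sizeof(plain), key, nonce, 0);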
299 /////////////////
300 /// Poly 1305 ///
301 /////////////////
303 // h = (h + c) * r
304 // preconditions:
305 // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
306 // ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff
307 // end <= 1
308 // Postcondition:
309 // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
310 static void poly_blocks(crypto_poly1305_ctx *ctx, const u8 *in,
311 size_t nb_blocks, unsigned end)
313 // Local all the things!
314 const u32 r0 = ctx->r[0];
315 const u32 r1 = ctx->r[1];
316 const u32 r2 = ctx->r[2];
317 const u32 r3 = ctx->r[3];
318 const u32 rr0 = (r0 >> 2) * 5; // lose 2 bits...
319 const u32 rr1 = (r1 >> 2) + r1; // rr1 == (r1 >> 2) * 5
320 const u32 rr2 = (r2 >> 2) + r2; // rr2 == (r2 >> 2) * 5
321 const u32 rr3 = (r3 >> 2) + r3; // rr3 == (r3 >> 2) * 5
322 const u32 rr4 = r0 & 3; // ...recover 2 bits
323 u32 h0 = ctx->h[0];
324 u32 h1 = ctx->h[1];
325 u32 h2 = ctx->h[2];
326 u32 h3 = ctx->h[3];
327 u32 h4 = ctx->h[4];
329 FOR (i, 0, nb_blocks) {
330 // h + c, without carry propagation
331 const u64 s0 = (u64)h0 + load32_le(in); in += 4;
332 const u64 s1 = (u64)h1 + load32_le(in); in += 4;
333 const u64 s2 = (u64)h2 + load32_le(in); in += 4;
334 const u64 s3 = (u64)h3 + load32_le(in); in += 4;
335 const u32 s4 = h4 + end;
337 // (h + c) * r, without carry propagation
338 const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0;
339 const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1;
340 const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2;
341 const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3;
342 const u32 x4 = s4*rr4;
344 // partial reduction modulo 2^130 - 5
345 const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
346 const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff);
347 const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32);
348 const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32);
349 const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32);
350 const u32 u4 = (u3 >> 32) + (u5 & 3); // u4 <= 4
352 // Update the hash
353 h0 = u0 & 0xffffffff;
354 h1 = u1 & 0xffffffff;
355 h2 = u2 & 0xffffffff;
356 h3 = u3 & 0xffffffff;
357 h4 = u4;
359 ctx->h[0] = h0;
360 ctx->h[1] = h1;
361 ctx->h[2] = h2;
362 ctx->h[3] = h3;
363 ctx->h[4] = h4;
366 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
368 ZERO(ctx->h, 5); // Initial hash is zero
369 ctx->c_idx = 0;
370 // load r and pad (r has some of its bits cleared)
371 load32_le_buf(ctx->r , key , 4);
372 load32_le_buf(ctx->pad, key+16, 4);
373 FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
374 FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
377 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
378 const u8 *message, size_t message_size)
380 // Avoid undefined NULL pointer increments with empty messages
381 if (message_size == 0) {
382 return;
385 // Align ourselves with block boundaries
386 size_t aligned = MC_MIN(gap(ctx->c_idx, 16), message_size);
387 FOR (i, 0, aligned) {
388 ctx->c[ctx->c_idx] = *message;
389 ctx->c_idx++;
390 message++;
391 message_size--;
394 // If block is complete, process it
395 if (ctx->c_idx == 16) {
396 poly_blocks(ctx, ctx->c, 1, 1);
397 ctx->c_idx = 0;
400 // Process the message block by block
401 size_t nb_blocks = message_size >> 4;
402 poly_blocks(ctx, message, nb_blocks, 1);
403 message += nb_blocks << 4;
404 message_size &= 15;
406 // remaining bytes (we never complete a block here)
407 FOR (i, 0, message_size) {
408 ctx->c[ctx->c_idx] = message[i];
409 ctx->c_idx++;
413 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
415 // Process the last block (if any)
416 // We move the final 1 according to remaining input length
417 // (this will add less than 2^130 to the last input block)
418 if (ctx->c_idx != 0) {
419 ZERO(ctx->c + ctx->c_idx, 16 - ctx->c_idx);
420 ctx->c[ctx->c_idx] = 1;
421 poly_blocks(ctx, ctx->c, 1, 0);
424 // check if we should subtract 2^130-5 by performing the
425 // corresponding carry propagation.
426 u64 c = 5;
427 FOR (i, 0, 4) {
428 c += ctx->h[i];
429 c >>= 32;
431 c += ctx->h[4];
432 c = (c >> 2) * 5; // shift the carry back to the beginning
433 // c now indicates how many times we should subtract 2^130-5 (0 or 1)
434 FOR (i, 0, 4) {
435 c += (u64)ctx->h[i] + ctx->pad[i];
436 store32_le(mac + i*4, (u32)c);
437 c = c >> 32;
439 WIPE_CTX(ctx);
442 void crypto_poly1305(u8 mac[16], const u8 *message,
443 size_t message_size, const u8 key[32])
445 crypto_poly1305_ctx ctx;
446 crypto_poly1305_init (&ctx, key);
447 crypto_poly1305_update(&ctx, message, message_size);
448 crypto_poly1305_final (&ctx, mac);
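// Illustrative usage sketch (not part of Monocypher): producing a MAC.
// A Poly1305 key must be secret and used for one message only; AEAD
// constructions typically derive it from a ChaCha20 block. The names
// below are hypothetical.
//
//     u8 one_time_key[32]; /* fresh, secret, never reused */
//     u8 mac[16];
//     crypto_poly1305(mac, msg, msg_size, one_time_key);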
451 ////////////////
452 /// BLAKE2 b ///
453 ////////////////
454 static const u64 iv[8] = {
455 0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
456 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
457 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
458 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
461 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
463 static const u8 sigma[12][16] = {
464 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
465 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
466 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
467 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
468 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
469 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
470 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
471 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
472 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
473 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
474 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
475 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
478 // increment input offset
479 u64 *x = ctx->input_offset;
480 size_t y = ctx->input_idx;
481 x[0] += y;
482 if (x[0] < y) {
483 x[1]++;
486 // init work vector
487 u64 v0 = ctx->hash[0]; u64 v8 = iv[0];
488 u64 v1 = ctx->hash[1]; u64 v9 = iv[1];
489 u64 v2 = ctx->hash[2]; u64 v10 = iv[2];
490 u64 v3 = ctx->hash[3]; u64 v11 = iv[3];
491 u64 v4 = ctx->hash[4]; u64 v12 = iv[4] ^ ctx->input_offset[0];
492 u64 v5 = ctx->hash[5]; u64 v13 = iv[5] ^ ctx->input_offset[1];
493 u64 v6 = ctx->hash[6]; u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
494 u64 v7 = ctx->hash[7]; u64 v15 = iv[7];
496 // mangle work vector
497 u64 *input = ctx->input;
498 #define BLAKE2_G(a, b, c, d, x, y) \
499 a += b + x; d = rotr64(d ^ a, 32); \
500 c += d; b = rotr64(b ^ c, 24); \
501 a += b + y; d = rotr64(d ^ a, 16); \
502 c += d; b = rotr64(b ^ c, 63)
503 #define BLAKE2_ROUND(i) \
504 BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
505 BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
506 BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
507 BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
508 BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
509 BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
510 BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
511 BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])
513 #ifdef BLAKE2_NO_UNROLLING
514 FOR (i, 0, 12) {
515 BLAKE2_ROUND(i);
517 #else
518 BLAKE2_ROUND(0); BLAKE2_ROUND(1); BLAKE2_ROUND(2); BLAKE2_ROUND(3);
519 BLAKE2_ROUND(4); BLAKE2_ROUND(5); BLAKE2_ROUND(6); BLAKE2_ROUND(7);
520 BLAKE2_ROUND(8); BLAKE2_ROUND(9); BLAKE2_ROUND(10); BLAKE2_ROUND(11);
521 #endif
523 // update hash
524 ctx->hash[0] ^= v0 ^ v8; ctx->hash[1] ^= v1 ^ v9;
525 ctx->hash[2] ^= v2 ^ v10; ctx->hash[3] ^= v3 ^ v11;
526 ctx->hash[4] ^= v4 ^ v12; ctx->hash[5] ^= v5 ^ v13;
527 ctx->hash[6] ^= v6 ^ v14; ctx->hash[7] ^= v7 ^ v15;
530 void crypto_blake2b_keyed_init(crypto_blake2b_ctx *ctx, size_t hash_size,
531 const u8 *key, size_t key_size)
533 // initial hash
534 COPY(ctx->hash, iv, 8);
535 ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
537 ctx->input_offset[0] = 0; // beginning of the input, no offset
538 ctx->input_offset[1] = 0; // beginning of the input, no offset
539 ctx->hash_size = hash_size;
540 ctx->input_idx = 0;
541 ZERO(ctx->input, 16);
543 // if there is a key, the first block is that key (padded with zeroes)
544 if (key_size > 0) {
545 u8 key_block[128] = {0};
546 COPY(key_block, key, key_size);
547 // same as calling crypto_blake2b_update(ctx, key_block , 128)
548 load64_le_buf(ctx->input, key_block, 16);
549 ctx->input_idx = 128;
553 void crypto_blake2b_init(crypto_blake2b_ctx *ctx, size_t hash_size)
555 crypto_blake2b_keyed_init(ctx, hash_size, 0, 0);
558 void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
559 const u8 *message, size_t message_size)
561 // Avoid undefined NULL pointer increments with empty messages
562 if (message_size == 0) {
563 return;
566 // Align with word boundaries
567 if ((ctx->input_idx & 7) != 0) {
568 size_t nb_bytes = MC_MIN(gap(ctx->input_idx, 8), message_size);
569 size_t word = ctx->input_idx >> 3;
570 size_t byte = ctx->input_idx & 7;
571 FOR (i, 0, nb_bytes) {
572 ctx->input[word] |= (u64)message[i] << ((byte + i) << 3);
574 ctx->input_idx += nb_bytes;
575 message += nb_bytes;
576 message_size -= nb_bytes;
579 // Align with block boundaries (faster than byte by byte)
580 if ((ctx->input_idx & 127) != 0) {
581 size_t nb_words = MC_MIN(gap(ctx->input_idx, 128), message_size) >> 3;
582 load64_le_buf(ctx->input + (ctx->input_idx >> 3), message, nb_words);
583 ctx->input_idx += nb_words << 3;
584 message += nb_words << 3;
585 message_size -= nb_words << 3;
588 // Process block by block
589 size_t nb_blocks = message_size >> 7;
590 FOR (i, 0, nb_blocks) {
591 if (ctx->input_idx == 128) {
592 blake2b_compress(ctx, 0);
594 load64_le_buf(ctx->input, message, 16);
595 message += 128;
596 ctx->input_idx = 128;
598 message_size &= 127;
600 if (message_size != 0) {
601 // Compress block & flush input buffer as needed
602 if (ctx->input_idx == 128) {
603 blake2b_compress(ctx, 0);
604 ctx->input_idx = 0;
606 if (ctx->input_idx == 0) {
607 ZERO(ctx->input, 16);
609 // Fill remaining words (faster than byte by byte)
610 size_t nb_words = message_size >> 3;
611 load64_le_buf(ctx->input, message, nb_words);
612 ctx->input_idx += nb_words << 3;
613 message += nb_words << 3;
614 message_size -= nb_words << 3;
616 // Fill remaining bytes
617 FOR (i, 0, message_size) {
618 size_t word = ctx->input_idx >> 3;
619 size_t byte = ctx->input_idx & 7;
620 ctx->input[word] |= (u64)message[i] << (byte << 3);
621 ctx->input_idx++;
626 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
628 blake2b_compress(ctx, 1); // compress the last block
629 size_t hash_size = MC_MIN(ctx->hash_size, 64);
630 size_t nb_words = hash_size >> 3;
631 store64_le_buf(hash, ctx->hash, nb_words);
632 FOR (i, nb_words << 3, hash_size) {
633 hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
635 WIPE_CTX(ctx);
638 void crypto_blake2b_keyed(u8 *hash, size_t hash_size,
639 const u8 *key, size_t key_size,
640 const u8 *message, size_t message_size)
642 crypto_blake2b_ctx ctx;
643 crypto_blake2b_keyed_init(&ctx, hash_size, key, key_size);
644 crypto_blake2b_update (&ctx, message, message_size);
645 crypto_blake2b_final (&ctx, hash);
648 void crypto_blake2b(u8 *hash, size_t hash_size, const u8 *msg, size_t msg_size)
650 crypto_blake2b_keyed(hash, hash_size, 0, 0, msg, msg_size);
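// Illustrative usage sketch (not part of Monocypher): hashing a message
// in one shot or incrementally. Variable names are hypothetical.
//
//     u8 digest[64];
//     crypto_blake2b(digest, sizeof(digest), msg, msg_size);
//
//     crypto_blake2b_ctx ctx;
//     crypto_blake2b_init  (&ctx, 64);
//     crypto_blake2b_update(&ctx, msg, msg_size);
//     crypto_blake2b_final (&ctx, digest);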
653 //////////////
654 /// Argon2 ///
655 //////////////
656 // references to R, Z, Q etc. come from the spec
658 // Argon2 operates on 1024 byte blocks.
659 typedef struct { u64 a[128]; } blk;
661 // updates a BLAKE2 hash with a 32 bit word, little endian.
662 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
664 u8 buf[4];
665 store32_le(buf, input);
666 crypto_blake2b_update(ctx, buf, 4);
667 WIPE_BUFFER(buf);
670 static void blake_update_32_buf(crypto_blake2b_ctx *ctx,
671 const u8 *buf, u32 size)
673 blake_update_32(ctx, size);
674 crypto_blake2b_update(ctx, buf, size);
678 static void copy_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] = in->a[i];}
679 static void xor_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] ^= in->a[i];}
681 // Hash with a virtually unlimited digest size.
682 // Doesn't extract more entropy than the base hash function.
683 // Mainly used for filling a whole kilobyte block with pseudo-random bytes.
684 // (One could use a stream cipher with a seed hash as the key, but
685 // this would introduce another dependency, and another point of failure.)
686 static void extended_hash(u8 *digest, u32 digest_size,
687 const u8 *input , u32 input_size)
689 crypto_blake2b_ctx ctx;
690 crypto_blake2b_init (&ctx, MC_MIN(digest_size, 64));
691 blake_update_32 (&ctx, digest_size);
692 crypto_blake2b_update(&ctx, input, input_size);
693 crypto_blake2b_final (&ctx, digest);
695 if (digest_size > 64) {
696 // the conversion to u64 avoids integer overflow on
697 // ludicrously big hash sizes.
698 u32 r = (u32)(((u64)digest_size + 31) >> 5) - 2;
699 u32 i = 1;
700 u32 in = 0;
701 u32 out = 32;
702 while (i < r) {
703 // Input and output overlap. This is intentional
704 crypto_blake2b(digest + out, 64, digest + in, 64);
705 i += 1;
706 in += 32;
707 out += 32;
709 crypto_blake2b(digest + out, digest_size - (32 * r), digest + in , 64);
713 #define LSB(x) ((u64)(u32)x)
714 #define G(a, b, c, d) \
715 a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 32); \
716 c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 24); \
717 a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 16); \
718 c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 63)
719 #define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \
720 v8, v9, v10, v11, v12, v13, v14, v15) \
721 G(v0, v4, v8, v12); G(v1, v5, v9, v13); \
722 G(v2, v6, v10, v14); G(v3, v7, v11, v15); \
723 G(v0, v5, v10, v15); G(v1, v6, v11, v12); \
724 G(v2, v7, v8, v13); G(v3, v4, v9, v14)
726 // Core of the compression function G. Computes Z from R in place.
727 static void g_rounds(blk *b)
729 // column rounds (work_block = Q)
730 for (int i = 0; i < 128; i += 16) {
731 ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 2], b->a[i+ 3],
732 b->a[i+ 4], b->a[i+ 5], b->a[i+ 6], b->a[i+ 7],
733 b->a[i+ 8], b->a[i+ 9], b->a[i+10], b->a[i+11],
734 b->a[i+12], b->a[i+13], b->a[i+14], b->a[i+15]);
736 // row rounds (b = Z)
737 for (int i = 0; i < 16; i += 2) {
738 ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 16], b->a[i+ 17],
739 b->a[i+32], b->a[i+33], b->a[i+ 48], b->a[i+ 49],
740 b->a[i+64], b->a[i+65], b->a[i+ 80], b->a[i+ 81],
741 b->a[i+96], b->a[i+97], b->a[i+112], b->a[i+113]);
745 const crypto_argon2_extras crypto_argon2_no_extras = { 0, 0, 0, 0 };
747 void crypto_argon2(u8 *hash, u32 hash_size, void *work_area,
748 crypto_argon2_config config,
749 crypto_argon2_inputs inputs,
750 crypto_argon2_extras extras)
752 const u32 segment_size = config.nb_blocks / config.nb_lanes / 4;
753 const u32 lane_size = segment_size * 4;
754 const u32 nb_blocks = lane_size * config.nb_lanes; // rounding down
756 // work area seen as blocks (must be suitably aligned)
757 blk *blocks = (blk*)work_area;
759 u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
760 crypto_blake2b_ctx ctx;
761 crypto_blake2b_init (&ctx, 64);
762 blake_update_32 (&ctx, config.nb_lanes ); // p: number of "threads"
763 blake_update_32 (&ctx, hash_size);
764 blake_update_32 (&ctx, config.nb_blocks);
765 blake_update_32 (&ctx, config.nb_passes);
766 blake_update_32 (&ctx, 0x13); // v: version number
767 blake_update_32 (&ctx, config.algorithm); // y: Argon2i, Argon2d...
768 blake_update_32_buf (&ctx, inputs.pass, inputs.pass_size);
769 blake_update_32_buf (&ctx, inputs.salt, inputs.salt_size);
770 blake_update_32_buf (&ctx, extras.key, extras.key_size);
771 blake_update_32_buf (&ctx, extras.ad, extras.ad_size);
772 crypto_blake2b_final(&ctx, initial_hash); // fill 64 first bytes only
774 // fill first 2 blocks of each lane
775 u8 hash_area[1024];
776 FOR_T(u32, l, 0, config.nb_lanes) {
777 FOR_T(u32, i, 0, 2) {
778 store32_le(initial_hash + 64, i); // first additional word
779 store32_le(initial_hash + 68, l); // second additional word
780 extended_hash(hash_area, 1024, initial_hash, 72);
781 load64_le_buf(blocks[l * lane_size + i].a, hash_area, 128);
785 WIPE_BUFFER(initial_hash);
786 WIPE_BUFFER(hash_area);
789 // Argon2i and Argon2id start with constant time indexing
790 int constant_time = config.algorithm != CRYPTO_ARGON2_D;
792 // Fill (and re-fill) the rest of the blocks
794 // Note: even though each segment within the same slice can be
795 // computed in parallel, (one thread per lane), we are computing
796 // them sequentially, because Monocypher doesn't support threads.
798 // Yet optimal performance (and therefore security) requires one
799 // thread per lane. The only reason Monocypher supports multiple
800 // lanes is compatibility.
801 blk tmp;
802 FOR_T(u32, pass, 0, config.nb_passes) {
803 FOR_T(u32, slice, 0, 4) {
804 // On the first slice of the first pass,
805 // blocks 0 and 1 are already filled, hence pass_offset.
806 u32 pass_offset = pass == 0 && slice == 0 ? 2 : 0;
807 u32 slice_offset = slice * segment_size;
809 // Argon2id switches back to non-constant time indexing
810 // after the first two slices of the first pass
811 if (slice == 2 && config.algorithm == CRYPTO_ARGON2_ID) {
812 constant_time = 0;
815 // Each iteration of the following loop may be performed in
816 // a separate thread. All segments must be fully completed
817 // before we start filling the next slice.
818 FOR_T(u32, segment, 0, config.nb_lanes) {
819 blk index_block;
820 u32 index_ctr = 1;
821 FOR_T (u32, block, pass_offset, segment_size) {
822 // Current and previous blocks
823 u32 lane_offset = segment * lane_size;
824 blk *segment_start = blocks + lane_offset + slice_offset;
825 blk *current = segment_start + block;
826 blk *previous =
827 block == 0 && slice_offset == 0
828 ? segment_start + lane_size - 1
829 : segment_start + block - 1;
831 u64 index_seed;
832 if (constant_time) {
833 if (block == pass_offset || (block % 128) == 0) {
834 // Fill or refresh deterministic indices block
836 // seed the beginning of the block...
837 ZERO(index_block.a, 128);
838 index_block.a[0] = pass;
839 index_block.a[1] = segment;
840 index_block.a[2] = slice;
841 index_block.a[3] = nb_blocks;
842 index_block.a[4] = config.nb_passes;
843 index_block.a[5] = config.algorithm;
844 index_block.a[6] = index_ctr;
845 index_ctr++;
847 // ... then shuffle it
848 copy_block(&tmp, &index_block);
849 g_rounds (&index_block);
850 xor_block (&index_block, &tmp);
851 copy_block(&tmp, &index_block);
852 g_rounds (&index_block);
853 xor_block (&index_block, &tmp);
855 index_seed = index_block.a[block % 128];
856 } else {
857 index_seed = previous->a[0];
860 // Establish the reference set. *Approximately* comprises:
861 // - The last 3 slices (if they exist yet)
862 // - The already constructed blocks in the current segment
863 u32 next_slice = ((slice + 1) % 4) * segment_size;
864 u32 window_start = pass == 0 ? 0 : next_slice;
865 u32 nb_segments = pass == 0 ? slice : 3;
866 u64 lane =
867 pass == 0 && slice == 0
868 ? segment
869 : (index_seed >> 32) % config.nb_lanes;
870 u32 window_size =
871 nb_segments * segment_size +
872 (lane == segment ? block-1 :
873 block == 0 ? (u32)-1 : 0);
875 // Find reference block
876 u64 j1 = index_seed & 0xffffffff; // block selector
877 u64 x = (j1 * j1) >> 32;
878 u64 y = (window_size * x) >> 32;
879 u64 z = (window_size - 1) - y;
880 u64 ref = (window_start + z) % lane_size;
881 u32 index = lane * lane_size + (u32)ref;
882 blk *reference = blocks + index;
884 // Shuffle the previous & reference block
885 // into the current block
886 copy_block(&tmp, previous);
887 xor_block (&tmp, reference);
888 if (pass == 0) { copy_block(current, &tmp); }
889 else { xor_block (current, &tmp); }
890 g_rounds (&tmp);
891 xor_block (current, &tmp);
897 // Wipe temporary block
898 volatile u64* p = tmp.a;
899 ZERO(p, 128);
901 // XOR last blocks of each lane
902 blk *last_block = blocks + lane_size - 1;
903 FOR_T (u32, lane, 1, config.nb_lanes) {
904 blk *next_block = last_block + lane_size;
905 xor_block(next_block, last_block);
906 last_block = next_block;
909 // Serialize last block
910 u8 final_block[1024];
911 store64_le_buf(final_block, last_block->a, 128);
913 // Wipe work area
914 p = (u64*)work_area;
915 ZERO(p, 128 * nb_blocks);
917 // Hash the very last block with H' into the output hash
918 extended_hash(hash, hash_size, final_block, 1024);
919 WIPE_BUFFER(final_block);
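// Illustrative usage sketch (not part of Monocypher): hashing a password
// with Argon2id. The work area must provide nb_blocks kilobytes; the
// parameter values and variable names below are hypothetical examples,
// not recommendations.
//
//     crypto_argon2_config config;
//     config.algorithm = CRYPTO_ARGON2_ID;
//     config.nb_blocks = 100000;          /* ~100 MiB of work area   */
//     config.nb_passes = 3;
//     config.nb_lanes  = 1;
//     crypto_argon2_inputs inputs;
//     inputs.pass      = (const u8 *)password;
//     inputs.pass_size = password_size;
//     inputs.salt      = salt;            /* at least 8 bytes, unique */
//     inputs.salt_size = 16;
//     void *work_area = malloc((size_t)config.nb_blocks * 1024); /* <stdlib.h> */
//     u8 hash[32];
//     crypto_argon2(hash, sizeof(hash), work_area,
//                   config, inputs, crypto_argon2_no_extras);
//     free(work_area);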
922 ////////////////////////////////////
923 /// Arithmetic modulo 2^255 - 19 ///
924 ////////////////////////////////////
925 // Originally taken from SUPERCOP's ref10 implementation.
926 // A bit bigger than TweetNaCl, over 4 times faster.
928 // field element
929 typedef i32 fe[10];
931 // field constants
933 // fe_one : 1
934 // sqrtm1 : sqrt(-1)
935 // d : -121665 / 121666
936 // D2 : 2 * -121665 / 121666
937 // lop_x, lop_y: low order point in Edwards coordinates
938 // ufactor : -sqrt(-1) * 2
939 // A2 : 486662^2 (A squared)
940 static const fe fe_one = {1};
941 static const fe sqrtm1 = {
942 -32595792, -7943725, 9377950, 3500415, 12389472,
943 -272473, -25146209, -2005654, 326686, 11406482,
945 static const fe d = {
946 -10913610, 13857413, -15372611, 6949391, 114729,
947 -8787816, -6275908, -3247719, -18696448, -12055116,
949 static const fe D2 = {
950 -21827239, -5839606, -30745221, 13898782, 229458,
951 15978800, -12551817, -6495438, 29715968, 9444199,
953 static const fe lop_x = {
954 21352778, 5345713, 4660180, -8347857, 24143090,
955 14568123, 30185756, -12247770, -33528939, 8345319,
957 static const fe lop_y = {
958 -6952922, -1265500, 6862341, -7057498, -4037696,
959 -5447722, 31680899, -15325402, -19365852, 1569102,
961 static const fe ufactor = {
962 -1917299, 15887451, -18755900, -7000830, -24778944,
963 544946, -16816446, 4011309, -653372, 10741468,
965 static const fe A2 = {
966 12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,
969 static void fe_0(fe h) { ZERO(h , 10); }
970 static void fe_1(fe h) { h[0] = 1; ZERO(h+1, 9); }
972 static void fe_copy(fe h,const fe f ){FOR(i,0,10) h[i] = f[i]; }
973 static void fe_neg (fe h,const fe f ){FOR(i,0,10) h[i] = -f[i]; }
974 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
975 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
977 static void fe_cswap(fe f, fe g, int b)
979 i32 mask = -b; // -1 = 0xffffffff
980 FOR (i, 0, 10) {
981 i32 x = (f[i] ^ g[i]) & mask;
982 f[i] = f[i] ^ x;
983 g[i] = g[i] ^ x;
987 static void fe_ccopy(fe f, const fe g, int b)
989 i32 mask = -b; // -1 = 0xffffffff
990 FOR (i, 0, 10) {
991 i32 x = (f[i] ^ g[i]) & mask;
992 f[i] = f[i] ^ x;
997 // Signed carry propagation
998 // ------------------------
1000 // Let t be a number. It can be uniquely decomposed thus:
1002 // t = h*2^26 + l
1003 // such that -2^25 <= l < 2^25
1005 // Let c = (t + 2^25) / 2^26 (rounded down)
1006 // c = (h*2^26 + l + 2^25) / 2^26 (rounded down)
1007 // c = h + (l + 2^25) / 2^26 (rounded down)
1008 // c = h (exactly)
1009 // Because 0 <= l + 2^25 < 2^26
1011 // Let u = t - c*2^26
1012 // u = h*2^26 + l - h*2^26
1013 // u = l
1014 // Therefore, -2^25 <= u < 2^25
1016 // Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
1018 // Notations:
1019 // - In C, 1<<25 means 2^25.
1020 // - In C, x>>25 means floor(x / (2^25)).
1021 // - All of the above applies with 25 & 24 as well as 26 & 25.
1024 // Note on negative right shifts
1025 // -----------------------------
1027 // In C, x >> n, where x is a negative integer, is implementation
1028 // defined. In practice, all platforms do arithmetic shift, which is
1029 // equivalent to division by 2^26, rounded down. Some compilers, like
1030 // GCC, even guarantee it.
1032 // If we ever stumble upon a platform that does not propagate the sign
1033 // bit (we won't), visible failures will show at the slightest test, and
1034 // the signed shifts can be replaced by the following:
1036 // typedef struct { i64 x:39; } s25;
1037 // typedef struct { i64 x:38; } s26;
1038 // i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
1039 // i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
1041 // Current compilers cannot optimise this, causing a 30% drop in
1042 // performance. Fairly expensive for something that never happens.
1045 // Precondition
1046 // ------------
1048 // |t0| < 2^63
1049 // |t1|..|t9| < 2^62
1051 // Algorithm
1052 // ---------
1053 // c = t0 + 2^25 / 2^26 -- |c| <= 2^36
1054 // t0 -= c * 2^26 -- |t0| <= 2^25
1055 // t1 += c -- |t1| <= 2^63
1057 // c = t4 + 2^25 / 2^26 -- |c| <= 2^36
1058 // t4 -= c * 2^26 -- |t4| <= 2^25
1059 // t5 += c -- |t5| <= 2^63
1061 // c = t1 + 2^24 / 2^25 -- |c| <= 2^38
1062 // t1 -= c * 2^25 -- |t1| <= 2^24
1063 // t2 += c -- |t2| <= 2^63
1065 // c = t5 + 2^24 / 2^25 -- |c| <= 2^38
1066 // t5 -= c * 2^25 -- |t5| <= 2^24
1067 // t6 += c -- |t6| <= 2^63
1069 // c = t2 + 2^25 / 2^26 -- |c| <= 2^37
1070 // t2 -= c * 2^26 -- |t2| <= 2^25 < 1.1 * 2^25 (final t2)
1071 // t3 += c -- |t3| <= 2^63
1073 // c = t6 + 2^25 / 2^26 -- |c| <= 2^37
1074 // t6 -= c * 2^26 -- |t6| <= 2^25 < 1.1 * 2^25 (final t6)
1075 // t7 += c -- |t7| <= 2^63
1077 // c = t3 + 2^24 / 2^25 -- |c| <= 2^38
1078 // t3 -= c * 2^25 -- |t3| <= 2^24 < 1.1 * 2^24 (final t3)
1079 // t4 += c -- |t4| <= 2^25 + 2^38 < 2^39
1081 // c = t7 + 2^24 / 2^25 -- |c| <= 2^38
1082 // t7 -= c * 2^25 -- |t7| <= 2^24 < 1.1 * 2^24 (final t7)
1083 // t8 += c -- |t8| <= 2^63
1085 // c = t4 + 2^25 / 2^26 -- |c| <= 2^13
1086 // t4 -= c * 2^26 -- |t4| <= 2^25 < 1.1 * 2^25 (final t4)
1087 // t5 += c -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24 (final t5)
1089 // c = t8 + 2^25 / 2^26 -- |c| <= 2^37
1090 // t8 -= c * 2^26 -- |t8| <= 2^25 < 1.1 * 2^25 (final t8)
1091 // t9 += c -- |t9| <= 2^63
1093 // c = t9 + 2^24 / 2^25 -- |c| <= 2^38
1094 // t9 -= c * 2^25 -- |t9| <= 2^24 < 1.1 * 2^24 (final t9)
1095 // t0 += c * 19 -- |t0| <= 2^25 + 2^38*19 < 2^44
1097 // c = t0 + 2^25 / 2^26 -- |c| <= 2^18
1098 // t0 -= c * 2^26 -- |t0| <= 2^25 < 1.1 * 2^25 (final t0)
1099 // t1 += c -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24 (final t1)
1101 // Postcondition
1102 // -------------
1103 // |t0|, |t2|, |t4|, |t6|, |t8| < 1.1 * 2^25
1104 // |t1|, |t3|, |t5|, |t7|, |t9| < 1.1 * 2^24
1105 #define FE_CARRY \
1106 i64 c; \
1107 c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
1108 c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
1109 c = (t1 + ((i64)1<<24)) >> 25; t1 -= c * ((i64)1 << 25); t2 += c; \
1110 c = (t5 + ((i64)1<<24)) >> 25; t5 -= c * ((i64)1 << 25); t6 += c; \
1111 c = (t2 + ((i64)1<<25)) >> 26; t2 -= c * ((i64)1 << 26); t3 += c; \
1112 c = (t6 + ((i64)1<<25)) >> 26; t6 -= c * ((i64)1 << 26); t7 += c; \
1113 c = (t3 + ((i64)1<<24)) >> 25; t3 -= c * ((i64)1 << 25); t4 += c; \
1114 c = (t7 + ((i64)1<<24)) >> 25; t7 -= c * ((i64)1 << 25); t8 += c; \
1115 c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
1116 c = (t8 + ((i64)1<<25)) >> 26; t8 -= c * ((i64)1 << 26); t9 += c; \
1117 c = (t9 + ((i64)1<<24)) >> 25; t9 -= c * ((i64)1 << 25); t0 += c * 19; \
1118 c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
1119 h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
1120 h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
1122 // Decodes a field element from a byte buffer.
1123 // mask specifies how many bits we ignore.
1124 // Traditionally we ignore 1. It's useful for EdDSA,
1125 // which uses that bit to denote the sign of x.
1126 // Elligator however uses positive representatives,
1127 // which means ignoring 2 bits instead.
1128 static void fe_frombytes_mask(fe h, const u8 s[32], unsigned nb_mask)
1130 u32 mask = 0xffffff >> nb_mask;
1131 i64 t0 = load32_le(s); // t0 < 2^32
1132 i64 t1 = load24_le(s + 4) << 6; // t1 < 2^30
1133 i64 t2 = load24_le(s + 7) << 5; // t2 < 2^29
1134 i64 t3 = load24_le(s + 10) << 3; // t3 < 2^27
1135 i64 t4 = load24_le(s + 13) << 2; // t4 < 2^26
1136 i64 t5 = load32_le(s + 16); // t5 < 2^32
1137 i64 t6 = load24_le(s + 20) << 7; // t6 < 2^31
1138 i64 t7 = load24_le(s + 23) << 5; // t7 < 2^29
1139 i64 t8 = load24_le(s + 26) << 4; // t8 < 2^28
1140 i64 t9 = (load24_le(s + 29) & mask) << 2; // t9 < 2^25
1141 FE_CARRY; // Carry precondition OK
1144 static void fe_frombytes(fe h, const u8 s[32])
1146 fe_frombytes_mask(h, s, 1);
1150 // Precondition
1151 // |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]| < 1.1 * 2^25
1152 // |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]| < 1.1 * 2^24
1154 // Therefore, |h| < 2^255-19
1155 // There are two possibilities:
1157 // - If h is positive, all we need to do is reduce its individual
1158 // limbs down to their tight positive range.
1159 // - If h is negative, we also need to add 2^255-19 to it.
1160 // Or just remove 19 and chop off any excess bit.
1161 static void fe_tobytes(u8 s[32], const fe h)
1163 i32 t[10];
1164 COPY(t, h, 10);
1165 i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
1166 // |t9| < 1.1 * 2^24
1167 // -1.1 * 2^24 < t9 < 1.1 * 2^24
1168 // -21 * 2^24 < 19 * t9 < 21 * 2^24
1169 // -2^29 < 19 * t9 + 2^24 < 2^29
1170 // -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
1171 // -16 < (19 * t9 + 2^24) / 2^25 < 16
1172 FOR (i, 0, 5) {
1173 q += t[2*i ]; q >>= 26; // q = 0 or -1
1174 q += t[2*i+1]; q >>= 25; // q = 0 or -1
1176 // q = 0 iff h >= 0
1177 // q = -1 iff h < 0
1178 // Adding q * 19 to h reduces h to its proper range.
1179 q *= 19; // Shift carry back to the beginning
1180 FOR (i, 0, 5) {
1181 t[i*2 ] += q; q = t[i*2 ] >> 26; t[i*2 ] -= q * ((i32)1 << 26);
1182 t[i*2+1] += q; q = t[i*2+1] >> 25; t[i*2+1] -= q * ((i32)1 << 25);
1184 // h is now fully reduced, and q represents the excess bit.
1186 store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26));
1187 store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19));
1188 store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
1189 store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6));
1190 store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25));
1191 store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19));
1192 store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
1193 store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6));
1195 WIPE_BUFFER(t);
1198 // Precondition
1199 // -------------
1200 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1201 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1203 // |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
1204 // |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
1205 static void fe_mul_small(fe h, const fe f, i32 g)
1207 i64 t0 = f[0] * (i64) g; i64 t1 = f[1] * (i64) g;
1208 i64 t2 = f[2] * (i64) g; i64 t3 = f[3] * (i64) g;
1209 i64 t4 = f[4] * (i64) g; i64 t5 = f[5] * (i64) g;
1210 i64 t6 = f[6] * (i64) g; i64 t7 = f[7] * (i64) g;
1211 i64 t8 = f[8] * (i64) g; i64 t9 = f[9] * (i64) g;
1212 // |t0|, |t2|, |t4|, |t6|, |t8| < 1.65 * 2^26 * 2^31 < 2^58
1213 // |t1|, |t3|, |t5|, |t7|, |t9| < 1.65 * 2^25 * 2^31 < 2^57
1215 FE_CARRY; // Carry precondition OK
1218 // Precondition
1219 // -------------
1220 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1221 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1223 // |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
1224 // |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
1225 static void fe_mul(fe h, const fe f, const fe g)
1227 // Everything is unrolled and put in temporary variables.
1228 // We could roll the loop, but that would make curve25519 twice as slow.
1229 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1230 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1231 i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
1232 i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
1233 i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
1234 i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19;
1235 i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19;
1236 i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19;
1237 // |F1|, |F3|, |F5|, |F7|, |F9| < 1.65 * 2^26
1238 // |G0|, |G2|, |G4|, |G6|, |G8| < 2^31
1239 // |G1|, |G3|, |G5|, |G7|, |G9| < 2^30
1241 i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
1242 + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
1243 i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
1244 + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
1245 i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
1246 + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
1247 i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
1248 + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
1249 i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
1250 + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
1251 i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
1252 + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
1253 i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
1254 + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
1255 i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
1256 + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
1257 i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
1258 + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
1259 i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
1260 + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
1261 // t0 < 0.67 * 2^61
1262 // t1 < 0.41 * 2^61
1263 // t2 < 0.52 * 2^61
1264 // t3 < 0.32 * 2^61
1265 // t4 < 0.38 * 2^61
1266 // t5 < 0.22 * 2^61
1267 // t6 < 0.23 * 2^61
1268 // t7 < 0.13 * 2^61
1269 // t8 < 0.09 * 2^61
1270 // t9 < 0.03 * 2^61
1272 FE_CARRY; // Everything below 2^62, Carry precondition OK
1275 // Precondition
1276 // -------------
1277 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1278 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1280 // Note: we could use fe_mul() for this, but this is significantly faster
1281 static void fe_sq(fe h, const fe f)
1283 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1284 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1285 i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2;
1286 i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2;
1287 i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
1288 i32 f8_19 = f8*19; i32 f9_38 = f9*38;
1289 // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2| < 1.65 * 2^27
1290 // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2| < 1.65 * 2^26
1291 // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| < 2^31
1293 i64 t0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
1294 + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38;
1295 i64 t1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19
1296 + f4 *(i64)f7_38 + f5_2*(i64)f6_19;
1297 i64 t2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38
1298 + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19;
1299 i64 t3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38
1300 + f5_2*(i64)f8_19 + f6 *(i64)f7_38;
1301 i64 t4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2
1302 + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38;
1303 i64 t5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3
1304 + f6 *(i64)f9_38 + f7_2*(i64)f8_19;
1305 i64 t6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4
1306 + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19;
1307 i64 t7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5
1308 + f3_2*(i64)f4 + f8 *(i64)f9_38;
1309 i64 t8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6
1310 + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38;
1311 i64 t9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7
1312 + f3_2*(i64)f6 + f4 *(i64)f5_2;
1313 // t0 < 0.67 * 2^61
1314 // t1 < 0.41 * 2^61
1315 // t2 < 0.52 * 2^61
1316 // t3 < 0.32 * 2^61
1317 // t4 < 0.38 * 2^61
1318 // t5 < 0.22 * 2^61
1319 // t6 < 0.23 * 2^61
1320 // t7 < 0.13 * 2^61
1321 // t8 < 0.09 * 2^61
1322 // t9 < 0.03 * 2^61
1324 FE_CARRY;
1327 // Parity check. Returns 0 if even, 1 if odd
1328 static int fe_isodd(const fe f)
1330 u8 s[32];
1331 fe_tobytes(s, f);
1332 u8 isodd = s[0] & 1;
1333 WIPE_BUFFER(s);
1334 return isodd;
1337 // Returns 1 if equal, 0 if not equal
1338 static int fe_isequal(const fe f, const fe g)
1340 u8 fs[32];
1341 u8 gs[32];
1342 fe_tobytes(fs, f);
1343 fe_tobytes(gs, g);
1344 int isdifferent = crypto_verify32(fs, gs);
1345 WIPE_BUFFER(fs);
1346 WIPE_BUFFER(gs);
1347 return 1 + isdifferent;
1350 // Inverse square root.
1351 // Returns true if x is a square, false otherwise.
1352 // After the call:
1353 // isr = sqrt(1/x) if x is a non-zero square.
1354 // isr = sqrt(sqrt(-1)/x) if x is not a square.
1355 // isr = 0 if x is zero.
1356 // We do not guarantee the sign of the square root.
1358 // Notes:
1359 // Let quartic = x^((p-1)/4)
1361 // x^((p-1)/2) = chi(x)
1362 // quartic^2 = chi(x)
1363 // quartic = sqrt(chi(x))
1364 // quartic = 1 or -1 or sqrt(-1) or -sqrt(-1)
1366 // Note that x is a square if quartic is 1 or -1
1367 // There are 4 cases to consider:
1369 // if quartic = 1 (x is a square)
1370 // then x^((p-1)/4) = 1
1371 // x^((p-5)/4) * x = 1
1372 // x^((p-5)/4) = 1/x
1373 // x^((p-5)/8) = sqrt(1/x) or -sqrt(1/x)
1375 // if quartic = -1 (x is a square)
1376 // then x^((p-1)/4) = -1
1377 // x^((p-5)/4) * x = -1
1378 // x^((p-5)/4) = -1/x
1379 // x^((p-5)/8) = sqrt(-1) / sqrt(x)
1380 // x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x)
1381 // x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
1382 // x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
1384 // if quartic = sqrt(-1) (x is not a square)
1385 // then x^((p-1)/4) = sqrt(-1)
1386 // x^((p-5)/4) * x = sqrt(-1)
1387 // x^((p-5)/4) = sqrt(-1)/x
1388 // x^((p-5)/8) = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
1390 // Note that the product of two non-squares is always a square:
1391 // For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
1392 // Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
1393 // Therefore a*b is a square.
1395 // Since sqrt(-1) and x are both non-squares, their product is a
1396 // square, and we can compute their square root.
1398 // if quartic = -sqrt(-1) (x is not a square)
1399 // then x^((p-1)/4) = -sqrt(-1)
1400 // x^((p-5)/4) * x = -sqrt(-1)
1401 // x^((p-5)/4) = -sqrt(-1)/x
1402 // x^((p-5)/8) = sqrt(-sqrt(-1)/x)
1403 // x^((p-5)/8) = sqrt( sqrt(-1)/x) * sqrt(-1)
1404 // x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
1405 // x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
1406 // x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
1407 static int invsqrt(fe isr, const fe x)
1409 fe t0, t1, t2;
1411 // t0 = x^((p-5)/8)
1412 // Can be achieved with a simple double & add ladder,
1413 // but it would be slower.
1414 fe_sq(t0, x);
1415 fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, x, t1);
1416 fe_mul(t0, t0, t1);
1417 fe_sq(t0, t0); fe_mul(t0, t1, t0);
1418 fe_sq(t1, t0); FOR (i, 1, 5) { fe_sq(t1, t1); } fe_mul(t0, t1, t0);
1419 fe_sq(t1, t0); FOR (i, 1, 10) { fe_sq(t1, t1); } fe_mul(t1, t1, t0);
1420 fe_sq(t2, t1); FOR (i, 1, 20) { fe_sq(t2, t2); } fe_mul(t1, t2, t1);
1421 fe_sq(t1, t1); FOR (i, 1, 10) { fe_sq(t1, t1); } fe_mul(t0, t1, t0);
1422 fe_sq(t1, t0); FOR (i, 1, 50) { fe_sq(t1, t1); } fe_mul(t1, t1, t0);
1423 fe_sq(t2, t1); FOR (i, 1, 100) { fe_sq(t2, t2); } fe_mul(t1, t2, t1);
1424 fe_sq(t1, t1); FOR (i, 1, 50) { fe_sq(t1, t1); } fe_mul(t0, t1, t0);
1425 fe_sq(t0, t0); FOR (i, 1, 2) { fe_sq(t0, t0); } fe_mul(t0, t0, x);
1427 // quartic = x^((p-1)/4)
1428 i32 *quartic = t1;
1429 fe_sq (quartic, t0);
1430 fe_mul(quartic, quartic, x);
1432 i32 *check = t2;
1433 fe_0 (check); int z0 = fe_isequal(x , check);
1434 fe_1 (check); int p1 = fe_isequal(quartic, check);
1435 fe_neg(check, check ); int m1 = fe_isequal(quartic, check);
1436 fe_neg(check, sqrtm1); int ms = fe_isequal(quartic, check);
1438 // if quartic == -1 or sqrt(-1)
1439 // then isr = x^((p-5)/8) * sqrt(-1)
1440 // else isr = x^((p-5)/8)
1441 fe_mul(isr, t0, sqrtm1);
1442 fe_ccopy(isr, t0, 1 - (m1 | ms));
1444 WIPE_BUFFER(t0);
1445 WIPE_BUFFER(t1);
1446 WIPE_BUFFER(t2);
1447 return p1 | m1 | z0;
1450 // Inverse in terms of inverse square root.
1451 // Requires two additional squarings to get rid of the sign.
1453 // 1/x = x * (+invsqrt(x^2))^2
1454 // = x * (-invsqrt(x^2))^2
1456 // A fully optimised exponentiation by p-1 would save 6 field
1457 // multiplications, but it would require more code.
1458 static void fe_invert(fe out, const fe x)
1460 fe tmp;
1461 fe_sq(tmp, x);
1462 invsqrt(tmp, tmp);
1463 fe_sq(tmp, tmp);
1464 fe_mul(out, tmp, x);
1465 WIPE_BUFFER(tmp);
1468 // trim a scalar for scalar multiplication
1469 void crypto_eddsa_trim_scalar(u8 out[32], const u8 in[32])
1471 COPY(out, in, 32);
1472 out[ 0] &= 248;
1473 out[31] &= 127;
1474 out[31] |= 64;
1477 // get bit from scalar at position i
1478 static int scalar_bit(const u8 s[32], int i)
1480 if (i < 0) { return 0; } // handle -1 for sliding windows
1481 return (s[i>>3] >> (i&7)) & 1;
1484 ///////////////
1485 /// X-25519 /// Taken from SUPERCOP's ref10 implementation.
1486 ///////////////
1487 static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
1488 int nb_bits)
1490 // computes the scalar product
1491 fe x1;
1492 fe_frombytes(x1, p);
1494 // computes the actual scalar product (the result is in x2 and z2)
1495 fe x2, z2, x3, z3, t0, t1;
1496 // Montgomery ladder
1497 // In projective coordinates, to avoid divisions: x = X / Z
1498 // We don't care about the y coordinate, it's only 1 bit of information
1499 fe_1(x2); fe_0(z2); // "zero" point
1500 fe_copy(x3, x1); fe_1(z3); // "one" point
1501 int swap = 0;
1502 for (int pos = nb_bits-1; pos >= 0; --pos) {
1503 // constant time conditional swap before ladder step
1504 int b = scalar_bit(scalar, pos);
1505 swap ^= b; // xor trick avoids swapping at the end of the loop
1506 fe_cswap(x2, x3, swap);
1507 fe_cswap(z2, z3, swap);
1508 swap = b; // anticipates one last swap after the loop
1510 // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
1511 // with differential addition
1512 fe_sub(t0, x3, z3);
1513 fe_sub(t1, x2, z2);
1514 fe_add(x2, x2, z2);
1515 fe_add(z2, x3, z3);
1516 fe_mul(z3, t0, x2);
1517 fe_mul(z2, z2, t1);
1518 fe_sq (t0, t1 );
1519 fe_sq (t1, x2 );
1520 fe_add(x3, z3, z2);
1521 fe_sub(z2, z3, z2);
1522 fe_mul(x2, t1, t0);
1523 fe_sub(t1, t1, t0);
1524 fe_sq (z2, z2 );
1525 fe_mul_small(z3, t1, 121666);
1526 fe_sq (x3, x3 );
1527 fe_add(t0, t0, z3);
1528 fe_mul(z3, x1, z2);
1529 fe_mul(z2, t1, t0);
1531 // last swap is necessary to compensate for the xor trick
1532 // Note: after this swap, P3 == P2 + P1.
1533 fe_cswap(x2, x3, swap);
1534 fe_cswap(z2, z3, swap);
1536 // normalises the coordinates: x == X / Z
1537 fe_invert(z2, z2);
1538 fe_mul(x2, x2, z2);
1539 fe_tobytes(q, x2);
1541 WIPE_BUFFER(x1);
1542 WIPE_BUFFER(x2); WIPE_BUFFER(z2); WIPE_BUFFER(t0);
1543 WIPE_BUFFER(x3); WIPE_BUFFER(z3); WIPE_BUFFER(t1);
1546 void crypto_x25519(u8 raw_shared_secret[32],
1547 const u8 your_secret_key [32],
1548 const u8 their_public_key [32])
1550 // restrict the possible scalar values
1551 u8 e[32];
1552 crypto_eddsa_trim_scalar(e, your_secret_key);
1553 scalarmult(raw_shared_secret, e, their_public_key, 255);
1554 WIPE_BUFFER(e);
1557 void crypto_x25519_public_key(u8 public_key[32],
1558 const u8 secret_key[32])
1560 static const u8 base_point[32] = {9};
1561 crypto_x25519(public_key, secret_key, base_point);
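// Illustrative usage sketch (not part of Monocypher): a basic X25519 key
// exchange. Each side feeds 32 random secret bytes into the functions
// above; both arrive at the same raw shared secret, which should be
// hashed before use as a session key. Variable names are hypothetical.
//
//     u8 alice_sk[32], alice_pk[32];  /* alice_sk from a CSPRNG */
//     u8 bob_sk  [32], bob_pk  [32];  /* bob_sk   from a CSPRNG */
//     crypto_x25519_public_key(alice_pk, alice_sk);
//     crypto_x25519_public_key(bob_pk,   bob_sk);
//     u8 shared_a[32], shared_b[32];  /* shared_a == shared_b   */
//     crypto_x25519(shared_a, alice_sk, bob_pk);
//     crypto_x25519(shared_b, bob_sk,   alice_pk);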
1564 ///////////////////////////
1565 /// Arithmetic modulo L ///
1566 ///////////////////////////
1567 static const u32 L[8] = {
1568 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
1569 0x00000000, 0x00000000, 0x00000000, 0x10000000,
1572 // p = a*b + p
1573 static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
1575 FOR (i, 0, 8) {
1576 u64 carry = 0;
1577 FOR (j, 0, 8) {
1578 carry += p[i+j] + (u64)a[i] * b[j];
1579 p[i+j] = (u32)carry;
1580 carry >>= 32;
1582 p[i+8] = (u32)carry;
1586 static int is_above_l(const u32 x[8])
1588 // We work with L directly, in a 2's complement encoding
1589 // (-L == ~L + 1)
1590 u64 carry = 1;
1591 FOR (i, 0, 8) {
1592 carry += (u64)x[i] + (~L[i] & 0xffffffff);
1593 carry >>= 32;
1595 return (int)carry; // carry is either 0 or 1
1598 // Final reduction modulo L, by conditionally removing L.
1599 // if x < l , then r = x
1600 // if l <= x < 2*l, then r = x-l
1601 // otherwise the result will be wrong
1602 static void remove_l(u32 r[8], const u32 x[8])
1604 u64 carry = (u64)is_above_l(x);
1605 u32 mask = ~(u32)carry + 1; // carry == 0 or 1
1606 FOR (i, 0, 8) {
1607 carry += (u64)x[i] + (~L[i] & mask);
1608 r[i] = (u32)carry;
1609 carry >>= 32;
1613 // Full reduction modulo L (Barrett reduction)
1614 static void mod_l(u8 reduced[32], const u32 x[16])
1616 static const u32 r[9] = {
1617 0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
1618 0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,
1620 // xr = x * r
1621 u32 xr[25] = {0};
1622 FOR (i, 0, 9) {
1623 u64 carry = 0;
1624 FOR (j, 0, 16) {
1625 carry += xr[i+j] + (u64)r[i] * x[j];
1626 xr[i+j] = (u32)carry;
1627 carry >>= 32;
1629 xr[i+16] = (u32)carry;
1631 // xr = floor(xr / 2^512) * L
1632 // Since the result is guaranteed to be below 2*L,
1633 // it is enough to only compute the first 256 bits.
1634 // The division by 2^512 is performed by simply reading xr[i+16]. (16 * 32 = 512)
1635 ZERO(xr, 8);
1636 FOR (i, 0, 8) {
1637 u64 carry = 0;
1638 FOR (j, 0, 8-i) {
1639 carry += xr[i+j] + (u64)xr[i+16] * L[j];
1640 xr[i+j] = (u32)carry;
1641 carry >>= 32;
1644 // xr = x - xr
1645 u64 carry = 1;
1646 FOR (i, 0, 8) {
1647 carry += (u64)x[i] + (~xr[i] & 0xffffffff);
1648 xr[i] = (u32)carry;
1649 carry >>= 32;
1651 // Final reduction modulo L (conditional subtraction)
1652 remove_l(xr, xr);
1653 store32_le_buf(reduced, xr, 8);
1655 WIPE_BUFFER(xr);
1658 void crypto_eddsa_reduce(u8 reduced[32], const u8 expanded[64])
1660 u32 x[16];
1661 load32_le_buf(x, expanded, 16);
1662 mod_l(reduced, x);
1663 WIPE_BUFFER(x);
1666 // r = (a * b) + c
1667 void crypto_eddsa_mul_add(u8 r[32],
1668 const u8 a[32], const u8 b[32], const u8 c[32])
1670 u32 A[8]; load32_le_buf(A, a, 8);
1671 u32 B[8]; load32_le_buf(B, b, 8);
1672 u32 p[16]; load32_le_buf(p, c, 8); ZERO(p + 8, 8);
1673 multiply(p, A, B);
1674 mod_l(r, p);
1675 WIPE_BUFFER(p);
1676 WIPE_BUFFER(A);
1677 WIPE_BUFFER(B);
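// Illustrative note (not part of Monocypher): in EdDSA signing these two
// helpers compute the scalar half of a signature. With r the per-message
// nonce scalar, a the secret scalar, and h the 64-byte hash of
// (R, A, message), a sketch (hypothetical variable names) would be:
//
//     u8 h_mod_l[32];
//     crypto_eddsa_reduce (h_mod_l, h);              /* h mod L           */
//     crypto_eddsa_mul_add(s, h_mod_l, a, r_scalar); /* s = h*a + r mod L */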
1680 ///////////////
1681 /// Ed25519 ///
1682 ///////////////
1684 // Point (group element, ge) in a twisted Edwards curve,
1685 // in extended projective coordinates.
1686 // ge : x = X/Z, y = Y/Z, T = XY/Z
1687 // ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
1688 // ge_precomp: Z = 1
1689 typedef struct { fe X; fe Y; fe Z; fe T; } ge;
1690 typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
1691 typedef struct { fe Yp; fe Ym; fe T2; } ge_precomp;
1693 static void ge_zero(ge *p)
1695 fe_0(p->X);
1696 fe_1(p->Y);
1697 fe_1(p->Z);
1698 fe_0(p->T);
1701 static void ge_tobytes(u8 s[32], const ge *h)
1703 fe recip, x, y;
1704 fe_invert(recip, h->Z);
1705 fe_mul(x, h->X, recip);
1706 fe_mul(y, h->Y, recip);
1707 fe_tobytes(s, y);
1708 s[31] ^= fe_isodd(x) << 7;
1710 WIPE_BUFFER(recip);
1711 WIPE_BUFFER(x);
1712 WIPE_BUFFER(y);
1715 // h = -s, where s is a point encoded in 32 bytes
1717 // Variable time! Inputs must not be secret!
1718 // => Use only to *check* signatures.
1720 // From the specifications:
1721 // The encoding of s contains y and the sign of x
1722 // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1723 // In extended coordinates:
1724 // X = x, Y = y, Z = 1, T = x*y
1726 // Note that num * den is a square iff num / den is a square
1727 // If num * den is not a square, the point was not on the curve.
1728 // From the above:
1729 // Let num = y^2 - 1
1730 // Let den = d*y^2 + 1
1731 // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1732 // x = sqrt(num / den)
1733 // x = sqrt(num^2 / (num * den))
1734 // x = num * sqrt(1 / (num * den))
1736 // Therefore, we can just compute:
1737 // num = y^2 - 1
1738 // den = d*y^2 + 1
1739 // isr = invsqrt(num * den) // abort if not square
1740 // x = num * isr
1741 // Finally, negate x if its sign is not as specified.
1742 static int ge_frombytes_neg_vartime(ge *h, const u8 s[32])
1744 fe_frombytes(h->Y, s);
1745 fe_1(h->Z);
1746 fe_sq (h->T, h->Y); // t = y^2
1747 fe_mul(h->X, h->T, d ); // x = d*y^2
1748 fe_sub(h->T, h->T, h->Z); // t = y^2 - 1
1749 fe_add(h->X, h->X, h->Z); // x = d*y^2 + 1
1750 fe_mul(h->X, h->T, h->X); // x = (y^2 - 1) * (d*y^2 + 1)
1751 int is_square = invsqrt(h->X, h->X);
1752 if (!is_square) {
1753 return -1; // Not on the curve, abort
1755 fe_mul(h->X, h->T, h->X); // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1756 if (fe_isodd(h->X) == (s[31] >> 7)) {
1757 fe_neg(h->X, h->X);
1759 fe_mul(h->T, h->X, h->Y);
1760 return 0;
1763 static void ge_cache(ge_cached *c, const ge *p)
1765 fe_add (c->Yp, p->Y, p->X);
1766 fe_sub (c->Ym, p->Y, p->X);
1767 fe_copy(c->Z , p->Z );
1768 fe_mul (c->T2, p->T, D2 );
1771 // Internal buffers are not wiped! Inputs must not be secret!
1772 // => Use only to *check* signatures.
1773 static void ge_add(ge *s, const ge *p, const ge_cached *q)
1775 fe a, b;
1776 fe_add(a , p->Y, p->X );
1777 fe_sub(b , p->Y, p->X );
1778 fe_mul(a , a , q->Yp);
1779 fe_mul(b , b , q->Ym);
1780 fe_add(s->Y, a , b );
1781 fe_sub(s->X, a , b );
1783 fe_add(s->Z, p->Z, p->Z );
1784 fe_mul(s->Z, s->Z, q->Z );
1785 fe_mul(s->T, p->T, q->T2);
1786 fe_add(a , s->Z, s->T );
1787 fe_sub(b , s->Z, s->T );
1789 fe_mul(s->T, s->X, s->Y);
1790 fe_mul(s->X, s->X, b );
1791 fe_mul(s->Y, s->Y, a );
1792 fe_mul(s->Z, a , b );
1795 // Internal buffers are not wiped! Inputs must not be secret!
1796 // => Use only to *check* signatures.
1797 static void ge_sub(ge *s, const ge *p, const ge_cached *q)
1799 ge_cached neg;
1800 fe_copy(neg.Ym, q->Yp);
1801 fe_copy(neg.Yp, q->Ym);
1802 fe_copy(neg.Z , q->Z );
1803 fe_neg (neg.T2, q->T2);
1804 ge_add(s, p, &neg);
1807 static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1809 fe_add(a , p->Y, p->X );
1810 fe_sub(b , p->Y, p->X );
1811 fe_mul(a , a , q->Yp);
1812 fe_mul(b , b , q->Ym);
1813 fe_add(s->Y, a , b );
1814 fe_sub(s->X, a , b );
1816 fe_add(s->Z, p->Z, p->Z );
1817 fe_mul(s->T, p->T, q->T2);
1818 fe_add(a , s->Z, s->T );
1819 fe_sub(b , s->Z, s->T );
1821 fe_mul(s->T, s->X, s->Y);
1822 fe_mul(s->X, s->X, b );
1823 fe_mul(s->Y, s->Y, a );
1824 fe_mul(s->Z, a , b );
1827 // Internal buffers are not wiped! Inputs must not be secret!
1828 // => Use only to *check* signatures.
1829 static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1831 ge_precomp neg;
1832 fe_copy(neg.Ym, q->Yp);
1833 fe_copy(neg.Yp, q->Ym);
1834 fe_neg (neg.T2, q->T2);
1835 ge_madd(s, p, &neg, a, b);
1838 static void ge_double(ge *s, const ge *p, ge *q)
1840 fe_sq (q->X, p->X);
1841 fe_sq (q->Y, p->Y);
1842 fe_sq (q->Z, p->Z); // qZ = pZ^2
1843 fe_mul_small(q->Z, q->Z, 2); // qZ = pZ^2 * 2
1844 fe_add(q->T, p->X, p->Y);
1845 fe_sq (s->T, q->T);
1846 fe_add(q->T, q->Y, q->X);
1847 fe_sub(q->Y, q->Y, q->X);
1848 fe_sub(q->X, s->T, q->T);
1849 fe_sub(q->Z, q->Z, q->Y);
1851 fe_mul(s->X, q->X , q->Z);
1852 fe_mul(s->Y, q->T , q->Y);
1853 fe_mul(s->Z, q->Y , q->Z);
1854 fe_mul(s->T, q->X , q->T);
1857 // 5-bit signed window in cached format (Niels coordinates, Z=1)
1858 static const ge_precomp b_window[8] = {
1859 {{25967493,-14356035,29566456,3660896,-12694345,
1860 4014787,27544626,-11754271,-6079156,2047605,},
1861 {-12545711,934262,-2722910,3049990,-727428,
1862 9406986,12720692,5043384,19500929,-15469378,},
1863 {-8738181,4489570,9688441,-14785194,10184609,
1864 -12363380,29287919,11864899,-24514362,-4438546,},},
1865 {{15636291,-9688557,24204773,-7912398,616977,
1866 -16685262,27787600,-14772189,28944400,-1550024,},
1867 {16568933,4717097,-11556148,-1102322,15682896,
1868 -11807043,16354577,-11775962,7689662,11199574,},
1869 {30464156,-5976125,-11779434,-15670865,23220365,
1870 15915852,7512774,10017326,-17749093,-9920357,},},
1871 {{10861363,11473154,27284546,1981175,-30064349,
1872 12577861,32867885,14515107,-15438304,10819380,},
1873 {4708026,6336745,20377586,9066809,-11272109,
1874 6594696,-25653668,12483688,-12668491,5581306,},
1875 {19563160,16186464,-29386857,4097519,10237984,
1876 -4348115,28542350,13850243,-23678021,-15815942,},},
1877 {{5153746,9909285,1723747,-2777874,30523605,
1878 5516873,19480852,5230134,-23952439,-15175766,},
1879 {-30269007,-3463509,7665486,10083793,28475525,
1880 1649722,20654025,16520125,30598449,7715701,},
1881 {28881845,14381568,9657904,3680757,-20181635,
1882 7843316,-31400660,1370708,29794553,-1409300,},},
1883 {{-22518993,-6692182,14201702,-8745502,-23510406,
1884 8844726,18474211,-1361450,-13062696,13821877,},
1885 {-6455177,-7839871,3374702,-4740862,-27098617,
1886 -10571707,31655028,-7212327,18853322,-14220951,},
1887 {4566830,-12963868,-28974889,-12240689,-7602672,
1888 -2830569,-8514358,-10431137,2207753,-3209784,},},
1889 {{-25154831,-4185821,29681144,7868801,-6854661,
1890 -9423865,-12437364,-663000,-31111463,-16132436,},
1891 {25576264,-2703214,7349804,-11814844,16472782,
1892 9300885,3844789,15725684,171356,6466918,},
1893 {23103977,13316479,9739013,-16149481,817875,
1894 -15038942,8965339,-14088058,-30714912,16193877,},},
1895 {{-33521811,3180713,-2394130,14003687,-16903474,
1896 -16270840,17238398,4729455,-18074513,9256800,},
1897 {-25182317,-4174131,32336398,5036987,-21236817,
1898 11360617,22616405,9761698,-19827198,630305,},
1899 {-13720693,2639453,-24237460,-7406481,9494427,
1900 -5774029,-6554551,-15960994,-2449256,-14291300,},},
1901 {{-3151181,-5046075,9282714,6866145,-31907062,
1902 -863023,-18940575,15033784,25105118,-7894876,},
1903 {-24326370,15950226,-31801215,-14592823,-11662737,
1904 -5090925,1573892,-2625887,2198790,-15804619,},
1905 {-3099351,10324967,-2241613,7453183,-5446979,
1906 -2735503,-13812022,-16236442,-32461234,-12290683,},},
1909 // Incremental sliding windows (left to right)
1910 // Based on Roberto Maria Avanzi[2005]
1911 typedef struct {
1912 i16 next_index; // position of the next signed digit
1913 i8 next_digit; // next signed digit (odd number below 2^window_width)
1914 u8 next_check; // point at which we must check for a new window
1915 } slide_ctx;
1917 static void slide_init(slide_ctx *ctx, const u8 scalar[32])
1919 // scalar is guaranteed to be below L, either because we checked (s),
1920 // or because we reduced it modulo L (h_ram). L is under 2^253, so
1921 // bits 253 to 255 are guaranteed to be zero. No need to test them.
1923 // Note however that L is very close to 2^252, so bit 252 is almost
1924 // always zero. If we were to start at bit 251, the tests wouldn't
1925 // catch the off-by-one error (constructing one that does would be
1926 // prohibitively expensive).
1928 // We should still check bit 252, though.
1929 int i = 252;
1930 while (i > 0 && scalar_bit(scalar, i) == 0) {
1931 i--;
1933 ctx->next_check = (u8)(i + 1);
1934 ctx->next_index = -1;
1935 ctx->next_digit = -1;
1938 static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
1940 if (i == ctx->next_check) {
1941 if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
1942 ctx->next_check--;
1943 } else {
1944 // compute digit of next window
1945 int w = MC_MIN(width, i + 1);
1946 int v = -(scalar_bit(scalar, i) << (w-1));
1947 FOR_T (int, j, 0, w-1) {
1948 v += scalar_bit(scalar, i-(w-1)+j) << j;
1950 v += scalar_bit(scalar, i-w);
1951 int lsb = v & (~v + 1); // smallest bit of v
1952 int s = // log2(lsb)
1953 (((lsb & 0xAA) != 0) << 0) |
1954 (((lsb & 0xCC) != 0) << 1) |
1955 (((lsb & 0xF0) != 0) << 2);
1956 ctx->next_index = (i16)(i-(w-1)+s);
1957 ctx->next_digit = (i8) (v >> s );
1958 ctx->next_check -= (u8) w;
1961 return i == ctx->next_index ? ctx->next_digit: 0;
1964 #define P_W_WIDTH 3 // Affects the size of the stack
1965 #define B_W_WIDTH 5 // Affects the size of the binary
1966 #define P_W_SIZE (1<<(P_W_WIDTH-2))
1968 int crypto_eddsa_check_equation(const u8 signature[64], const u8 public_key[32],
1969 const u8 h[32])
1971 ge minus_A; // -public_key
1972 ge minus_R; // -first_half_of_signature
1973 const u8 *s = signature + 32;
1975 // Check that A and R are on the curve
1976 // Check that 0 <= S < L (prevents malleability)
1977 // *Allow* non-canonical encoding for A and R
1979 u32 s32[8];
1980 load32_le_buf(s32, s, 8);
1981 if (ge_frombytes_neg_vartime(&minus_A, public_key) ||
1982 ge_frombytes_neg_vartime(&minus_R, signature) ||
1983 is_above_l(s32)) {
1984 return -1;
1988 // look-up table for minus_A
1989 ge_cached lutA[P_W_SIZE];
1991 ge minus_A2, tmp;
1992 ge_double(&minus_A2, &minus_A, &tmp);
1993 ge_cache(&lutA[0], &minus_A);
1994 FOR (i, 1, P_W_SIZE) {
1995 ge_add(&tmp, &minus_A2, &lutA[i-1]);
1996 ge_cache(&lutA[i], &tmp);
2000 // sum = [s]B - [h]A
2001 // Merged double and add ladder, fused with sliding
2002 slide_ctx h_slide; slide_init(&h_slide, h);
2003 slide_ctx s_slide; slide_init(&s_slide, s);
2004 int i = MC_MAX(h_slide.next_check, s_slide.next_check);
2005 ge *sum = &minus_A; // reuse minus_A for the sum
2006 ge_zero(sum);
2007 while (i >= 0) {
2008 ge tmp;
2009 ge_double(sum, sum, &tmp);
2010 int h_digit = slide_step(&h_slide, P_W_WIDTH, i, h);
2011 int s_digit = slide_step(&s_slide, B_W_WIDTH, i, s);
2012 if (h_digit > 0) { ge_add(sum, sum, &lutA[ h_digit / 2]); }
2013 if (h_digit < 0) { ge_sub(sum, sum, &lutA[-h_digit / 2]); }
2014 fe t1, t2;
2015 if (s_digit > 0) { ge_madd(sum, sum, b_window + s_digit/2, t1, t2); }
2016 if (s_digit < 0) { ge_msub(sum, sum, b_window + -s_digit/2, t1, t2); }
2017 i--;
2020 // Compare [8](sum-R) and the zero point
2021 // The multiplication by 8 eliminates any low-order component
2022 // and ensures consistency with batched verification.
2023 ge_cached cached;
2024 u8 check[32];
2025 static const u8 zero_point[32] = {1}; // Point of order 1
2026 ge_cache(&cached, &minus_R);
2027 ge_add(sum, sum, &cached);
2028 ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
2029 ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
2030 ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
2031 ge_tobytes(check, sum);
2032 return crypto_verify32(check, zero_point);
2035 // 5-bit signed comb in cached format (Niels coordinates, Z=1)
2036 static const ge_precomp b_comb_low[8] = {
2037 {{-6816601,-2324159,-22559413,124364,18015490,
2038 8373481,19993724,1979872,-18549925,9085059,},
2039 {10306321,403248,14839893,9633706,8463310,
2040 -8354981,-14305673,14668847,26301366,2818560,},
2041 {-22701500,-3210264,-13831292,-2927732,-16326337,
2042 -14016360,12940910,177905,12165515,-2397893,},},
2043 {{-12282262,-7022066,9920413,-3064358,-32147467,
2044 2927790,22392436,-14852487,2719975,16402117,},
2045 {-7236961,-4729776,2685954,-6525055,-24242706,
2046 -15940211,-6238521,14082855,10047669,12228189,},
2047 {-30495588,-12893761,-11161261,3539405,-11502464,
2048 16491580,-27286798,-15030530,-7272871,-15934455,},},
2049 {{17650926,582297,-860412,-187745,-12072900,
2050 -10683391,-20352381,15557840,-31072141,-5019061,},
2051 {-6283632,-2259834,-4674247,-4598977,-4089240,
2052 12435688,-31278303,1060251,6256175,10480726,},
2053 {-13871026,2026300,-21928428,-2741605,-2406664,
2054 -8034988,7355518,15733500,-23379862,7489131,},},
2055 {{6883359,695140,23196907,9644202,-33430614,
2056 11354760,-20134606,6388313,-8263585,-8491918,},
2057 {-7716174,-13605463,-13646110,14757414,-19430591,
2058 -14967316,10359532,-11059670,-21935259,12082603,},
2059 {-11253345,-15943946,10046784,5414629,24840771,
2060 8086951,-6694742,9868723,15842692,-16224787,},},
2061 {{9639399,11810955,-24007778,-9320054,3912937,
2062 -9856959,996125,-8727907,-8919186,-14097242,},
2063 {7248867,14468564,25228636,-8795035,14346339,
2064 8224790,6388427,-7181107,6468218,-8720783,},
2065 {15513115,15439095,7342322,-10157390,18005294,
2066 -7265713,2186239,4884640,10826567,7135781,},},
2067 {{-14204238,5297536,-5862318,-6004934,28095835,
2068 4236101,-14203318,1958636,-16816875,3837147,},
2069 {-5511166,-13176782,-29588215,12339465,15325758,
2070 -15945770,-8813185,11075932,-19608050,-3776283,},
2071 {11728032,9603156,-4637821,-5304487,-7827751,
2072 2724948,31236191,-16760175,-7268616,14799772,},},
2073 {{-28842672,4840636,-12047946,-9101456,-1445464,
2074 381905,-30977094,-16523389,1290540,12798615,},
2075 {27246947,-10320914,14792098,-14518944,5302070,
2076 -8746152,-3403974,-4149637,-27061213,10749585,},
2077 {25572375,-6270368,-15353037,16037944,1146292,
2078 32198,23487090,9585613,24714571,-1418265,},},
2079 {{19844825,282124,-17583147,11004019,-32004269,
2080 -2716035,6105106,-1711007,-21010044,14338445,},
2081 {8027505,8191102,-18504907,-12335737,25173494,
2082 -5923905,15446145,7483684,-30440441,10009108,},
2083 {-14134701,-4174411,10246585,-14677495,33553567,
2084 -14012935,23366126,15080531,-7969992,7663473,},},
2087 static const ge_precomp b_comb_high[8] = {
2088 {{33055887,-4431773,-521787,6654165,951411,
2089 -6266464,-5158124,6995613,-5397442,-6985227,},
2090 {4014062,6967095,-11977872,3960002,8001989,
2091 5130302,-2154812,-1899602,-31954493,-16173976,},
2092 {16271757,-9212948,23792794,731486,-25808309,
2093 -3546396,6964344,-4767590,10976593,10050757,},},
2094 {{2533007,-4288439,-24467768,-12387405,-13450051,
2095 14542280,12876301,13893535,15067764,8594792,},
2096 {20073501,-11623621,3165391,-13119866,13188608,
2097 -11540496,-10751437,-13482671,29588810,2197295,},
2098 {-1084082,11831693,6031797,14062724,14748428,
2099 -8159962,-20721760,11742548,31368706,13161200,},},
2100 {{2050412,-6457589,15321215,5273360,25484180,
2101 124590,-18187548,-7097255,-6691621,-14604792,},
2102 {9938196,2162889,-6158074,-1711248,4278932,
2103 -2598531,-22865792,-7168500,-24323168,11746309,},
2104 {-22691768,-14268164,5965485,9383325,20443693,
2105 5854192,28250679,-1381811,-10837134,13717818,},},
2106 {{-8495530,16382250,9548884,-4971523,-4491811,
2107 -3902147,6182256,-12832479,26628081,10395408,},
2108 {27329048,-15853735,7715764,8717446,-9215518,
2109 -14633480,28982250,-5668414,4227628,242148,},
2110 {-13279943,-7986904,-7100016,8764468,-27276630,
2111 3096719,29678419,-9141299,3906709,11265498,},},
2112 {{11918285,15686328,-17757323,-11217300,-27548967,
2113 4853165,-27168827,6807359,6871949,-1075745,},
2114 {-29002610,13984323,-27111812,-2713442,28107359,
2115 -13266203,6155126,15104658,3538727,-7513788,},
2116 {14103158,11233913,-33165269,9279850,31014152,
2117 4335090,-1827936,4590951,13960841,12787712,},},
2118 {{1469134,-16738009,33411928,13942824,8092558,
2119 -8778224,-11165065,1437842,22521552,-2792954,},
2120 {31352705,-4807352,-25327300,3962447,12541566,
2121 -9399651,-27425693,7964818,-23829869,5541287,},
2122 {-25732021,-6864887,23848984,3039395,-9147354,
2123 6022816,-27421653,10590137,25309915,-1584678,},},
2124 {{-22951376,5048948,31139401,-190316,-19542447,
2125 -626310,-17486305,-16511925,-18851313,-12985140,},
2126 {-9684890,14681754,30487568,7717771,-10829709,
2127 9630497,30290549,-10531496,-27798994,-13812825,},
2128 {5827835,16097107,-24501327,12094619,7413972,
2129 11447087,28057551,-1793987,-14056981,4359312,},},
2130 {{26323183,2342588,-21887793,-1623758,-6062284,
2131 2107090,-28724907,9036464,-19618351,-13055189,},
2132 {-29697200,14829398,-4596333,14220089,-30022969,
2133 2955645,12094100,-13693652,-5941445,7047569,},
2134 {-3201977,14413268,-12058324,-16417589,-9035655,
2135 -7224648,9258160,1399236,30397584,-5684634,},},
2138 static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
2139 const ge_precomp comb[8], const u8 scalar[32], int i)
2141 u8 teeth = (u8)((scalar_bit(scalar, i) ) +
2142 (scalar_bit(scalar, i + 32) << 1) +
2143 (scalar_bit(scalar, i + 64) << 2) +
2144 (scalar_bit(scalar, i + 96) << 3));
2145 u8 high = teeth >> 3;
2146 u8 index = (teeth ^ (high - 1)) & 7;
2147 FOR (j, 0, 8) {
2148 i32 select = 1 & (((j ^ index) - 1) >> 8);
2149 fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
2150 fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
2151 fe_ccopy(tmp_c->T2, comb[j].T2, select);
2153 fe_neg(tmp_a, tmp_c->T2);
2154 fe_cswap(tmp_c->T2, tmp_a , high ^ 1);
2155 fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
2156 ge_madd(p, p, tmp_c, tmp_a, tmp_b);
2159 // p = [scalar]B, where B is the base point
2160 static void ge_scalarmult_base(ge *p, const u8 scalar[32])
2162 // twin 4-bit signed combs, from Mike Hamburg's
2163 // Fast and compact elliptic-curve cryptography (2012)
2164 // 1 / 2 modulo L
2165 static const u8 half_mod_L[32] = {
2166 247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
2167 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,
2169 // (2^256 - 1) / 2 modulo L
2170 static const u8 half_ones[32] = {
2171 142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
2172 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7,
2175 // All bits set form: 1 means 1, 0 means -1
2176 u8 s_scalar[32];
2177 crypto_eddsa_mul_add(s_scalar, scalar, half_mod_L, half_ones);
2179 // Double and add ladder
2180 fe tmp_a, tmp_b; // temporaries for addition
2181 ge_precomp tmp_c; // temporary for comb lookup
2182 ge tmp_d; // temporary for doubling
2183 fe_1(tmp_c.Yp);
2184 fe_1(tmp_c.Ym);
2185 fe_0(tmp_c.T2);
2187 // Save a double on the first iteration
2188 ge_zero(p);
2189 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
2190 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
2191 // Regular double & add for the rest
2192 for (int i = 30; i >= 0; i--) {
2193 ge_double(p, p, &tmp_d);
2194 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
2195 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
2197 // Note: we could save one addition at the end if we assumed the
2198 // scalar fits in 252 bits, which it does in practice when it is
2199 // selected at random. However, non-random, non-hashed scalars
2200 // *can* overflow 252 bits in practice, so we account for that
2201 // rather than leave that kind of subtle corner case unhandled.
2203 WIPE_BUFFER(tmp_a); WIPE_CTX(&tmp_d);
2204 WIPE_BUFFER(tmp_b); WIPE_CTX(&tmp_c);
2205 WIPE_BUFFER(s_scalar);
2208 void crypto_eddsa_scalarbase(u8 point[32], const u8 scalar[32])
2210 ge P;
2211 ge_scalarmult_base(&P, scalar);
2212 ge_tobytes(point, &P);
2213 WIPE_CTX(&P);
2216 void crypto_eddsa_key_pair(u8 secret_key[64], u8 public_key[32], u8 seed[32])
2218 // To allow overlaps, observable writes happen in this order:
2219 // 1. seed
2220 // 2. secret_key
2221 // 3. public_key
2222 u8 a[64];
2223 COPY(a, seed, 32);
2224 crypto_wipe(seed, 32);
2225 COPY(secret_key, a, 32);
2226 crypto_blake2b(a, 64, a, 32);
2227 crypto_eddsa_trim_scalar(a, a);
2228 crypto_eddsa_scalarbase(secret_key + 32, a);
2229 COPY(public_key, secret_key + 32, 32);
2230 WIPE_BUFFER(a);
2233 static void hash_reduce(u8 h[32],
2234 const u8 *a, size_t a_size,
2235 const u8 *b, size_t b_size,
2236 const u8 *c, size_t c_size)
2238 u8 hash[64];
2239 crypto_blake2b_ctx ctx;
2240 crypto_blake2b_init (&ctx, 64);
2241 crypto_blake2b_update(&ctx, a, a_size);
2242 crypto_blake2b_update(&ctx, b, b_size);
2243 crypto_blake2b_update(&ctx, c, c_size);
2244 crypto_blake2b_final (&ctx, hash);
2245 crypto_eddsa_reduce(h, hash);
2248 // Digital signature of a message with a secret key.
2250 // The secret key comprises two parts:
2251 // - The seed that generates the key (secret_key[ 0..31])
2252 // - The public key (secret_key[32..63])
2254 // The seed and the public key are bundled together to make sure users
2255 // don't use mismatched seeds and public keys, which would instantly
2256 // leak the secret scalar and allow forgeries (allowing this to happen
2257 // has resulted in critical vulnerabilities in the wild).
2259 // The seed is hashed to derive the secret scalar and a secret prefix.
2260 // The sole purpose of the prefix is to generate a secret random nonce.
2261 // The properties of that nonce must be as follows:
2262 // - Unique: we need a different one for each message.
2263 // - Secret: third parties must not be able to predict it.
2264 // - Random: any detectable bias would break all security.
2266 // There are two ways to achieve these properties. The obvious one is
2267 // to simply generate a random number. Here that would be a parameter
2268 // (Monocypher doesn't have an RNG). It works, but then users may reuse
2269 // the nonce by accident, which _also_ leaks the secret scalar and
2270 // allows forgeries. This has happened in the wild too.
2272 // This is no good, so instead we generate that nonce deterministically
2273 // by reducing modulo L a hash of the secret prefix and the message.
2274 // The secret prefix makes the nonce unpredictable, the message makes it
2275 // unique, and the hash/reduce removes all bias.
2277 // The cost of that safety is hashing the message twice. If that cost
2278 // is unacceptable, there are two alternatives:
2280 // - Signing a hash of the message instead of the message itself. This
2281 // is fine as long as the hash is collision resistant. It is not
2282 // compatible with existing "pure" signatures, but at least it's safe.
2284 // - Using a random nonce. Please exercise **EXTREME CAUTION** if you
2285 // ever do that. It is absolutely **critical** that the nonce is
2286 // really an unbiased random number between 0 and L-1, never reused,
2287 // and wiped immediately.
2289 // To lower the likelihood of complete catastrophe if the RNG is
2290 // either flawed or misused, you can hash the RNG output together with
2291 // the secret prefix and the beginning of the message, and use the
2292 // reduction of that hash instead of the RNG output itself. It's not
2293 // foolproof (you'd need to hash the whole message) but it helps.
2295 // Signing a message involves the following operations:
2297 // a, prefix = HASH(secret_key[0..31])     (a is the secret scalar)
2298 // r = HASH(prefix || message) % L
2299 // R = [r]B
2300 // h = HASH(R || public_key || message) % L
2301 // S = ((h * a) + r) % L
2302 // signature = R || S
2303 void crypto_eddsa_sign(u8 signature [64], const u8 secret_key[64],
2304 const u8 *message, size_t message_size)
2306 u8 a[64]; // secret scalar and prefix
2307 u8 r[32]; // secret deterministic "random" nonce
2308 u8 h[32]; // publicly verifiable hash of the message (not wiped)
2309 u8 R[32]; // first half of the signature (allows overlapping inputs)
2311 crypto_blake2b(a, 64, secret_key, 32);
2312 crypto_eddsa_trim_scalar(a, a);
2313 hash_reduce(r, a + 32, 32, message, message_size, 0, 0);
2314 crypto_eddsa_scalarbase(R, r);
2315 hash_reduce(h, R, 32, secret_key + 32, 32, message, message_size);
2316 COPY(signature, R, 32);
2317 crypto_eddsa_mul_add(signature + 32, h, a, r);
2319 WIPE_BUFFER(a);
2320 WIPE_BUFFER(r);
2323 // To check the signature R, S of the message M with the public key A,
2324 // there are 3 steps:
2326 // compute h = HASH(R || A || message) % L
2327 // check that A is on the curve.
2328 // check that R == [S]B - [h]A
2330 // The last two steps are done in crypto_eddsa_check_equation()
2331 int crypto_eddsa_check(const u8 signature[64], const u8 public_key[32],
2332 const u8 *message, size_t message_size)
2334 u8 h[32];
2335 hash_reduce(h, signature, 32, public_key, 32, message, message_size);
2336 return crypto_eddsa_check_equation(signature, public_key, h);
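//
// Usage sketch (illustrative, not part of the library): a full sign/verify
// round trip with the functions above. `random_buffer()` is a hypothetical
// stand-in for a CSPRNG; msg/msg_size are the message to sign.
//
//     u8 seed[32];  random_buffer(seed, 32);
//     u8 sk[64], pk[32], sig[64];
//     crypto_eddsa_key_pair(sk, pk, seed);        // wipes the seed
//     crypto_eddsa_sign(sig, sk, msg, msg_size);
//     if (crypto_eddsa_check(sig, pk, msg, msg_size)) {
//         // corrupted or forged message, reject it
//     }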
2339 /////////////////////////
2340 /// EdDSA <--> X25519 ///
2341 /////////////////////////
2342 void crypto_eddsa_to_x25519(u8 x25519[32], const u8 eddsa[32])
2344 // (u, v) = ((1+y)/(1-y), sqrt(-486664)*u/x)
2345 // Only converting y to u, the sign of x is ignored.
2346 fe t1, t2;
2347 fe_frombytes(t2, eddsa);
2348 fe_add(t1, fe_one, t2);
2349 fe_sub(t2, fe_one, t2);
2350 fe_invert(t2, t2);
2351 fe_mul(t1, t1, t2);
2352 fe_tobytes(x25519, t1);
2353 WIPE_BUFFER(t1);
2354 WIPE_BUFFER(t2);
2357 void crypto_x25519_to_eddsa(u8 eddsa[32], const u8 x25519[32])
2359 // (x, y) = (sqrt(-486664)*u/v, (u-1)/(u+1))
2360 // Only converting u to y, x is assumed positive.
2361 fe t1, t2;
2362 fe_frombytes(t2, x25519);
2363 fe_sub(t1, t2, fe_one);
2364 fe_add(t2, t2, fe_one);
2365 fe_invert(t2, t2);
2366 fe_mul(t1, t1, t2);
2367 fe_tobytes(eddsa, t1);
2368 WIPE_BUFFER(t1);
2369 WIPE_BUFFER(t2);
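//
// Sketch (illustrative): converting an Ed25519 public key so it can be used
// as an X25519 public key, for instance as the last argument of
// crypto_x25519(). `eddsa_pk` is assumed to come from
// crypto_eddsa_key_pair() above.
//
//     u8 x25519_pk[32];
//     crypto_eddsa_to_x25519(x25519_pk, eddsa_pk);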
2372 /////////////////////////////////////////////
2373 /// Dirty ephemeral public key generation ///
2374 /////////////////////////////////////////////
2376 // These functions generate a public key, *without* clearing the
2377 // cofactor. Sending that key over the network leaks 3 bits of the
2378 // private key. Use only to generate ephemeral keys that will be hidden
2379 // with crypto_elligator_rev() (defined below).
2381 // The public key is otherwise compatible with crypto_x25519(), which
2382 // properly clears the cofactor.
2384 // Note that the distribution of the resulting public keys is almost
2385 // uniform. Flipping the sign of the v coordinate (not provided by this
2386 // function) covers the entire key space almost perfectly, where
2387 // "almost" means a 2^-128 bias (undetectable). This uniformity is
2388 // needed to ensure the proper randomness of the resulting
2389 // representatives (once we apply crypto_elligator_rev()).
2391 // Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
2392 // to be confused with the prime order of the main subgroup, L, which is
2393 // 8 times less than that).
2395 // Generating all points would require us to multiply a point of order C
2396 // (the base point plus any point of order 8) by all scalars from 0 to
2397 // C-1. Clamping limits us to scalars between 2^254 and 2^255 - 1. But
2398 // by negating the resulting point at random, we also cover scalars from
2399 // -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
2401 // In practice:
2402 // - Scalars from 0 to e + 1 are never generated
2403 // - Scalars from 2^255 to 2^255 + e are never generated
2404 // - Scalars from 2^254 + 1 to 2^254 + e are generated twice
2406 // Since e < 2^128, detecting this bias requires observing over 2^100
2407 // representatives from a given source (this will never happen), *and*
2408 // recovering enough of the private key to determine that they do, or do
2409 // not, belong to the biased set (this practically requires solving
2410 // discrete logarithm, which is conjecturally intractable).
2412 // In practice, this means the bias is impossible to detect.
2414 // s + (x*L) % 8*L
2415 // Guaranteed to fit in 256 bits iff s fits in 255 bits.
2416 // L < 2^253
2417 // x%8 < 2^3
2418 // L * (x%8) < 2^255
2419 // s < 2^255
2420 // s + L * (x%8) < 2^256
2421 static void add_xl(u8 s[32], u8 x)
2423 u64 mod8 = x & 7;
2424 u64 carry = 0;
2425 FOR (i , 0, 8) {
2426 carry = carry + load32_le(s + 4*i) + L[i] * mod8;
2427 store32_le(s + 4*i, (u32)carry);
2428 carry >>= 32;
2432 // "Small" dirty ephemeral key.
2433 // Use if you need to shrink the size of the binary, and can afford to
2434 // slow down by a factor of two (compared to the fast version)
2436 // This version works by decoupling the cofactor from the main factor.
2438 // - The trimmed scalar determines the main factor
2439 // - The clamped bits of the scalar determine the cofactor.
2441 // Cofactor and main factor are combined into a single scalar, which is
2442 // then multiplied by a point of order 8*L (unlike the base point, which
2443 // has prime order). That "dirty" base point is the addition of the
2444 // regular base point (9), and a point of order 8.
2445 void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
2447 // Base point of order 8*L
2448 // Raw scalar multiplication with it does not clear the cofactor,
2449 // and the resulting public key will reveal 3 bits of the scalar.
2451 // The low order component of this base point has been chosen
2452 // to yield the same results as crypto_x25519_dirty_fast().
2453 static const u8 dirty_base_point[32] = {
2454 0xd8, 0x86, 0x1a, 0xa2, 0x78, 0x7a, 0xd9, 0x26,
2455 0x8b, 0x74, 0x74, 0xb6, 0x82, 0xe3, 0xbe, 0xc3,
2456 0xce, 0x36, 0x9a, 0x1e, 0x5e, 0x31, 0x47, 0xa2,
2457 0x6d, 0x37, 0x7c, 0xfd, 0x20, 0xb5, 0xdf, 0x75,
2459 // separate the main factor & the cofactor of the scalar
2460 u8 scalar[32];
2461 crypto_eddsa_trim_scalar(scalar, secret_key);
2463 // Separate the main factor and the cofactor
2465 // The scalar is trimmed, so its cofactor is cleared. The three
2466 // least significant bits however still have a main factor. We must
2467 // remove it for X25519 compatibility.
2469 // cofactor = lsb * L (modulo 8*L)
2470 // combined = scalar + cofactor (modulo 8*L)
2471 add_xl(scalar, secret_key[0]);
2472 scalarmult(public_key, scalar, dirty_base_point, 256);
2473 WIPE_BUFFER(scalar);
2476 // Select low order point
2477 // We're computing the [cofactor]lop scalar multiplication, where:
2479 // cofactor = tweak & 7.
2480 // lop = (lop_x, lop_y)
2481 // lop_x = sqrt((sqrt(d + 1) + 1) / d)
2482 // lop_y = -lop_x * sqrtm1
2484 // The low order point has order 8. There are 4 such points. We've
2485 // chosen the one whose coordinates are both positive (below p/2).
2486 // The 8 low order points are as follows:
2488 // [0]lop = ( 0 , 1 )
2489 // [1]lop = ( lop_x , lop_y)
2490 // [2]lop = ( sqrt(-1), -0 )
2491 // [3]lop = ( lop_x , -lop_y)
2492 // [4]lop = (-0 , -1 )
2493 // [5]lop = (-lop_x , -lop_y)
2494 // [6]lop = (-sqrt(-1), 0 )
2495 // [7]lop = (-lop_x , lop_y)
2497 // The x coordinate is either 0, sqrt(-1), lop_x, or their opposite.
2498 // The y coordinate is either 0, -1 , lop_y, or their opposite.
2499 // The pattern for both is the same, except for a rotation of 2 (modulo 8)
2501 // This helper function captures the pattern, and we can use it thus:
2503 // select_lop(x, lop_x, sqrtm1, cofactor);
2504 // select_lop(y, lop_y, fe_one, cofactor + 2);
2506 // This is faster than an actual scalar multiplication,
2507 // and requires less code than naive constant time look up.
2508 static void select_lop(fe out, const fe x, const fe k, u8 cofactor)
2510 fe tmp;
2511 fe_0(out);
2512 fe_ccopy(out, k , (cofactor >> 1) & 1); // bit 1
2513 fe_ccopy(out, x , (cofactor >> 0) & 1); // bit 0
2514 fe_neg (tmp, out);
2515 fe_ccopy(out, tmp, (cofactor >> 2) & 1); // bit 2
2516 WIPE_BUFFER(tmp);
2519 // "Fast" dirty ephemeral key
2520 // We use this one by default.
2522 // This version works by performing a regular scalar multiplication,
2523 // then adding a low order point. The scalar multiplication is done in
2524 // Edwards space for more speed (*2 compared to the "small" version).
2525 // The cost is a bigger binary for programs that don't also sign messages.
2526 void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
2528 // Compute clean scalar multiplication
2529 u8 scalar[32];
2530 ge pk;
2531 crypto_eddsa_trim_scalar(scalar, secret_key);
2532 ge_scalarmult_base(&pk, scalar);
2534 // Compute low order point
2535 fe t1, t2;
2536 select_lop(t1, lop_x, sqrtm1, secret_key[0]);
2537 select_lop(t2, lop_y, fe_one, secret_key[0] + 2);
2538 ge_precomp low_order_point;
2539 fe_add(low_order_point.Yp, t2, t1);
2540 fe_sub(low_order_point.Ym, t2, t1);
2541 fe_mul(low_order_point.T2, t2, t1);
2542 fe_mul(low_order_point.T2, low_order_point.T2, D2);
2544 // Add low order point to the public key
2545 ge_madd(&pk, &pk, &low_order_point, t1, t2);
2547 // Convert to Montgomery u coordinate (we ignore the sign)
2548 fe_add(t1, pk.Z, pk.Y);
2549 fe_sub(t2, pk.Z, pk.Y);
2550 fe_invert(t2, t2);
2551 fe_mul(t1, t1, t2);
2553 fe_tobytes(public_key, t1);
2555 WIPE_BUFFER(t1); WIPE_CTX(&pk);
2556 WIPE_BUFFER(t2); WIPE_CTX(&low_order_point);
2557 WIPE_BUFFER(scalar);
2560 ///////////////////
2561 /// Elligator 2 ///
2562 ///////////////////
2563 static const fe A = {486662};
2565 // Elligator direct map
2567 // Computes the point corresponding to a representative, encoded in 32
2568 // bytes (little endian). Since positive representatives fit in 254
2569 // bits, the two most significant bits are ignored.
2571 // From the paper:
2572 // w = -A / (fe(1) + non_square * r^2)
2573 // e = chi(w^3 + A*w^2 + w)
2574 // u = e*w - (fe(1)-e)*(A//2)
2575 // v = -e * sqrt(u^3 + A*u^2 + u)
2577 // We ignore v because we don't need it for X25519 (the Montgomery
2578 // ladder only uses u).
2580 // Note that e is either 0, 1 or -1
2581 // if e = 0 u = 0 and v = 0
2582 // if e = 1 u = w
2583 // if e = -1 u = -w - A = w * non_square * r^2
2585 // Let r1 = non_square * r^2
2586 // Let r2 = 1 + r1
2587 // Note that r2 cannot be zero, -1/non_square is not a square.
2588 // We can (tediously) verify that:
2589 // w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
2590 // Therefore:
2591 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
2592 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
2593 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
2594 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3) * r2^6)
2595 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * A * r2^3)
2596 // Corollary:
2597 // e = 1 if (A^2*r1 - r2^2) * A * r2^3 is a non-zero square
2598 // e = -1 if (A^2*r1 - r2^2) * A * r2^3 is not a square
2599 // Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
2600 // w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
2601 // w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
2602 // w^3 + A*w^2 + w = w * ((w + A/2)^2 - A^2/4 + 1)
2603 // which is zero only if:
2604 // w = 0 (impossible)
2605 // (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
2607 // Let isr = invsqrt((A^2*r1 - r2^2) * A * r2^3)
2608 // isr = sqrt(1 / ((A^2*r1 - r2^2) * A * r2^3)) if e = 1
2609 // isr = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) * A * r2^3)) if e = -1
2611 // if e = 1
2612 // let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
2613 // u1 = w
2614 // u1 = u
2616 // if e = -1
2617 // let ufactor = -non_square * sqrt(-1) * r^2
2618 // let vfactor = sqrt(ufactor)
2619 // let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
2620 // u2 = w * -1 * -non_square * r^2
2621 // u2 = w * non_square * r^2
2622 // u2 = u
2623 void crypto_elligator_map(u8 curve[32], const u8 hidden[32])
2625 fe r, u, t1, t2, t3;
2626 fe_frombytes_mask(r, hidden, 2); // r is encoded in 254 bits.
2627 fe_sq(r, r);
2628 fe_add(t1, r, r);
2629 fe_add(u, t1, fe_one);
2630 fe_sq (t2, u);
2631 fe_mul(t3, A2, t1);
2632 fe_sub(t3, t3, t2);
2633 fe_mul(t3, t3, A);
2634 fe_mul(t1, t2, u);
2635 fe_mul(t1, t3, t1);
2636 int is_square = invsqrt(t1, t1);
2637 fe_mul(u, r, ufactor);
2638 fe_ccopy(u, fe_one, is_square);
2639 fe_sq (t1, t1);
2640 fe_mul(u, u, A);
2641 fe_mul(u, u, t3);
2642 fe_mul(u, u, t2);
2643 fe_mul(u, u, t1);
2644 fe_neg(u, u);
2645 fe_tobytes(curve, u);
2647 WIPE_BUFFER(t1); WIPE_BUFFER(r);
2648 WIPE_BUFFER(t2); WIPE_BUFFER(u);
2649 WIPE_BUFFER(t3);
2652 // Elligator inverse map
2654 // Computes the representative of a point, if possible. If not, it does
2655 // nothing and returns -1. Note that the success of the operation
2656 // depends only on the point (more precisely its u coordinate). The
2657 // tweak parameter is used only upon success
2659 // The tweak should be a random byte. Beyond that, its contents are an
2660 // implementation detail. Currently, the tweak comprises:
2661 // - Bit 1 : sign of the v coordinate (0 if positive, 1 if negative)
2662 // - Bit 2-5: not used
2663 // - Bits 6-7: random padding
2665 // From the paper:
2666 // Let sq = -non_square * u * (u+A)
2667 // if sq is not a square, or u = -A, there is no mapping
2668 // Assuming there is a mapping:
2669 // if v is positive: r = sqrt(-u / (non_square * (u+A)))
2670 // if v is negative: r = sqrt(-(u+A) / (non_square * u ))
2672 // We compute isr = invsqrt(-non_square * u * (u+A))
2673 // if it wasn't a square, abort.
2674 // else, isr = sqrt(-1 / (non_square * u * (u+A)))
2676 // If v is positive, we return isr * u:
2677 // isr * u = sqrt(-1 / (non_square * u * (u+A))) * u
2678 // isr * u = sqrt(-u / (non_square * (u+A)))
2680 // If v is negative, we return isr * (u+A):
2681 // isr * (u+A) = sqrt(-1 / (non_square * u * (u+A))) * (u+A)
2682 // isr * (u+A) = sqrt(-(u+A) / (non_square * u))
2683 int crypto_elligator_rev(u8 hidden[32], const u8 public_key[32], u8 tweak)
2685 fe t1, t2, t3;
2686 fe_frombytes(t1, public_key); // t1 = u
2688 fe_add(t2, t1, A); // t2 = u + A
2689 fe_mul(t3, t1, t2);
2690 fe_mul_small(t3, t3, -2);
2691 int is_square = invsqrt(t3, t3); // t3 = sqrt(-1 / (non_square * u * (u+A)))
2692 if (is_square) {
2693 // The only variable time bit. This ultimately reveals how many
2694 // tries it took us to find a representable key.
2695 // This does not affect security as long as we try keys at random.
2697 fe_ccopy (t1, t2, tweak & 1); // multiply by u if v is positive,
2698 fe_mul (t3, t1, t3); // multiply by u+A otherwise
2699 fe_mul_small(t1, t3, 2);
2700 fe_neg (t2, t3);
2701 fe_ccopy (t3, t2, fe_isodd(t1));
2702 fe_tobytes(hidden, t3);
2704 // Pad with two random bits
2705 hidden[31] |= tweak & 0xc0;
2708 WIPE_BUFFER(t1);
2709 WIPE_BUFFER(t2);
2710 WIPE_BUFFER(t3);
2711 return is_square - 1;
2714 void crypto_elligator_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
2716 u8 pk [32]; // public key
2717 u8 buf[64]; // seed + representative
2718 COPY(buf + 32, seed, 32);
2719 do {
2720 crypto_chacha20_djb(buf, 0, 64, buf+32, zero, 0);
2721 crypto_x25519_dirty_fast(pk, buf); // or the "small" version
2722 } while(crypto_elligator_rev(buf+32, pk, buf[32]));
2723 // Note that the return value of crypto_elligator_rev() is
2724 // independent from its tweak parameter.
2725 // Therefore, buf[32] is not actually reused. Either we loop one
2726 // more time and buf[32] is used for the new seed, or we succeeded,
2727 // and buf[32] becomes the tweak parameter.
2729 crypto_wipe(seed, 32);
2730 COPY(hidden , buf + 32, 32);
2731 COPY(secret_key, buf , 32);
2732 WIPE_BUFFER(buf);
2733 WIPE_BUFFER(pk);
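//
// Usage sketch (illustrative): a hidden ephemeral key exchange.
// `random_buffer()` is a hypothetical CSPRNG stand-in and `their_pk` is the
// recipient's X25519 public key.
//
//     u8 seed[32];  random_buffer(seed, 32);
//     u8 hidden[32], esk[32], shared[32];
//     crypto_elligator_key_pair(hidden, esk, seed); // send `hidden` only
//     crypto_x25519(shared, esk, their_pk);
//
// The recipient recovers the ephemeral public key with
// crypto_elligator_map(epk, hidden), then computes the same shared point
// with crypto_x25519(shared, their_sk, epk).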
2736 ///////////////////////
2737 /// Scalar division ///
2738 ///////////////////////
2740 // Montgomery reduction.
2741 // Divides x by (2^256), and reduces the result modulo L
2743 // Precondition:
2744 // x < L * 2^256
2745 // Constants:
2746 // r = 2^256 (makes division by r trivial)
2747 // k = (r * (1/r) - 1) // L (1/r is computed modulo L )
2748 // Algorithm:
2749 // s = (x * k) % r
2750 // t = x + s*L (t is always a multiple of r)
2751 // u = (t/r) % L (u is always below 2*L, conditional subtraction is enough)
2752 static void redc(u32 u[8], u32 x[16])
2754 static const u32 k[8] = {
2755 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
2756 0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,
2759 // s = x * k (modulo 2^256)
2760 // This is cheaper than the full multiplication.
2761 u32 s[8] = {0};
2762 FOR (i, 0, 8) {
2763 u64 carry = 0;
2764 FOR (j, 0, 8-i) {
2765 carry += s[i+j] + (u64)x[i] * k[j];
2766 s[i+j] = (u32)carry;
2767 carry >>= 32;
2770 u32 t[16] = {0};
2771 multiply(t, s, L);
2773 // t = t + x
2774 u64 carry = 0;
2775 FOR (i, 0, 16) {
2776 carry += (u64)t[i] + x[i];
2777 t[i] = (u32)carry;
2778 carry >>= 32;
2781 // u = (t / 2^256) % L
2782 // Note that t / 2^256 is always below 2*L,
2783 // So a constant time conditional subtraction is enough
2784 remove_l(u, t+8);
2786 WIPE_BUFFER(s);
2787 WIPE_BUFFER(t);
2790 void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
2791 const u8 curve_point[32])
2793 static const u8 Lm2[32] = { // L - 2
2794 0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
2795 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
2796 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2797 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
2799 // 1 in Montgomery form
2800 u32 m_inv [8] = {
2801 0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
2802 0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,
2805 u8 scalar[32];
2806 crypto_eddsa_trim_scalar(scalar, private_key);
2808 // Convert the scalar to Montgomery form
2809 // m_scl = scalar * 2^256 (modulo L)
2810 u32 m_scl[8];
2812 u32 tmp[16];
2813 ZERO(tmp, 8);
2814 load32_le_buf(tmp+8, scalar, 8);
2815 mod_l(scalar, tmp);
2816 load32_le_buf(m_scl, scalar, 8);
2817 WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
2820 // Compute the inverse
2821 u32 product[16];
2822 for (int i = 252; i >= 0; i--) {
2823 ZERO(product, 16);
2824 multiply(product, m_inv, m_inv);
2825 redc(m_inv, product);
2826 if (scalar_bit(Lm2, i)) {
2827 ZERO(product, 16);
2828 multiply(product, m_inv, m_scl);
2829 redc(m_inv, product);
2832 // Convert the inverse *out* of Montgomery form
2833 // scalar = m_inv / 2^256 (modulo L)
2834 COPY(product, m_inv, 8);
2835 ZERO(product + 8, 8);
2836 redc(m_inv, product);
2837 store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar
2839 // Clear the cofactor of scalar:
2840 // cleared = scalar * (3*L + 1) (modulo 8*L)
2841 // cleared = scalar + scalar * 3 * L (modulo 8*L)
2842 // Note that (scalar * 3) is reduced modulo 8, so we only need the
2843 // first byte.
2844 add_xl(scalar, scalar[0] * 3);
2846 // Recall that 8*L < 2^256. However it is also very close to
2847 // 2^255. If we spanned the ladder over 255 bits, random tests
2848 // wouldn't catch the off-by-one error.
2849 scalarmult(blind_salt, scalar, curve_point, 256);
2851 WIPE_BUFFER(scalar); WIPE_BUFFER(m_scl);
2852 WIPE_BUFFER(product); WIPE_BUFFER(m_inv);
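//
// Sketch (illustrative): crypto_x25519_inverse() undoes a scalar
// multiplication by the same private key, which is what exponential
// blinding needs. `P` is assumed to be a proper X25519 public key (no
// low-order component) and `sk` a 32-byte secret scalar.
//
//     u8 blinded[32], unblinded[32];
//     crypto_x25519(blinded, sk, P);                 // [k]P
//     crypto_x25519_inverse(unblinded, sk, blinded); // [1/k][k]P == P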
2855 ////////////////////////////////
2856 /// Authenticated encryption ///
2857 ////////////////////////////////
2858 static void lock_auth(u8 mac[16], const u8 auth_key[32],
2859 const u8 *ad , size_t ad_size,
2860 const u8 *cipher_text, size_t text_size)
2862 u8 sizes[16]; // Not secret, not wiped
2863 store64_le(sizes + 0, ad_size);
2864 store64_le(sizes + 8, text_size);
2865 crypto_poly1305_ctx poly_ctx; // auto wiped...
2866 crypto_poly1305_init (&poly_ctx, auth_key);
2867 crypto_poly1305_update(&poly_ctx, ad , ad_size);
2868 crypto_poly1305_update(&poly_ctx, zero , gap(ad_size, 16));
2869 crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
2870 crypto_poly1305_update(&poly_ctx, zero , gap(text_size, 16));
2871 crypto_poly1305_update(&poly_ctx, sizes , 16);
2872 crypto_poly1305_final (&poly_ctx, mac); // ...here
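// The layout above (associated data padded to a 16-byte boundary, then the
// ciphertext padded likewise, then both lengths as little-endian 64-bit
// words) matches the usual ChaCha20-Poly1305 construction (RFC 8439).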
2875 void crypto_aead_init_x(crypto_aead_ctx *ctx,
2876 const u8 key[32], const u8 nonce[24])
2878 crypto_chacha20_h(ctx->key, key, nonce);
2879 COPY(ctx->nonce, nonce + 16, 8);
2880 ctx->counter = 0;
2883 void crypto_aead_init_djb(crypto_aead_ctx *ctx,
2884 const u8 key[32], const u8 nonce[8])
2886 COPY(ctx->key , key , 32);
2887 COPY(ctx->nonce, nonce, 8);
2888 ctx->counter = 0;
2891 void crypto_aead_init_ietf(crypto_aead_ctx *ctx,
2892 const u8 key[32], const u8 nonce[12])
2894 COPY(ctx->key , key , 32);
2895 COPY(ctx->nonce, nonce + 4, 8);
2896 ctx->counter = (u64)load32_le(nonce) << 32;
2899 void crypto_aead_write(crypto_aead_ctx *ctx, u8 *cipher_text, u8 mac[16],
2900 const u8 *ad, size_t ad_size,
2901 const u8 *plain_text, size_t text_size)
2903 u8 auth_key[64]; // the last 32 bytes are used for rekeying.
2904 crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter);
2905 crypto_chacha20_djb(cipher_text, plain_text, text_size,
2906 ctx->key, ctx->nonce, ctx->counter + 1);
2907 lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
2908 COPY(ctx->key, auth_key + 32, 32);
2909 WIPE_BUFFER(auth_key);
2912 int crypto_aead_read(crypto_aead_ctx *ctx, u8 *plain_text, const u8 mac[16],
2913 const u8 *ad, size_t ad_size,
2914 const u8 *cipher_text, size_t text_size)
2916 u8 auth_key[64]; // the last 32 bytes are used for rekeying.
2917 u8 real_mac[16];
2918 crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter);
2919 lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
2920 int mismatch = crypto_verify16(mac, real_mac);
2921 if (!mismatch) {
2922 crypto_chacha20_djb(plain_text, cipher_text, text_size,
2923 ctx->key, ctx->nonce, ctx->counter + 1);
2924 COPY(ctx->key, auth_key + 32, 32);
2926 WIPE_BUFFER(auth_key);
2927 WIPE_BUFFER(real_mac);
2928 return mismatch;
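// Note that each successful crypto_aead_write()/crypto_aead_read() call
// above replaces ctx->key with fresh key material, so an incremental
// session behaves like a ratchet: both sides must process the messages in
// the same order.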
2931 void crypto_aead_lock(u8 *cipher_text, u8 mac[16], const u8 key[32],
2932 const u8 nonce[24], const u8 *ad, size_t ad_size,
2933 const u8 *plain_text, size_t text_size)
2935 crypto_aead_ctx ctx;
2936 crypto_aead_init_x(&ctx, key, nonce);
2937 crypto_aead_write(&ctx, cipher_text, mac, ad, ad_size,
2938 plain_text, text_size);
2939 crypto_wipe(&ctx, sizeof(ctx));
2942 int crypto_aead_unlock(u8 *plain_text, const u8 mac[16], const u8 key[32],
2943 const u8 nonce[24], const u8 *ad, size_t ad_size,
2944 const u8 *cipher_text, size_t text_size)
2946 crypto_aead_ctx ctx;
2947 crypto_aead_init_x(&ctx, key, nonce);
2948 int mismatch = crypto_aead_read(&ctx, plain_text, mac, ad, ad_size,
2949 cipher_text, text_size);
2950 crypto_wipe(&ctx, sizeof(ctx));
2951 return mismatch;
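//
// Usage sketch (illustrative, not part of the library): one-shot
// authenticated encryption with the two functions above. `key` is a 32-byte
// secret, `nonce` a 24-byte value never reused with the same key (random
// nonces are fine at this size), and `plain`/`cipher` are buffers of
// `size` bytes. Passing 0, 0 means no associated data.
//
//     u8 mac[16];
//     crypto_aead_lock(cipher, mac, key, nonce, 0, 0, plain, size);
//     ...
//     if (crypto_aead_unlock(plain, mac, key, nonce, 0, 0, cipher, size)) {
//         // corrupted or forged message, reject it
//     }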
2954 #ifdef MONOCYPHER_CPP_NAMESPACE
2955 }
2956 #endif