1 // Monocypher version 4.0.2
2 //
3 // This file is dual-licensed. Choose whichever licence you want from
4 // the two licences listed below.
5 //
6 // The first licence is a regular 2-clause BSD licence. The second licence
7 // is the CC-0 from Creative Commons. It is intended to release Monocypher
8 // to the public domain. The BSD licence serves as a fallback option.
9 //
10 // SPDX-License-Identifier: BSD-2-Clause OR CC0-1.0
12 // ------------------------------------------------------------------------
14 // Copyright (c) 2017-2020, Loup Vaillant
15 // All rights reserved.
18 // Redistribution and use in source and binary forms, with or without
19 // modification, are permitted provided that the following conditions are
20 // met:
22 // 1. Redistributions of source code must retain the above copyright
23 // notice, this list of conditions and the following disclaimer.
25 // 2. Redistributions in binary form must reproduce the above copyright
26 // notice, this list of conditions and the following disclaimer in the
27 // documentation and/or other materials provided with the
28 // distribution.
30 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
42 // ------------------------------------------------------------------------
44 // Written in 2017-2020 by Loup Vaillant
46 // To the extent possible under law, the author(s) have dedicated all copyright
47 // and related neighboring rights to this software to the public domain
48 // worldwide. This software is distributed without any warranty.
50 // You should have received a copy of the CC0 Public Domain Dedication along
51 // with this software. If not, see
52 // <https://creativecommons.org/publicdomain/zero/1.0/>
54 #include "monocypher.h"
56 #ifdef MONOCYPHER_CPP_NAMESPACE
57 namespace MONOCYPHER_CPP_NAMESPACE {
58 #endif
60 /////////////////
61 /// Utilities ///
62 /////////////////
63 #define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
64 #define FOR(i, start, end) FOR_T(size_t, i, start, end)
65 #define COPY(dst, src, size) FOR(_i_, 0, size) (dst)[_i_] = (src)[_i_]
66 #define ZERO(buf, size) FOR(_i_, 0, size) (buf)[_i_] = 0
67 #define WIPE_CTX(ctx) crypto_wipe(ctx , sizeof(*(ctx)))
68 #define WIPE_BUFFER(buffer) crypto_wipe(buffer, sizeof(buffer))
69 #define MC_MIN(a, b) ((a) <= (b) ? (a) : (b))
70 #define MC_MAX(a, b) ((a) >= (b) ? (a) : (b))
72 typedef int8_t i8;
73 typedef uint8_t u8;
74 typedef int16_t i16;
75 typedef uint32_t u32;
76 typedef int32_t i32;
77 typedef int64_t i64;
78 typedef uint64_t u64;
80 static const u8 zero[128] = {0};
82 // returns the smallest non-negative integer y such that
83 // (x + y) % pow_2 == 0
84 // Basically, y is the "gap" missing to align x.
85 // Only works when pow_2 is a power of 2.
86 // Note: we use ~x+1 instead of -x to avoid compiler warnings
87 static size_t gap(size_t x, size_t pow_2)
89 return (~x + 1) & (pow_2 - 1);
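// Example: gap(13, 16) == 3, since 13 + 3 is the next multiple of 16;
// gap(16, 16) == 0, because 16 is already aligned.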
92 static u32 load24_le(const u8 s[3])
94 return
95 ((u32)s[0] << 0) |
96 ((u32)s[1] << 8) |
97 ((u32)s[2] << 16);
100 static u32 load32_le(const u8 s[4])
102 return
103 ((u32)s[0] << 0) |
104 ((u32)s[1] << 8) |
105 ((u32)s[2] << 16) |
106 ((u32)s[3] << 24);
109 static u64 load64_le(const u8 s[8])
111 return load32_le(s) | ((u64)load32_le(s+4) << 32);
114 static void store32_le(u8 out[4], u32 in)
116 out[0] = in & 0xff;
117 out[1] = (in >> 8) & 0xff;
118 out[2] = (in >> 16) & 0xff;
119 out[3] = (in >> 24) & 0xff;
122 static void store64_le(u8 out[8], u64 in)
124 store32_le(out , (u32)in );
125 store32_le(out + 4, in >> 32);
128 static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
129 FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
131 static void load64_le_buf (u64 *dst, const u8 *src, size_t size) {
132 FOR(i, 0, size) { dst[i] = load64_le(src + i*8); }
134 static void store32_le_buf(u8 *dst, const u32 *src, size_t size) {
135 FOR(i, 0, size) { store32_le(dst + i*4, src[i]); }
137 static void store64_le_buf(u8 *dst, const u64 *src, size_t size) {
138 FOR(i, 0, size) { store64_le(dst + i*8, src[i]); }
141 static u64 rotr64(u64 x, u64 n) { return (x >> n) ^ (x << (64 - n)); }
142 static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
144 static int neq0(u64 diff)
146 // constant time comparison to zero
147 // return diff != 0 ? -1 : 0
148 u64 half = (diff >> 32) | ((u32)diff);
149 return (1 & ((half - 1) >> 32)) - 1;
152 static u64 x16(const u8 a[16], const u8 b[16])
154 return (load64_le(a + 0) ^ load64_le(b + 0))
155 | (load64_le(a + 8) ^ load64_le(b + 8));
157 static u64 x32(const u8 a[32],const u8 b[32]){return x16(a,b)| x16(a+16, b+16);}
158 static u64 x64(const u8 a[64],const u8 b[64]){return x32(a,b)| x32(a+32, b+32);}
159 int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
160 int crypto_verify32(const u8 a[32], const u8 b[32]){ return neq0(x32(a, b)); }
161 int crypto_verify64(const u8 a[64], const u8 b[64]){ return neq0(x64(a, b)); }
163 void crypto_wipe(void *secret, size_t size)
165 volatile u8 *v_secret = (u8*)secret;
166 ZERO(v_secret, size);
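// Illustrative usage sketch (not part of Monocypher): checking a 16-byte
// authentication tag in constant time, then wiping the locally computed
// copy. The buffer names are hypothetical.
//
//     u8 expected[16];   /* tag we computed ourselves       */
//     u8 received[16];   /* tag that came with the message  */
//     if (crypto_verify16(expected, received) != 0) {
//         /* mismatch: reject the message */
//     }
//     crypto_wipe(expected, 16);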
169 /////////////////
170 /// Chacha 20 ///
171 /////////////////
172 #define QUARTERROUND(a, b, c, d) \
173 a += b; d = rotl32(d ^ a, 16); \
174 c += d; b = rotl32(b ^ c, 12); \
175 a += b; d = rotl32(d ^ a, 8); \
176 c += d; b = rotl32(b ^ c, 7)
178 static void chacha20_rounds(u32 out[16], const u32 in[16])
180 // The temporary variables make Chacha20 10% faster.
181 u32 t0 = in[ 0]; u32 t1 = in[ 1]; u32 t2 = in[ 2]; u32 t3 = in[ 3];
182 u32 t4 = in[ 4]; u32 t5 = in[ 5]; u32 t6 = in[ 6]; u32 t7 = in[ 7];
183 u32 t8 = in[ 8]; u32 t9 = in[ 9]; u32 t10 = in[10]; u32 t11 = in[11];
184 u32 t12 = in[12]; u32 t13 = in[13]; u32 t14 = in[14]; u32 t15 = in[15];
186 FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
187 QUARTERROUND(t0, t4, t8 , t12); // column 0
188 QUARTERROUND(t1, t5, t9 , t13); // column 1
189 QUARTERROUND(t2, t6, t10, t14); // column 2
190 QUARTERROUND(t3, t7, t11, t15); // column 3
191 QUARTERROUND(t0, t5, t10, t15); // diagonal 0
192 QUARTERROUND(t1, t6, t11, t12); // diagonal 1
193 QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
194 QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
196 out[ 0] = t0; out[ 1] = t1; out[ 2] = t2; out[ 3] = t3;
197 out[ 4] = t4; out[ 5] = t5; out[ 6] = t6; out[ 7] = t7;
198 out[ 8] = t8; out[ 9] = t9; out[10] = t10; out[11] = t11;
199 out[12] = t12; out[13] = t13; out[14] = t14; out[15] = t15;
202 static const u8 *chacha20_constant = (const u8*)"expand 32-byte k"; // 16 bytes
204 void crypto_chacha20_h(u8 out[32], const u8 key[32], const u8 in [16])
206 u32 block[16];
207 load32_le_buf(block , chacha20_constant, 4);
208 load32_le_buf(block + 4, key , 8);
209 load32_le_buf(block + 12, in , 4);
211 chacha20_rounds(block, block);
213 // prevent reversal of the rounds by revealing only half of the buffer.
214 store32_le_buf(out , block , 4); // constant
215 store32_le_buf(out+16, block+12, 4); // counter and nonce
216 WIPE_BUFFER(block);
219 u64 crypto_chacha20_djb(u8 *cipher_text, const u8 *plain_text,
220 size_t text_size, const u8 key[32], const u8 nonce[8],
221 u64 ctr)
223 u32 input[16];
224 load32_le_buf(input , chacha20_constant, 4);
225 load32_le_buf(input + 4, key , 8);
226 load32_le_buf(input + 14, nonce , 2);
227 input[12] = (u32) ctr;
228 input[13] = (u32)(ctr >> 32);
230 // Whole blocks
231 u32 pool[16];
232 size_t nb_blocks = text_size >> 6;
233 FOR (i, 0, nb_blocks) {
234 chacha20_rounds(pool, input);
235 if (plain_text != 0) {
236 FOR (j, 0, 16) {
237 u32 p = pool[j] + input[j];
238 store32_le(cipher_text, p ^ load32_le(plain_text));
239 cipher_text += 4;
240 plain_text += 4;
242 } else {
243 FOR (j, 0, 16) {
244 u32 p = pool[j] + input[j];
245 store32_le(cipher_text, p);
246 cipher_text += 4;
249 input[12]++;
250 if (input[12] == 0) {
251 input[13]++;
254 text_size &= 63;
256 // Last (incomplete) block
257 if (text_size > 0) {
258 if (plain_text == 0) {
259 plain_text = zero;
261 chacha20_rounds(pool, input);
262 u8 tmp[64];
263 FOR (i, 0, 16) {
264 store32_le(tmp + i*4, pool[i] + input[i]);
266 FOR (i, 0, text_size) {
267 cipher_text[i] = tmp[i] ^ plain_text[i];
269 WIPE_BUFFER(tmp);
271 ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);
273 WIPE_BUFFER(pool);
274 WIPE_BUFFER(input);
275 return ctr;
278 u32 crypto_chacha20_ietf(u8 *cipher_text, const u8 *plain_text,
279 size_t text_size,
280 const u8 key[32], const u8 nonce[12], u32 ctr)
282 u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
283 return (u32)crypto_chacha20_djb(cipher_text, plain_text, text_size,
284 key, nonce + 4, big_ctr);
287 u64 crypto_chacha20_x(u8 *cipher_text, const u8 *plain_text,
288 size_t text_size,
289 const u8 key[32], const u8 nonce[24], u64 ctr)
291 u8 sub_key[32];
292 crypto_chacha20_h(sub_key, key, nonce);
293 ctr = crypto_chacha20_djb(cipher_text, plain_text, text_size,
294 sub_key, nonce + 16, ctr);
295 WIPE_BUFFER(sub_key);
296 return ctr;
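// Illustrative usage sketch (not part of Monocypher): encrypting a buffer
// with XChaCha20. Decryption is the same call with the cipher text as
// input. Variable names are hypothetical; the nonce must never repeat for
// a given key, and XChaCha20 alone does not authenticate the message.
//
//     u8 key   [32];        /* secret, uniformly random          */
//     u8 nonce [24];        /* unique per message, may be public */
//     u8 plain [128] = {0};
//     u8 cipher[128];
//     crypto_chacha20_x(cipher, plain, sizeof(plain), key, nonce, 0);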
299 /////////////////
300 /// Poly 1305 ///
301 /////////////////
303 // h = (h + c) * r
304 // preconditions:
305 // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
306 // ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff
307 // end <= 1
308 // Postcondition:
309 // ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
310 static void poly_blocks(crypto_poly1305_ctx *ctx, const u8 *in,
311 size_t nb_blocks, unsigned end)
313 // Local all the things!
314 const u32 r0 = ctx->r[0];
315 const u32 r1 = ctx->r[1];
316 const u32 r2 = ctx->r[2];
317 const u32 r3 = ctx->r[3];
318 const u32 rr0 = (r0 >> 2) * 5; // lose 2 bits...
319 const u32 rr1 = (r1 >> 2) + r1; // rr1 == (r1 >> 2) * 5
320 const u32 rr2 = (r2 >> 2) + r2; // rr2 == (r2 >> 2) * 5
321 const u32 rr3 = (r3 >> 2) + r3; // rr3 == (r3 >> 2) * 5
322 const u32 rr4 = r0 & 3; // ...recover 2 bits
323 u32 h0 = ctx->h[0];
324 u32 h1 = ctx->h[1];
325 u32 h2 = ctx->h[2];
326 u32 h3 = ctx->h[3];
327 u32 h4 = ctx->h[4];
329 FOR (i, 0, nb_blocks) {
330 // h + c, without carry propagation
331 const u64 s0 = (u64)h0 + load32_le(in); in += 4;
332 const u64 s1 = (u64)h1 + load32_le(in); in += 4;
333 const u64 s2 = (u64)h2 + load32_le(in); in += 4;
334 const u64 s3 = (u64)h3 + load32_le(in); in += 4;
335 const u32 s4 = h4 + end;
337 // (h + c) * r, without carry propagation
338 const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0;
339 const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1;
340 const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2;
341 const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3;
342 const u32 x4 = s4*rr4;
344 // partial reduction modulo 2^130 - 5
345 const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
346 const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff);
347 const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32);
348 const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32);
349 const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32);
350 const u32 u4 = (u3 >> 32) + (u5 & 3); // u4 <= 4
352 // Update the hash
353 h0 = u0 & 0xffffffff;
354 h1 = u1 & 0xffffffff;
355 h2 = u2 & 0xffffffff;
356 h3 = u3 & 0xffffffff;
357 h4 = u4;
359 ctx->h[0] = h0;
360 ctx->h[1] = h1;
361 ctx->h[2] = h2;
362 ctx->h[3] = h3;
363 ctx->h[4] = h4;
366 void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
368 ZERO(ctx->h, 5); // Initial hash is zero
369 ctx->c_idx = 0;
370 // load r and pad (r has some of its bits cleared)
371 load32_le_buf(ctx->r , key , 4);
372 load32_le_buf(ctx->pad, key+16, 4);
373 FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
374 FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
377 void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
378 const u8 *message, size_t message_size)
380 // Avoid undefined NULL pointer increments with empty messages
381 if (message_size == 0) {
382 return;
385 // Align ourselves with block boundaries
386 size_t aligned = MC_MIN(gap(ctx->c_idx, 16), message_size);
387 FOR (i, 0, aligned) {
388 ctx->c[ctx->c_idx] = *message;
389 ctx->c_idx++;
390 message++;
391 message_size--;
394 // If block is complete, process it
395 if (ctx->c_idx == 16) {
396 poly_blocks(ctx, ctx->c, 1, 1);
397 ctx->c_idx = 0;
400 // Process the message block by block
401 size_t nb_blocks = message_size >> 4;
402 poly_blocks(ctx, message, nb_blocks, 1);
403 message += nb_blocks << 4;
404 message_size &= 15;
406 // remaining bytes (we never complete a block here)
407 FOR (i, 0, message_size) {
408 ctx->c[ctx->c_idx] = message[i];
409 ctx->c_idx++;
413 void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
415 // Process the last block (if any)
416 // We move the final 1 according to remaining input length
417 // (this will add less than 2^130 to the last input block)
418 if (ctx->c_idx != 0) {
419 ZERO(ctx->c + ctx->c_idx, 16 - ctx->c_idx);
420 ctx->c[ctx->c_idx] = 1;
421 poly_blocks(ctx, ctx->c, 1, 0);
424 // check if we should subtract 2^130-5 by performing the
425 // corresponding carry propagation.
426 u64 c = 5;
427 FOR (i, 0, 4) {
428 c += ctx->h[i];
429 c >>= 32;
431 c += ctx->h[4];
432 c = (c >> 2) * 5; // shift the carry back to the beginning
433 // c now indicates how many times we should subtract 2^130-5 (0 or 1)
434 FOR (i, 0, 4) {
435 c += (u64)ctx->h[i] + ctx->pad[i];
436 store32_le(mac + i*4, (u32)c);
437 c = c >> 32;
439 WIPE_CTX(ctx);
442 void crypto_poly1305(u8 mac[16], const u8 *message,
443 size_t message_size, const u8 key[32])
445 crypto_poly1305_ctx ctx;
446 crypto_poly1305_init (&ctx, key);
447 crypto_poly1305_update(&ctx, message, message_size);
448 crypto_poly1305_final (&ctx, mac);
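// Illustrative usage sketch (not part of Monocypher): producing a MAC.
// A Poly1305 key must be secret and used for one message only; AEAD
// constructions typically derive it from a ChaCha20 block. The names
// below are hypothetical.
//
//     u8 one_time_key[32]; /* fresh, secret, never reused */
//     u8 mac[16];
//     crypto_poly1305(mac, msg, msg_size, one_time_key);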
451 ////////////////
452 /// BLAKE2 b ///
453 ////////////////
454 static const u64 iv[8] = {
455 0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
456 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
457 0x510e527fade682d1, 0x9b05688c2b3e6c1f,
458 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
461 static void blake2b_compress(crypto_blake2b_ctx *ctx, int is_last_block)
463 static const u8 sigma[12][16] = {
464 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
465 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
466 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
467 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
468 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
469 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
470 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
471 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
472 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
473 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
474 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
475 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
478 // increment input offset
479 u64 *x = ctx->input_offset;
480 size_t y = ctx->input_idx;
481 x[0] += y;
482 if (x[0] < y) {
483 x[1]++;
486 // init work vector
487 u64 v0 = ctx->hash[0]; u64 v8 = iv[0];
488 u64 v1 = ctx->hash[1]; u64 v9 = iv[1];
489 u64 v2 = ctx->hash[2]; u64 v10 = iv[2];
490 u64 v3 = ctx->hash[3]; u64 v11 = iv[3];
491 u64 v4 = ctx->hash[4]; u64 v12 = iv[4] ^ ctx->input_offset[0];
492 u64 v5 = ctx->hash[5]; u64 v13 = iv[5] ^ ctx->input_offset[1];
493 u64 v6 = ctx->hash[6]; u64 v14 = iv[6] ^ (u64)~(is_last_block - 1);
494 u64 v7 = ctx->hash[7]; u64 v15 = iv[7];
496 // mangle work vector
497 u64 *input = ctx->input;
498 #define BLAKE2_G(a, b, c, d, x, y) \
499 a += b + x; d = rotr64(d ^ a, 32); \
500 c += d; b = rotr64(b ^ c, 24); \
501 a += b + y; d = rotr64(d ^ a, 16); \
502 c += d; b = rotr64(b ^ c, 63)
503 #define BLAKE2_ROUND(i) \
504 BLAKE2_G(v0, v4, v8 , v12, input[sigma[i][ 0]], input[sigma[i][ 1]]); \
505 BLAKE2_G(v1, v5, v9 , v13, input[sigma[i][ 2]], input[sigma[i][ 3]]); \
506 BLAKE2_G(v2, v6, v10, v14, input[sigma[i][ 4]], input[sigma[i][ 5]]); \
507 BLAKE2_G(v3, v7, v11, v15, input[sigma[i][ 6]], input[sigma[i][ 7]]); \
508 BLAKE2_G(v0, v5, v10, v15, input[sigma[i][ 8]], input[sigma[i][ 9]]); \
509 BLAKE2_G(v1, v6, v11, v12, input[sigma[i][10]], input[sigma[i][11]]); \
510 BLAKE2_G(v2, v7, v8 , v13, input[sigma[i][12]], input[sigma[i][13]]); \
511 BLAKE2_G(v3, v4, v9 , v14, input[sigma[i][14]], input[sigma[i][15]])
513 #ifdef BLAKE2_NO_UNROLLING
514 FOR (i, 0, 12) {
515 BLAKE2_ROUND(i);
517 #else
518 BLAKE2_ROUND(0); BLAKE2_ROUND(1); BLAKE2_ROUND(2); BLAKE2_ROUND(3);
519 BLAKE2_ROUND(4); BLAKE2_ROUND(5); BLAKE2_ROUND(6); BLAKE2_ROUND(7);
520 BLAKE2_ROUND(8); BLAKE2_ROUND(9); BLAKE2_ROUND(10); BLAKE2_ROUND(11);
521 #endif
523 // update hash
524 ctx->hash[0] ^= v0 ^ v8; ctx->hash[1] ^= v1 ^ v9;
525 ctx->hash[2] ^= v2 ^ v10; ctx->hash[3] ^= v3 ^ v11;
526 ctx->hash[4] ^= v4 ^ v12; ctx->hash[5] ^= v5 ^ v13;
527 ctx->hash[6] ^= v6 ^ v14; ctx->hash[7] ^= v7 ^ v15;
530 void crypto_blake2b_keyed_init(crypto_blake2b_ctx *ctx, size_t hash_size,
531 const u8 *key, size_t key_size)
533 // initial hash
534 COPY(ctx->hash, iv, 8);
535 ctx->hash[0] ^= 0x01010000 ^ (key_size << 8) ^ hash_size;
537 ctx->input_offset[0] = 0; // beginning of the input, no offset
538 ctx->input_offset[1] = 0; // beginning of the input, no offset
539 ctx->hash_size = hash_size;
540 ctx->input_idx = 0;
541 ZERO(ctx->input, 16);
543 // if there is a key, the first block is that key (padded with zeroes)
544 if (key_size > 0) {
545 u8 key_block[128] = {0};
546 COPY(key_block, key, key_size);
547 // same as calling crypto_blake2b_update(ctx, key_block , 128)
548 load64_le_buf(ctx->input, key_block, 16);
549 ctx->input_idx = 128;
553 void crypto_blake2b_init(crypto_blake2b_ctx *ctx, size_t hash_size)
555 crypto_blake2b_keyed_init(ctx, hash_size, 0, 0);
558 void crypto_blake2b_update(crypto_blake2b_ctx *ctx,
559 const u8 *message, size_t message_size)
561 // Avoid undefined NULL pointer increments with empty messages
562 if (message_size == 0) {
563 return;
566 // Align with word boundaries
567 if ((ctx->input_idx & 7) != 0) {
568 size_t nb_bytes = MC_MIN(gap(ctx->input_idx, 8), message_size);
569 size_t word = ctx->input_idx >> 3;
570 size_t byte = ctx->input_idx & 7;
571 FOR (i, 0, nb_bytes) {
572 ctx->input[word] |= (u64)message[i] << ((byte + i) << 3);
574 ctx->input_idx += nb_bytes;
575 message += nb_bytes;
576 message_size -= nb_bytes;
579 // Align with block boundaries (faster than byte by byte)
580 if ((ctx->input_idx & 127) != 0) {
581 size_t nb_words = MC_MIN(gap(ctx->input_idx, 128), message_size) >> 3;
582 load64_le_buf(ctx->input + (ctx->input_idx >> 3), message, nb_words);
583 ctx->input_idx += nb_words << 3;
584 message += nb_words << 3;
585 message_size -= nb_words << 3;
588 // Process block by block
589 size_t nb_blocks = message_size >> 7;
590 FOR (i, 0, nb_blocks) {
591 if (ctx->input_idx == 128) {
592 blake2b_compress(ctx, 0);
594 load64_le_buf(ctx->input, message, 16);
595 message += 128;
596 ctx->input_idx = 128;
598 message_size &= 127;
600 if (message_size != 0) {
601 // Compress block & flush input buffer as needed
602 if (ctx->input_idx == 128) {
603 blake2b_compress(ctx, 0);
604 ctx->input_idx = 0;
606 if (ctx->input_idx == 0) {
607 ZERO(ctx->input, 16);
609 // Fill remaining words (faster than byte by byte)
610 size_t nb_words = message_size >> 3;
611 load64_le_buf(ctx->input, message, nb_words);
612 ctx->input_idx += nb_words << 3;
613 message += nb_words << 3;
614 message_size -= nb_words << 3;
616 // Fill remaining bytes
617 FOR (i, 0, message_size) {
618 size_t word = ctx->input_idx >> 3;
619 size_t byte = ctx->input_idx & 7;
620 ctx->input[word] |= (u64)message[i] << (byte << 3);
621 ctx->input_idx++;
626 void crypto_blake2b_final(crypto_blake2b_ctx *ctx, u8 *hash)
628 blake2b_compress(ctx, 1); // compress the last block
629 size_t hash_size = MC_MIN(ctx->hash_size, 64);
630 size_t nb_words = hash_size >> 3;
631 store64_le_buf(hash, ctx->hash, nb_words);
632 FOR (i, nb_words << 3, hash_size) {
633 hash[i] = (ctx->hash[i >> 3] >> (8 * (i & 7))) & 0xff;
635 WIPE_CTX(ctx);
638 void crypto_blake2b_keyed(u8 *hash, size_t hash_size,
639 const u8 *key, size_t key_size,
640 const u8 *message, size_t message_size)
642 crypto_blake2b_ctx ctx;
643 crypto_blake2b_keyed_init(&ctx, hash_size, key, key_size);
644 crypto_blake2b_update (&ctx, message, message_size);
645 crypto_blake2b_final (&ctx, hash);
648 void crypto_blake2b(u8 *hash, size_t hash_size, const u8 *msg, size_t msg_size)
650 crypto_blake2b_keyed(hash, hash_size, 0, 0, msg, msg_size);
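// Illustrative usage sketch (not part of Monocypher): hashing a message
// in one shot or incrementally. Variable names are hypothetical.
//
//     u8 digest[64];
//     crypto_blake2b(digest, sizeof(digest), msg, msg_size);
//
//     crypto_blake2b_ctx ctx;
//     crypto_blake2b_init  (&ctx, 64);
//     crypto_blake2b_update(&ctx, msg, msg_size);
//     crypto_blake2b_final (&ctx, digest);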
653 //////////////
654 /// Argon2 ///
655 //////////////
656 // references to R, Z, Q etc. come from the spec
658 // Argon2 operates on 1024 byte blocks.
659 typedef struct { u64 a[128]; } blk;
661 // updates a BLAKE2 hash with a 32 bit word, little endian.
662 static void blake_update_32(crypto_blake2b_ctx *ctx, u32 input)
664 u8 buf[4];
665 store32_le(buf, input);
666 crypto_blake2b_update(ctx, buf, 4);
667 WIPE_BUFFER(buf);
670 static void blake_update_32_buf(crypto_blake2b_ctx *ctx,
671 const u8 *buf, u32 size)
673 blake_update_32(ctx, size);
674 crypto_blake2b_update(ctx, buf, size);
678 static void copy_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] = in->a[i];}
679 static void xor_block(blk *o,const blk*in){FOR(i, 0, 128) o->a[i] ^= in->a[i];}
681 // Hash with a virtually unlimited digest size.
682 // Doesn't extract more entropy than the base hash function.
683 // Mainly used for filling a whole kilobyte block with pseudo-random bytes.
684 // (One could use a stream cipher with a seed hash as the key, but
685 // this would introduce another dependency, and another point of failure.)
686 static void extended_hash(u8 *digest, u32 digest_size,
687 const u8 *input , u32 input_size)
689 crypto_blake2b_ctx ctx;
690 crypto_blake2b_init (&ctx, MC_MIN(digest_size, 64));
691 blake_update_32 (&ctx, digest_size);
692 crypto_blake2b_update(&ctx, input, input_size);
693 crypto_blake2b_final (&ctx, digest);
695 if (digest_size > 64) {
696 // the conversion to u64 avoids integer overflow on
697 // ludicrously big hash sizes.
698 u32 r = (u32)(((u64)digest_size + 31) >> 5) - 2;
699 u32 i = 1;
700 u32 in = 0;
701 u32 out = 32;
702 while (i < r) {
703 // Input and output overlap. This is intentional
704 crypto_blake2b(digest + out, 64, digest + in, 64);
705 i += 1;
706 in += 32;
707 out += 32;
709 crypto_blake2b(digest + out, digest_size - (32 * r), digest + in , 64);
713 #define LSB(x) ((u64)(u32)x)
714 #define G(a, b, c, d) \
715 a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 32); \
716 c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 24); \
717 a += b + ((LSB(a) * LSB(b)) << 1); d ^= a; d = rotr64(d, 16); \
718 c += d + ((LSB(c) * LSB(d)) << 1); b ^= c; b = rotr64(b, 63)
719 #define ROUND(v0, v1, v2, v3, v4, v5, v6, v7, \
720 v8, v9, v10, v11, v12, v13, v14, v15) \
721 G(v0, v4, v8, v12); G(v1, v5, v9, v13); \
722 G(v2, v6, v10, v14); G(v3, v7, v11, v15); \
723 G(v0, v5, v10, v15); G(v1, v6, v11, v12); \
724 G(v2, v7, v8, v13); G(v3, v4, v9, v14)
726 // Core of the compression function G. Computes Z from R in place.
727 static void g_rounds(blk *b)
729 // column rounds (work_block = Q)
730 for (int i = 0; i < 128; i += 16) {
731 ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 2], b->a[i+ 3],
732 b->a[i+ 4], b->a[i+ 5], b->a[i+ 6], b->a[i+ 7],
733 b->a[i+ 8], b->a[i+ 9], b->a[i+10], b->a[i+11],
734 b->a[i+12], b->a[i+13], b->a[i+14], b->a[i+15]);
736 // row rounds (b = Z)
737 for (int i = 0; i < 16; i += 2) {
738 ROUND(b->a[i ], b->a[i+ 1], b->a[i+ 16], b->a[i+ 17],
739 b->a[i+32], b->a[i+33], b->a[i+ 48], b->a[i+ 49],
740 b->a[i+64], b->a[i+65], b->a[i+ 80], b->a[i+ 81],
741 b->a[i+96], b->a[i+97], b->a[i+112], b->a[i+113]);
745 const crypto_argon2_extras crypto_argon2_no_extras = { 0, 0, 0, 0 };
747 void crypto_argon2(u8 *hash, u32 hash_size, void *work_area,
748 crypto_argon2_config config,
749 crypto_argon2_inputs inputs,
750 crypto_argon2_extras extras)
752 const u32 segment_size = config.nb_blocks / config.nb_lanes / 4;
753 const u32 lane_size = segment_size * 4;
754 const u32 nb_blocks = lane_size * config.nb_lanes; // rounding down
756 // work area seen as blocks (must be suitably aligned)
757 blk *blocks = (blk*)work_area;
759 u8 initial_hash[72]; // 64 bytes plus 2 words for future hashes
760 crypto_blake2b_ctx ctx;
761 crypto_blake2b_init (&ctx, 64);
762 blake_update_32 (&ctx, config.nb_lanes ); // p: number of "threads"
763 blake_update_32 (&ctx, hash_size);
764 blake_update_32 (&ctx, config.nb_blocks);
765 blake_update_32 (&ctx, config.nb_passes);
766 blake_update_32 (&ctx, 0x13); // v: version number
767 blake_update_32 (&ctx, config.algorithm); // y: Argon2i, Argon2d...
768 blake_update_32_buf (&ctx, inputs.pass, inputs.pass_size);
769 blake_update_32_buf (&ctx, inputs.salt, inputs.salt_size);
770 blake_update_32_buf (&ctx, extras.key, extras.key_size);
771 blake_update_32_buf (&ctx, extras.ad, extras.ad_size);
772 crypto_blake2b_final(&ctx, initial_hash); // fill 64 first bytes only
774 // fill first 2 blocks of each lane
775 u8 hash_area[1024];
776 FOR_T(u32, l, 0, config.nb_lanes) {
777 FOR_T(u32, i, 0, 2) {
778 store32_le(initial_hash + 64, i); // first additional word
779 store32_le(initial_hash + 68, l); // second additional word
780 extended_hash(hash_area, 1024, initial_hash, 72);
781 load64_le_buf(blocks[l * lane_size + i].a, hash_area, 128);
785 WIPE_BUFFER(initial_hash);
786 WIPE_BUFFER(hash_area);
789 // Argon2i and Argon2id start with constant time indexing
790 int constant_time = config.algorithm != CRYPTO_ARGON2_D;
792 // Fill (and re-fill) the rest of the blocks
794 // Note: even though each segment within the same slice can be
795 // computed in parallel, (one thread per lane), we are computing
796 // them sequentially, because Monocypher doesn't support threads.
798 // Yet optimal performance (and therefore security) requires one
799 // thread per lane. The only reason Monocypher supports multiple
800 // lanes is compatibility.
801 blk tmp;
802 FOR_T(u32, pass, 0, config.nb_passes) {
803 FOR_T(u32, slice, 0, 4) {
804 // On the first slice of the first pass,
805 // blocks 0 and 1 are already filled, hence pass_offset.
806 u32 pass_offset = pass == 0 && slice == 0 ? 2 : 0;
807 u32 slice_offset = slice * segment_size;
809 // Argon2id switches back to non-constant time indexing
810 // after the first two slices of the first pass
811 if (slice == 2 && config.algorithm == CRYPTO_ARGON2_ID) {
812 constant_time = 0;
815 // Each iteration of the following loop may be performed in
816 // a separate thread. All segments must be fully completed
817 // before we start filling the next slice.
818 FOR_T(u32, segment, 0, config.nb_lanes) {
819 blk index_block;
820 u32 index_ctr = 1;
821 FOR_T (u32, block, pass_offset, segment_size) {
822 // Current and previous blocks
823 u32 lane_offset = segment * lane_size;
824 blk *segment_start = blocks + lane_offset + slice_offset;
825 blk *current = segment_start + block;
826 blk *previous =
827 block == 0 && slice_offset == 0
828 ? segment_start + lane_size - 1
829 : segment_start + block - 1;
831 u64 index_seed;
832 if (constant_time) {
833 if (block == pass_offset || (block % 128) == 0) {
834 // Fill or refresh deterministic indices block
836 // seed the beginning of the block...
837 ZERO(index_block.a, 128);
838 index_block.a[0] = pass;
839 index_block.a[1] = segment;
840 index_block.a[2] = slice;
841 index_block.a[3] = nb_blocks;
842 index_block.a[4] = config.nb_passes;
843 index_block.a[5] = config.algorithm;
844 index_block.a[6] = index_ctr;
845 index_ctr++;
847 // ... then shuffle it
848 copy_block(&tmp, &index_block);
849 g_rounds (&index_block);
850 xor_block (&index_block, &tmp);
851 copy_block(&tmp, &index_block);
852 g_rounds (&index_block);
853 xor_block (&index_block, &tmp);
855 index_seed = index_block.a[block % 128];
856 } else {
857 index_seed = previous->a[0];
860 // Establish the reference set. *Approximately* comprises:
861 // - The last 3 slices (if they exist yet)
862 // - The already constructed blocks in the current segment
863 u32 next_slice = ((slice + 1) % 4) * segment_size;
864 u32 window_start = pass == 0 ? 0 : next_slice;
865 u32 nb_segments = pass == 0 ? slice : 3;
866 u64 lane =
867 pass == 0 && slice == 0
868 ? segment
869 : (index_seed >> 32) % config.nb_lanes;
870 u32 window_size =
871 nb_segments * segment_size +
872 (lane == segment ? block-1 :
873 block == 0 ? (u32)-1 : 0);
875 // Find reference block
876 u64 j1 = index_seed & 0xffffffff; // block selector
877 u64 x = (j1 * j1) >> 32;
878 u64 y = (window_size * x) >> 32;
879 u64 z = (window_size - 1) - y;
880 u64 ref = (window_start + z) % lane_size;
881 u32 index = lane * lane_size + (u32)ref;
882 blk *reference = blocks + index;
884 // Shuffle the previous & reference block
885 // into the current block
886 copy_block(&tmp, previous);
887 xor_block (&tmp, reference);
888 if (pass == 0) { copy_block(current, &tmp); }
889 else { xor_block (current, &tmp); }
890 g_rounds (&tmp);
891 xor_block (current, &tmp);
897 // Wipe temporary block
898 volatile u64* p = tmp.a;
899 ZERO(p, 128);
901 // XOR last blocks of each lane
902 blk *last_block = blocks + lane_size - 1;
903 FOR_T (u32, lane, 1, config.nb_lanes) {
904 blk *next_block = last_block + lane_size;
905 xor_block(next_block, last_block);
906 last_block = next_block;
909 // Serialize last block
910 u8 final_block[1024];
911 store64_le_buf(final_block, last_block->a, 128);
913 // Wipe work area
914 p = (u64*)work_area;
915 ZERO(p, 128 * nb_blocks);
917 // Hash the very last block with H' into the output hash
918 extended_hash(hash, hash_size, final_block, 1024);
919 WIPE_BUFFER(final_block);
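// Illustrative usage sketch (not part of Monocypher): hashing a password
// with Argon2id. The work area must provide nb_blocks kilobytes; the
// parameter values and variable names below are hypothetical examples,
// not recommendations.
//
//     crypto_argon2_config config;
//     config.algorithm = CRYPTO_ARGON2_ID;
//     config.nb_blocks = 100000;          /* ~100 MiB of work area   */
//     config.nb_passes = 3;
//     config.nb_lanes  = 1;
//     crypto_argon2_inputs inputs;
//     inputs.pass      = (const u8 *)password;
//     inputs.pass_size = password_size;
//     inputs.salt      = salt;            /* at least 8 bytes, unique */
//     inputs.salt_size = 16;
//     void *work_area = malloc((size_t)config.nb_blocks * 1024); /* <stdlib.h> */
//     u8 hash[32];
//     crypto_argon2(hash, sizeof(hash), work_area,
//                   config, inputs, crypto_argon2_no_extras);
//     free(work_area);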
922 ////////////////////////////////////
923 /// Arithmetic modulo 2^255 - 19 ///
924 ////////////////////////////////////
925 // Originally taken from SUPERCOP's ref10 implementation.
926 // A bit bigger than TweetNaCl, over 4 times faster.
928 // field element
929 typedef i32 fe[10];
931 // field constants
933 // fe_one : 1
934 // sqrtm1 : sqrt(-1)
935 // d : -121665 / 121666
936 // D2 : 2 * -121665 / 121666
937 // lop_x, lop_y: low order point in Edwards coordinates
938 // ufactor : -sqrt(-1) * 2
939 // A2 : 486662^2 (A squared)
940 static const fe fe_one = {1};
941 static const fe sqrtm1 = {
942 -32595792, -7943725, 9377950, 3500415, 12389472,
943 -272473, -25146209, -2005654, 326686, 11406482,
945 static const fe d = {
946 -10913610, 13857413, -15372611, 6949391, 114729,
947 -8787816, -6275908, -3247719, -18696448, -12055116,
949 static const fe D2 = {
950 -21827239, -5839606, -30745221, 13898782, 229458,
951 15978800, -12551817, -6495438, 29715968, 9444199,
953 static const fe lop_x = {
954 21352778, 5345713, 4660180, -8347857, 24143090,
955 14568123, 30185756, -12247770, -33528939, 8345319,
957 static const fe lop_y = {
958 -6952922, -1265500, 6862341, -7057498, -4037696,
959 -5447722, 31680899, -15325402, -19365852, 1569102,
961 static const fe ufactor = {
962 -1917299, 15887451, -18755900, -7000830, -24778944,
963 544946, -16816446, 4011309, -653372, 10741468,
965 static const fe A2 = {
966 12721188, 3529, 0, 0, 0, 0, 0, 0, 0, 0,
969 static void fe_0(fe h) { ZERO(h , 10); }
970 static void fe_1(fe h) { h[0] = 1; ZERO(h+1, 9); }
972 static void fe_copy(fe h,const fe f ){FOR(i,0,10) h[i] = f[i]; }
973 static void fe_neg (fe h,const fe f ){FOR(i,0,10) h[i] = -f[i]; }
974 static void fe_add (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] + g[i];}
975 static void fe_sub (fe h,const fe f,const fe g){FOR(i,0,10) h[i] = f[i] - g[i];}
977 static void fe_cswap(fe f, fe g, int b)
979 i32 mask = -b; // -1 = 0xffffffff
980 FOR (i, 0, 10) {
981 i32 x = (f[i] ^ g[i]) & mask;
982 f[i] = f[i] ^ x;
983 g[i] = g[i] ^ x;
987 static void fe_ccopy(fe f, const fe g, int b)
989 i32 mask = -b; // -1 = 0xffffffff
990 FOR (i, 0, 10) {
991 i32 x = (f[i] ^ g[i]) & mask;
992 f[i] = f[i] ^ x;
997 // Signed carry propagation
998 // ------------------------
1000 // Let t be a number. It can be uniquely decomposed thus:
1002 // t = h*2^26 + l
1003 // such that -2^25 <= l < 2^25
1005 // Let c = (t + 2^25) / 2^26 (rounded down)
1006 // c = (h*2^26 + l + 2^25) / 2^26 (rounded down)
1007 // c = h + (l + 2^25) / 2^26 (rounded down)
1008 // c = h (exactly)
1009 // Because 0 <= l + 2^25 < 2^26
1011 // Let u = t - c*2^26
1012 // u = h*2^26 + l - h*2^26
1013 // u = l
1014 // Therefore, -2^25 <= u < 2^25
1016 // Additionally, if |t| < x, then |h| < x/2^26 (rounded down)
1018 // Notations:
1019 // - In C, 1<<25 means 2^25.
1020 // - In C, x>>25 means floor(x / (2^25)).
1021 // - All of the above applies with 25 & 24 as well as 26 & 25.
1024 // Note on negative right shifts
1025 // -----------------------------
1027 // In C, x >> n, where x is a negative integer, is implementation
1028 // defined. In practice, all platforms do arithmetic shift, which is
1029 // equivalent to division by 2^26, rounded down. Some compilers, like
1030 // GCC, even guarantee it.
1032 // If we ever stumble upon a platform that does not propagate the sign
1033 // bit (we won't), visible failures will show at the slightest test, and
1034 // the signed shifts can be replaced by the following:
1036 // typedef struct { i64 x:39; } s25;
1037 // typedef struct { i64 x:38; } s26;
1038 // i64 shift25(i64 x) { s25 s; s.x = ((u64)x)>>25; return s.x; }
1039 // i64 shift26(i64 x) { s26 s; s.x = ((u64)x)>>26; return s.x; }
1041 // Current compilers cannot optimise this, causing a 30% drop in
1042 // performance. Fairly expensive for something that never happens.
1045 // Precondition
1046 // ------------
1048 // |t0| < 2^63
1049 // |t1|..|t9| < 2^62
1051 // Algorithm
1052 // ---------
1053 // c = t0 + 2^25 / 2^26 -- |c| <= 2^36
1054 // t0 -= c * 2^26 -- |t0| <= 2^25
1055 // t1 += c -- |t1| <= 2^63
1057 // c = t4 + 2^25 / 2^26 -- |c| <= 2^36
1058 // t4 -= c * 2^26 -- |t4| <= 2^25
1059 // t5 += c -- |t5| <= 2^63
1061 // c = t1 + 2^24 / 2^25 -- |c| <= 2^38
1062 // t1 -= c * 2^25 -- |t1| <= 2^24
1063 // t2 += c -- |t2| <= 2^63
1065 // c = t5 + 2^24 / 2^25 -- |c| <= 2^38
1066 // t5 -= c * 2^25 -- |t5| <= 2^24
1067 // t6 += c -- |t6| <= 2^63
1069 // c = t2 + 2^25 / 2^26 -- |c| <= 2^37
1070 // t2 -= c * 2^26 -- |t2| <= 2^25 < 1.1 * 2^25 (final t2)
1071 // t3 += c -- |t3| <= 2^63
1073 // c = t6 + 2^25 / 2^26 -- |c| <= 2^37
1074 // t6 -= c * 2^26 -- |t6| <= 2^25 < 1.1 * 2^25 (final t6)
1075 // t7 += c -- |t7| <= 2^63
1077 // c = t3 + 2^24 / 2^25 -- |c| <= 2^38
1078 // t3 -= c * 2^25 -- |t3| <= 2^24 < 1.1 * 2^24 (final t3)
1079 // t4 += c -- |t4| <= 2^25 + 2^38 < 2^39
1081 // c = t7 + 2^24 / 2^25 -- |c| <= 2^38
1082 // t7 -= c * 2^25 -- |t7| <= 2^24 < 1.1 * 2^24 (final t7)
1083 // t8 += c -- |t8| <= 2^63
1085 // c = t4 + 2^25 / 2^26 -- |c| <= 2^13
1086 // t4 -= c * 2^26 -- |t4| <= 2^25 < 1.1 * 2^25 (final t4)
1087 // t5 += c -- |t5| <= 2^24 + 2^13 < 1.1 * 2^24 (final t5)
1089 // c = t8 + 2^25 / 2^26 -- |c| <= 2^37
1090 // t8 -= c * 2^26 -- |t8| <= 2^25 < 1.1 * 2^25 (final t8)
1091 // t9 += c -- |t9| <= 2^63
1093 // c = t9 + 2^24 / 2^25 -- |c| <= 2^38
1094 // t9 -= c * 2^25 -- |t9| <= 2^24 < 1.1 * 2^24 (final t9)
1095 // t0 += c * 19 -- |t0| <= 2^25 + 2^38*19 < 2^44
1097 // c = t0 + 2^25 / 2^26 -- |c| <= 2^18
1098 // t0 -= c * 2^26 -- |t0| <= 2^25 < 1.1 * 2^25 (final t0)
1099 // t1 += c -- |t1| <= 2^24 + 2^18 < 1.1 * 2^24 (final t1)
1101 // Postcondition
1102 // -------------
1103 // |t0|, |t2|, |t4|, |t6|, |t8| < 1.1 * 2^25
1104 // |t1|, |t3|, |t5|, |t7|, |t9| < 1.1 * 2^24
1105 #define FE_CARRY \
1106 i64 c; \
1107 c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
1108 c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
1109 c = (t1 + ((i64)1<<24)) >> 25; t1 -= c * ((i64)1 << 25); t2 += c; \
1110 c = (t5 + ((i64)1<<24)) >> 25; t5 -= c * ((i64)1 << 25); t6 += c; \
1111 c = (t2 + ((i64)1<<25)) >> 26; t2 -= c * ((i64)1 << 26); t3 += c; \
1112 c = (t6 + ((i64)1<<25)) >> 26; t6 -= c * ((i64)1 << 26); t7 += c; \
1113 c = (t3 + ((i64)1<<24)) >> 25; t3 -= c * ((i64)1 << 25); t4 += c; \
1114 c = (t7 + ((i64)1<<24)) >> 25; t7 -= c * ((i64)1 << 25); t8 += c; \
1115 c = (t4 + ((i64)1<<25)) >> 26; t4 -= c * ((i64)1 << 26); t5 += c; \
1116 c = (t8 + ((i64)1<<25)) >> 26; t8 -= c * ((i64)1 << 26); t9 += c; \
1117 c = (t9 + ((i64)1<<24)) >> 25; t9 -= c * ((i64)1 << 25); t0 += c * 19; \
1118 c = (t0 + ((i64)1<<25)) >> 26; t0 -= c * ((i64)1 << 26); t1 += c; \
1119 h[0]=(i32)t0; h[1]=(i32)t1; h[2]=(i32)t2; h[3]=(i32)t3; h[4]=(i32)t4; \
1120 h[5]=(i32)t5; h[6]=(i32)t6; h[7]=(i32)t7; h[8]=(i32)t8; h[9]=(i32)t9
1122 // Decodes a field element from a byte buffer.
1123 // mask specifies how many bits we ignore.
1124 // Traditionally we ignore 1. It's useful for EdDSA,
1125 // which uses that bit to denote the sign of x.
1126 // Elligator however uses positive representatives,
1127 // which means ignoring 2 bits instead.
1128 static void fe_frombytes_mask(fe h, const u8 s[32], unsigned nb_mask)
1130 u32 mask = 0xffffff >> nb_mask;
1131 i64 t0 = load32_le(s); // t0 < 2^32
1132 i64 t1 = load24_le(s + 4) << 6; // t1 < 2^30
1133 i64 t2 = load24_le(s + 7) << 5; // t2 < 2^29
1134 i64 t3 = load24_le(s + 10) << 3; // t3 < 2^27
1135 i64 t4 = load24_le(s + 13) << 2; // t4 < 2^26
1136 i64 t5 = load32_le(s + 16); // t5 < 2^32
1137 i64 t6 = load24_le(s + 20) << 7; // t6 < 2^31
1138 i64 t7 = load24_le(s + 23) << 5; // t7 < 2^29
1139 i64 t8 = load24_le(s + 26) << 4; // t8 < 2^28
1140 i64 t9 = (load24_le(s + 29) & mask) << 2; // t9 < 2^25
1141 FE_CARRY; // Carry precondition OK
1144 static void fe_frombytes(fe h, const u8 s[32])
1146 fe_frombytes_mask(h, s, 1);
1150 // Precondition
1151 // |h[0]|, |h[2]|, |h[4]|, |h[6]|, |h[8]| < 1.1 * 2^25
1152 // |h[1]|, |h[3]|, |h[5]|, |h[7]|, |h[9]| < 1.1 * 2^24
1154 // Therefore, |h| < 2^255-19
1155 // There are two possibilities:
1157 // - If h is positive, all we need to do is reduce its individual
1158 // limbs down to their tight positive range.
1159 // - If h is negative, we also need to add 2^255-19 to it.
1160 // Or just remove 19 and chop off any excess bit.
1161 static void fe_tobytes(u8 s[32], const fe h)
1163 i32 t[10];
1164 COPY(t, h, 10);
1165 i32 q = (19 * t[9] + (((i32) 1) << 24)) >> 25;
1166 // |t9| < 1.1 * 2^24
1167 // -1.1 * 2^24 < t9 < 1.1 * 2^24
1168 // -21 * 2^24 < 19 * t9 < 21 * 2^24
1169 // -2^29 < 19 * t9 + 2^24 < 2^29
1170 // -2^29 / 2^25 < (19 * t9 + 2^24) / 2^25 < 2^29 / 2^25
1171 // -16 < (19 * t9 + 2^24) / 2^25 < 16
1172 FOR (i, 0, 5) {
1173 q += t[2*i ]; q >>= 26; // q = 0 or -1
1174 q += t[2*i+1]; q >>= 25; // q = 0 or -1
1176 // q = 0 iff h >= 0
1177 // q = -1 iff h < 0
1178 // Adding q * 19 to h reduces h to its proper range.
1179 q *= 19; // Shift carry back to the beginning
1180 FOR (i, 0, 5) {
1181 t[i*2 ] += q; q = t[i*2 ] >> 26; t[i*2 ] -= q * ((i32)1 << 26);
1182 t[i*2+1] += q; q = t[i*2+1] >> 25; t[i*2+1] -= q * ((i32)1 << 25);
1184 // h is now fully reduced, and q represents the excess bit.
1186 store32_le(s + 0, ((u32)t[0] >> 0) | ((u32)t[1] << 26));
1187 store32_le(s + 4, ((u32)t[1] >> 6) | ((u32)t[2] << 19));
1188 store32_le(s + 8, ((u32)t[2] >> 13) | ((u32)t[3] << 13));
1189 store32_le(s + 12, ((u32)t[3] >> 19) | ((u32)t[4] << 6));
1190 store32_le(s + 16, ((u32)t[5] >> 0) | ((u32)t[6] << 25));
1191 store32_le(s + 20, ((u32)t[6] >> 7) | ((u32)t[7] << 19));
1192 store32_le(s + 24, ((u32)t[7] >> 13) | ((u32)t[8] << 12));
1193 store32_le(s + 28, ((u32)t[8] >> 20) | ((u32)t[9] << 6));
1195 WIPE_BUFFER(t);
1198 // Precondition
1199 // -------------
1200 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1201 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1203 // |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
1204 // |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
1205 static void fe_mul_small(fe h, const fe f, i32 g)
1207 i64 t0 = f[0] * (i64) g; i64 t1 = f[1] * (i64) g;
1208 i64 t2 = f[2] * (i64) g; i64 t3 = f[3] * (i64) g;
1209 i64 t4 = f[4] * (i64) g; i64 t5 = f[5] * (i64) g;
1210 i64 t6 = f[6] * (i64) g; i64 t7 = f[7] * (i64) g;
1211 i64 t8 = f[8] * (i64) g; i64 t9 = f[9] * (i64) g;
1212 // |t0|, |t2|, |t4|, |t6|, |t8| < 1.65 * 2^26 * 2^31 < 2^58
1213 // |t1|, |t3|, |t5|, |t7|, |t9| < 1.65 * 2^25 * 2^31 < 2^57
1215 FE_CARRY; // Carry precondition OK
1218 // Precondition
1219 // -------------
1220 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1221 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1223 // |g0|, |g2|, |g4|, |g6|, |g8| < 1.65 * 2^26
1224 // |g1|, |g3|, |g5|, |g7|, |g9| < 1.65 * 2^25
1225 static void fe_mul(fe h, const fe f, const fe g)
1227 // Everything is unrolled and put in temporary variables.
1228 // We could roll the loop, but that would make curve25519 twice as slow.
1229 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1230 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1231 i32 g0 = g[0]; i32 g1 = g[1]; i32 g2 = g[2]; i32 g3 = g[3]; i32 g4 = g[4];
1232 i32 g5 = g[5]; i32 g6 = g[6]; i32 g7 = g[7]; i32 g8 = g[8]; i32 g9 = g[9];
1233 i32 F1 = f1*2; i32 F3 = f3*2; i32 F5 = f5*2; i32 F7 = f7*2; i32 F9 = f9*2;
1234 i32 G1 = g1*19; i32 G2 = g2*19; i32 G3 = g3*19;
1235 i32 G4 = g4*19; i32 G5 = g5*19; i32 G6 = g6*19;
1236 i32 G7 = g7*19; i32 G8 = g8*19; i32 G9 = g9*19;
1237 // |F1|, |F3|, |F5|, |F7|, |F9| < 1.65 * 2^26
1238 // |G0|, |G2|, |G4|, |G6|, |G8| < 2^31
1239 // |G1|, |G3|, |G5|, |G7|, |G9| < 2^30
1241 i64 t0 = f0*(i64)g0 + F1*(i64)G9 + f2*(i64)G8 + F3*(i64)G7 + f4*(i64)G6
1242 + F5*(i64)G5 + f6*(i64)G4 + F7*(i64)G3 + f8*(i64)G2 + F9*(i64)G1;
1243 i64 t1 = f0*(i64)g1 + f1*(i64)g0 + f2*(i64)G9 + f3*(i64)G8 + f4*(i64)G7
1244 + f5*(i64)G6 + f6*(i64)G5 + f7*(i64)G4 + f8*(i64)G3 + f9*(i64)G2;
1245 i64 t2 = f0*(i64)g2 + F1*(i64)g1 + f2*(i64)g0 + F3*(i64)G9 + f4*(i64)G8
1246 + F5*(i64)G7 + f6*(i64)G6 + F7*(i64)G5 + f8*(i64)G4 + F9*(i64)G3;
1247 i64 t3 = f0*(i64)g3 + f1*(i64)g2 + f2*(i64)g1 + f3*(i64)g0 + f4*(i64)G9
1248 + f5*(i64)G8 + f6*(i64)G7 + f7*(i64)G6 + f8*(i64)G5 + f9*(i64)G4;
1249 i64 t4 = f0*(i64)g4 + F1*(i64)g3 + f2*(i64)g2 + F3*(i64)g1 + f4*(i64)g0
1250 + F5*(i64)G9 + f6*(i64)G8 + F7*(i64)G7 + f8*(i64)G6 + F9*(i64)G5;
1251 i64 t5 = f0*(i64)g5 + f1*(i64)g4 + f2*(i64)g3 + f3*(i64)g2 + f4*(i64)g1
1252 + f5*(i64)g0 + f6*(i64)G9 + f7*(i64)G8 + f8*(i64)G7 + f9*(i64)G6;
1253 i64 t6 = f0*(i64)g6 + F1*(i64)g5 + f2*(i64)g4 + F3*(i64)g3 + f4*(i64)g2
1254 + F5*(i64)g1 + f6*(i64)g0 + F7*(i64)G9 + f8*(i64)G8 + F9*(i64)G7;
1255 i64 t7 = f0*(i64)g7 + f1*(i64)g6 + f2*(i64)g5 + f3*(i64)g4 + f4*(i64)g3
1256 + f5*(i64)g2 + f6*(i64)g1 + f7*(i64)g0 + f8*(i64)G9 + f9*(i64)G8;
1257 i64 t8 = f0*(i64)g8 + F1*(i64)g7 + f2*(i64)g6 + F3*(i64)g5 + f4*(i64)g4
1258 + F5*(i64)g3 + f6*(i64)g2 + F7*(i64)g1 + f8*(i64)g0 + F9*(i64)G9;
1259 i64 t9 = f0*(i64)g9 + f1*(i64)g8 + f2*(i64)g7 + f3*(i64)g6 + f4*(i64)g5
1260 + f5*(i64)g4 + f6*(i64)g3 + f7*(i64)g2 + f8*(i64)g1 + f9*(i64)g0;
1261 // t0 < 0.67 * 2^61
1262 // t1 < 0.41 * 2^61
1263 // t2 < 0.52 * 2^61
1264 // t3 < 0.32 * 2^61
1265 // t4 < 0.38 * 2^61
1266 // t5 < 0.22 * 2^61
1267 // t6 < 0.23 * 2^61
1268 // t7 < 0.13 * 2^61
1269 // t8 < 0.09 * 2^61
1270 // t9 < 0.03 * 2^61
1272 FE_CARRY; // Everything below 2^62, Carry precondition OK
1275 // Precondition
1276 // -------------
1277 // |f0|, |f2|, |f4|, |f6|, |f8| < 1.65 * 2^26
1278 // |f1|, |f3|, |f5|, |f7|, |f9| < 1.65 * 2^25
1280 // Note: we could use fe_mul() for this, but this is significantly faster
1281 static void fe_sq(fe h, const fe f)
1283 i32 f0 = f[0]; i32 f1 = f[1]; i32 f2 = f[2]; i32 f3 = f[3]; i32 f4 = f[4];
1284 i32 f5 = f[5]; i32 f6 = f[6]; i32 f7 = f[7]; i32 f8 = f[8]; i32 f9 = f[9];
1285 i32 f0_2 = f0*2; i32 f1_2 = f1*2; i32 f2_2 = f2*2; i32 f3_2 = f3*2;
1286 i32 f4_2 = f4*2; i32 f5_2 = f5*2; i32 f6_2 = f6*2; i32 f7_2 = f7*2;
1287 i32 f5_38 = f5*38; i32 f6_19 = f6*19; i32 f7_38 = f7*38;
1288 i32 f8_19 = f8*19; i32 f9_38 = f9*38;
1289 // |f0_2| , |f2_2| , |f4_2| , |f6_2| , |f8_2| < 1.65 * 2^27
1290 // |f1_2| , |f3_2| , |f5_2| , |f7_2| , |f9_2| < 1.65 * 2^26
1291 // |f5_38|, |f6_19|, |f7_38|, |f8_19|, |f9_38| < 2^31
1293 i64 t0 = f0 *(i64)f0 + f1_2*(i64)f9_38 + f2_2*(i64)f8_19
1294 + f3_2*(i64)f7_38 + f4_2*(i64)f6_19 + f5 *(i64)f5_38;
1295 i64 t1 = f0_2*(i64)f1 + f2 *(i64)f9_38 + f3_2*(i64)f8_19
1296 + f4 *(i64)f7_38 + f5_2*(i64)f6_19;
1297 i64 t2 = f0_2*(i64)f2 + f1_2*(i64)f1 + f3_2*(i64)f9_38
1298 + f4_2*(i64)f8_19 + f5_2*(i64)f7_38 + f6 *(i64)f6_19;
1299 i64 t3 = f0_2*(i64)f3 + f1_2*(i64)f2 + f4 *(i64)f9_38
1300 + f5_2*(i64)f8_19 + f6 *(i64)f7_38;
1301 i64 t4 = f0_2*(i64)f4 + f1_2*(i64)f3_2 + f2 *(i64)f2
1302 + f5_2*(i64)f9_38 + f6_2*(i64)f8_19 + f7 *(i64)f7_38;
1303 i64 t5 = f0_2*(i64)f5 + f1_2*(i64)f4 + f2_2*(i64)f3
1304 + f6 *(i64)f9_38 + f7_2*(i64)f8_19;
1305 i64 t6 = f0_2*(i64)f6 + f1_2*(i64)f5_2 + f2_2*(i64)f4
1306 + f3_2*(i64)f3 + f7_2*(i64)f9_38 + f8 *(i64)f8_19;
1307 i64 t7 = f0_2*(i64)f7 + f1_2*(i64)f6 + f2_2*(i64)f5
1308 + f3_2*(i64)f4 + f8 *(i64)f9_38;
1309 i64 t8 = f0_2*(i64)f8 + f1_2*(i64)f7_2 + f2_2*(i64)f6
1310 + f3_2*(i64)f5_2 + f4 *(i64)f4 + f9 *(i64)f9_38;
1311 i64 t9 = f0_2*(i64)f9 + f1_2*(i64)f8 + f2_2*(i64)f7
1312 + f3_2*(i64)f6 + f4 *(i64)f5_2;
1313 // t0 < 0.67 * 2^61
1314 // t1 < 0.41 * 2^61
1315 // t2 < 0.52 * 2^61
1316 // t3 < 0.32 * 2^61
1317 // t4 < 0.38 * 2^61
1318 // t5 < 0.22 * 2^61
1319 // t6 < 0.23 * 2^61
1320 // t7 < 0.13 * 2^61
1321 // t8 < 0.09 * 2^61
1322 // t9 < 0.03 * 2^61
1324 FE_CARRY;
1327 // Parity check. Returns 0 if even, 1 if odd
1328 static int fe_isodd(const fe f)
1330 u8 s[32];
1331 fe_tobytes(s, f);
1332 u8 isodd = s[0] & 1;
1333 WIPE_BUFFER(s);
1334 return isodd;
1337 // Returns 1 if equal, 0 if not equal
1338 static int fe_isequal(const fe f, const fe g)
1340 u8 fs[32];
1341 u8 gs[32];
1342 fe_tobytes(fs, f);
1343 fe_tobytes(gs, g);
1344 int isdifferent = crypto_verify32(fs, gs);
1345 WIPE_BUFFER(fs);
1346 WIPE_BUFFER(gs);
1347 return 1 + isdifferent;
1350 // Inverse square root.
1351 // Returns true if x is a square, false otherwise.
1352 // After the call:
1353 // isr = sqrt(1/x) if x is a non-zero square.
1354 // isr = sqrt(sqrt(-1)/x) if x is not a square.
1355 // isr = 0 if x is zero.
1356 // We do not guarantee the sign of the square root.
1358 // Notes:
1359 // Let quartic = x^((p-1)/4)
1361 // x^((p-1)/2) = chi(x)
1362 // quartic^2 = chi(x)
1363 // quartic = sqrt(chi(x))
1364 // quartic = 1 or -1 or sqrt(-1) or -sqrt(-1)
1366 // Note that x is a square if quartic is 1 or -1
1367 // There are 4 cases to consider:
1369 // if quartic = 1 (x is a square)
1370 // then x^((p-1)/4) = 1
1371 // x^((p-5)/4) * x = 1
1372 // x^((p-5)/4) = 1/x
1373 // x^((p-5)/8) = sqrt(1/x) or -sqrt(1/x)
1375 // if quartic = -1 (x is a square)
1376 // then x^((p-1)/4) = -1
1377 // x^((p-5)/4) * x = -1
1378 // x^((p-5)/4) = -1/x
1379 // x^((p-5)/8) = sqrt(-1) / sqrt(x)
1380 // x^((p-5)/8) * sqrt(-1) = sqrt(-1)^2 / sqrt(x)
1381 // x^((p-5)/8) * sqrt(-1) = -1/sqrt(x)
1382 // x^((p-5)/8) * sqrt(-1) = -sqrt(1/x) or sqrt(1/x)
1384 // if quartic = sqrt(-1) (x is not a square)
1385 // then x^((p-1)/4) = sqrt(-1)
1386 // x^((p-5)/4) * x = sqrt(-1)
1387 // x^((p-5)/4) = sqrt(-1)/x
1388 // x^((p-5)/8) = sqrt(sqrt(-1)/x) or -sqrt(sqrt(-1)/x)
1390 // Note that the product of two non-squares is always a square:
1391 // For any non-squares a and b, chi(a) = -1 and chi(b) = -1.
1392 // Since chi(x) = x^((p-1)/2), chi(a)*chi(b) = chi(a*b) = 1.
1393 // Therefore a*b is a square.
1395 // Since sqrt(-1) and x are both non-squares, their product is a
1396 // square, and we can compute their square root.
1398 // if quartic = -sqrt(-1) (x is not a square)
1399 // then x^((p-1)/4) = -sqrt(-1)
1400 // x^((p-5)/4) * x = -sqrt(-1)
1401 // x^((p-5)/4) = -sqrt(-1)/x
1402 // x^((p-5)/8) = sqrt(-sqrt(-1)/x)
1403 // x^((p-5)/8) = sqrt( sqrt(-1)/x) * sqrt(-1)
1404 // x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * sqrt(-1)^2
1405 // x^((p-5)/8) * sqrt(-1) = sqrt( sqrt(-1)/x) * -1
1406 // x^((p-5)/8) * sqrt(-1) = -sqrt(sqrt(-1)/x) or sqrt(sqrt(-1)/x)
1407 static int invsqrt(fe isr, const fe x)
1409 fe t0, t1, t2;
1411 // t0 = x^((p-5)/8)
1412 // Can be achieved with a simple double & add ladder,
1413 // but it would be slower.
1414 fe_sq(t0, x);
1415 fe_sq(t1,t0); fe_sq(t1, t1); fe_mul(t1, x, t1);
1416 fe_mul(t0, t0, t1);
1417 fe_sq(t0, t0); fe_mul(t0, t1, t0);
1418 fe_sq(t1, t0); FOR (i, 1, 5) { fe_sq(t1, t1); } fe_mul(t0, t1, t0);
1419 fe_sq(t1, t0); FOR (i, 1, 10) { fe_sq(t1, t1); } fe_mul(t1, t1, t0);
1420 fe_sq(t2, t1); FOR (i, 1, 20) { fe_sq(t2, t2); } fe_mul(t1, t2, t1);
1421 fe_sq(t1, t1); FOR (i, 1, 10) { fe_sq(t1, t1); } fe_mul(t0, t1, t0);
1422 fe_sq(t1, t0); FOR (i, 1, 50) { fe_sq(t1, t1); } fe_mul(t1, t1, t0);
1423 fe_sq(t2, t1); FOR (i, 1, 100) { fe_sq(t2, t2); } fe_mul(t1, t2, t1);
1424 fe_sq(t1, t1); FOR (i, 1, 50) { fe_sq(t1, t1); } fe_mul(t0, t1, t0);
1425 fe_sq(t0, t0); FOR (i, 1, 2) { fe_sq(t0, t0); } fe_mul(t0, t0, x);
1427 // quartic = x^((p-1)/4)
1428 i32 *quartic = t1;
1429 fe_sq (quartic, t0);
1430 fe_mul(quartic, quartic, x);
1432 i32 *check = t2;
1433 fe_0 (check); int z0 = fe_isequal(x , check);
1434 fe_1 (check); int p1 = fe_isequal(quartic, check);
1435 fe_neg(check, check ); int m1 = fe_isequal(quartic, check);
1436 fe_neg(check, sqrtm1); int ms = fe_isequal(quartic, check);
1438 // if quartic == -1 or sqrt(-1)
1439 // then isr = x^((p-5)/8) * sqrt(-1)
1440 // else isr = x^((p-5)/8)
1441 fe_mul(isr, t0, sqrtm1);
1442 fe_ccopy(isr, t0, 1 - (m1 | ms));
1444 WIPE_BUFFER(t0);
1445 WIPE_BUFFER(t1);
1446 WIPE_BUFFER(t2);
1447 return p1 | m1 | z0;
1450 // Inverse in terms of inverse square root.
1451 // Requires two additional squarings to get rid of the sign.
1453 // 1/x = x * (+invsqrt(x^2))^2
1454 // = x * (-invsqrt(x^2))^2
1456 // A fully optimised exponentiation by p-1 would save 6 field
1457 // multiplications, but it would require more code.
1458 static void fe_invert(fe out, const fe x)
1460 fe tmp;
1461 fe_sq(tmp, x);
1462 invsqrt(tmp, tmp);
1463 fe_sq(tmp, tmp);
1464 fe_mul(out, tmp, x);
1465 WIPE_BUFFER(tmp);
1468 // trim a scalar for scalar multiplication
1469 void crypto_eddsa_trim_scalar(u8 out[32], const u8 in[32])
1471 COPY(out, in, 32);
1472 out[ 0] &= 248;
1473 out[31] &= 127;
1474 out[31] |= 64;
1477 // get bit from scalar at position i
1478 static int scalar_bit(const u8 s[32], int i)
1480 if (i < 0) { return 0; } // handle -1 for sliding windows
1481 return (s[i>>3] >> (i&7)) & 1;
1484 ///////////////
1485 /// X-25519 /// Taken from SUPERCOP's ref10 implementation.
1486 ///////////////
1487 static void scalarmult(u8 q[32], const u8 scalar[32], const u8 p[32],
1488 int nb_bits)
1490 // computes the scalar product
1491 fe x1;
1492 fe_frombytes(x1, p);
1494 // computes the actual scalar product (the result is in x2 and z2)
1495 fe x2, z2, x3, z3, t0, t1;
1496 // Montgomery ladder
1497 // In projective coordinates, to avoid divisions: x = X / Z
1498 // We don't care about the y coordinate, it's only 1 bit of information
1499 fe_1(x2); fe_0(z2); // "zero" point
1500 fe_copy(x3, x1); fe_1(z3); // "one" point
1501 int swap = 0;
1502 for (int pos = nb_bits-1; pos >= 0; --pos) {
1503 // constant time conditional swap before ladder step
1504 int b = scalar_bit(scalar, pos);
1505 swap ^= b; // xor trick avoids swapping at the end of the loop
1506 fe_cswap(x2, x3, swap);
1507 fe_cswap(z2, z3, swap);
1508 swap = b; // anticipates one last swap after the loop
1510 // Montgomery ladder step: replaces (P2, P3) by (P2*2, P2+P3)
1511 // with differential addition
1512 fe_sub(t0, x3, z3);
1513 fe_sub(t1, x2, z2);
1514 fe_add(x2, x2, z2);
1515 fe_add(z2, x3, z3);
1516 fe_mul(z3, t0, x2);
1517 fe_mul(z2, z2, t1);
1518 fe_sq (t0, t1 );
1519 fe_sq (t1, x2 );
1520 fe_add(x3, z3, z2);
1521 fe_sub(z2, z3, z2);
1522 fe_mul(x2, t1, t0);
1523 fe_sub(t1, t1, t0);
1524 fe_sq (z2, z2 );
1525 fe_mul_small(z3, t1, 121666);
1526 fe_sq (x3, x3 );
1527 fe_add(t0, t0, z3);
1528 fe_mul(z3, x1, z2);
1529 fe_mul(z2, t1, t0);
1531 // last swap is necessary to compensate for the xor trick
1532 // Note: after this swap, P3 == P2 + P1.
1533 fe_cswap(x2, x3, swap);
1534 fe_cswap(z2, z3, swap);
1536 // normalises the coordinates: x == X / Z
1537 fe_invert(z2, z2);
1538 fe_mul(x2, x2, z2);
1539 fe_tobytes(q, x2);
1541 WIPE_BUFFER(x1);
1542 WIPE_BUFFER(x2); WIPE_BUFFER(z2); WIPE_BUFFER(t0);
1543 WIPE_BUFFER(x3); WIPE_BUFFER(z3); WIPE_BUFFER(t1);
1546 void crypto_x25519(u8 raw_shared_secret[32],
1547 const u8 your_secret_key [32],
1548 const u8 their_public_key [32])
1550 // restrict the possible scalar values
1551 u8 e[32];
1552 crypto_eddsa_trim_scalar(e, your_secret_key);
1553 scalarmult(raw_shared_secret, e, their_public_key, 255);
1554 WIPE_BUFFER(e);
1557 void crypto_x25519_public_key(u8 public_key[32],
1558 const u8 secret_key[32])
1560 static const u8 base_point[32] = {9};
1561 crypto_x25519(public_key, secret_key, base_point);
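// Illustrative usage sketch (not part of Monocypher): a basic X25519 key
// exchange. Each side feeds 32 random secret bytes into the functions
// above; both arrive at the same raw shared secret, which should be
// hashed before use as a session key. Variable names are hypothetical.
//
//     u8 alice_sk[32], alice_pk[32];  /* alice_sk from a CSPRNG */
//     u8 bob_sk  [32], bob_pk  [32];  /* bob_sk   from a CSPRNG */
//     crypto_x25519_public_key(alice_pk, alice_sk);
//     crypto_x25519_public_key(bob_pk,   bob_sk);
//     u8 shared_a[32], shared_b[32];  /* shared_a == shared_b   */
//     crypto_x25519(shared_a, alice_sk, bob_pk);
//     crypto_x25519(shared_b, bob_sk,   alice_pk);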
1564 ///////////////////////////
1565 /// Arithmetic modulo L ///
1566 ///////////////////////////
1567 static const u32 L[8] = {
1568 0x5cf5d3ed, 0x5812631a, 0xa2f79cd6, 0x14def9de,
1569 0x00000000, 0x00000000, 0x00000000, 0x10000000,
1572 // p = a*b + p
1573 static void multiply(u32 p[16], const u32 a[8], const u32 b[8])
1575 FOR (i, 0, 8) {
1576 u64 carry = 0;
1577 FOR (j, 0, 8) {
1578 carry += p[i+j] + (u64)a[i] * b[j];
1579 p[i+j] = (u32)carry;
1580 carry >>= 32;
1582 p[i+8] = (u32)carry;
1586 static int is_above_l(const u32 x[8])
1588 // We work with L directly, in a 2's complement encoding
1589 // (-L == ~L + 1)
1590 u64 carry = 1;
1591 FOR (i, 0, 8) {
1592 carry += (u64)x[i] + (~L[i] & 0xffffffff);
1593 carry >>= 32;
1595 return (int)carry; // carry is either 0 or 1
1598 // Final reduction modulo L, by conditionally removing L.
1599 // if x < l , then r = x
1600 // if l <= x < 2*l, then r = x-l
1601 // otherwise the result will be wrong
1602 static void remove_l(u32 r[8], const u32 x[8])
1604 u64 carry = (u64)is_above_l(x);
1605 u32 mask = ~(u32)carry + 1; // carry == 0 or 1
1606 FOR (i, 0, 8) {
1607 carry += (u64)x[i] + (~L[i] & mask);
1608 r[i] = (u32)carry;
1609 carry >>= 32;
1613 // Full reduction modulo L (Barrett reduction)
1614 static void mod_l(u8 reduced[32], const u32 x[16])
1616 static const u32 r[9] = {
1617 0x0a2c131b,0xed9ce5a3,0x086329a7,0x2106215d,
1618 0xffffffeb,0xffffffff,0xffffffff,0xffffffff,0xf,
1620 // xr = x * r
1621 u32 xr[25] = {0};
1622 FOR (i, 0, 9) {
1623 u64 carry = 0;
1624 FOR (j, 0, 16) {
1625 carry += xr[i+j] + (u64)r[i] * x[j];
1626 xr[i+j] = (u32)carry;
1627 carry >>= 32;
1629 xr[i+16] = (u32)carry;
1631 // xr = floor(xr / 2^512) * L
1632 // Since the result is guaranteed to be below 2*L,
1633 // it is enough to only compute the first 256 bits.
1634 // The division by 2^512 is performed by simply reading xr[i+16]. (16 * 32 = 512)
1635 ZERO(xr, 8);
1636 FOR (i, 0, 8) {
1637 u64 carry = 0;
1638 FOR (j, 0, 8-i) {
1639 carry += xr[i+j] + (u64)xr[i+16] * L[j];
1640 xr[i+j] = (u32)carry;
1641 carry >>= 32;
1644 // xr = x - xr
1645 u64 carry = 1;
1646 FOR (i, 0, 8) {
1647 carry += (u64)x[i] + (~xr[i] & 0xffffffff);
1648 xr[i] = (u32)carry;
1649 carry >>= 32;
1651 // Final reduction modulo L (conditional subtraction)
1652 remove_l(xr, xr);
1653 store32_le_buf(reduced, xr, 8);
1655 WIPE_BUFFER(xr);
1658 void crypto_eddsa_reduce(u8 reduced[32], const u8 expanded[64])
1660 u32 x[16];
1661 load32_le_buf(x, expanded, 16);
1662 mod_l(reduced, x);
1663 WIPE_BUFFER(x);
1666 // r = (a * b) + c
1667 void crypto_eddsa_mul_add(u8 r[32],
1668 const u8 a[32], const u8 b[32], const u8 c[32])
1670 u32 A[8]; load32_le_buf(A, a, 8);
1671 u32 B[8]; load32_le_buf(B, b, 8);
1672 u32 p[16]; load32_le_buf(p, c, 8); ZERO(p + 8, 8);
1673 multiply(p, A, B);
1674 mod_l(r, p);
1675 WIPE_BUFFER(p);
1676 WIPE_BUFFER(A);
1677 WIPE_BUFFER(B);
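// Illustrative note (not part of Monocypher): in EdDSA signing these two
// helpers compute the scalar half of a signature. With r the per-message
// nonce scalar, a the secret scalar, and h the 64-byte hash of
// (R, A, message), a sketch (hypothetical variable names) would be:
//
//     u8 h_mod_l[32];
//     crypto_eddsa_reduce (h_mod_l, h);              /* h mod L           */
//     crypto_eddsa_mul_add(s, h_mod_l, a, r_scalar); /* s = h*a + r mod L */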
1680 ///////////////
1681 /// Ed25519 ///
1682 ///////////////
1684 // Point (group element, ge) in a twisted Edwards curve,
1685 // in extended projective coordinates.
1686 // ge : x = X/Z, y = Y/Z, T = XY/Z
1687 // ge_cached : Yp = X+Y, Ym = X-Y, T2 = T*D2
1688 // ge_precomp: Z = 1
1689 typedef struct { fe X; fe Y; fe Z; fe T; } ge;
1690 typedef struct { fe Yp; fe Ym; fe Z; fe T2; } ge_cached;
1691 typedef struct { fe Yp; fe Ym; fe T2; } ge_precomp;
1693 static void ge_zero(ge *p)
1695 fe_0(p->X);
1696 fe_1(p->Y);
1697 fe_1(p->Z);
1698 fe_0(p->T);
1701 static void ge_tobytes(u8 s[32], const ge *h)
1703 fe recip, x, y;
1704 fe_invert(recip, h->Z);
1705 fe_mul(x, h->X, recip);
1706 fe_mul(y, h->Y, recip);
1707 fe_tobytes(s, y);
1708 s[31] ^= fe_isodd(x) << 7;
1710 WIPE_BUFFER(recip);
1711 WIPE_BUFFER(x);
1712 WIPE_BUFFER(y);
1715 // h = -s, where s is a point encoded in 32 bytes
1717 // Variable time! Inputs must not be secret!
1718 // => Use only to *check* signatures.
1720 // From the specifications:
1721 // The encoding of s contains y and the sign of x
1722 // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1723 // In extended coordinates:
1724 // X = x, Y = y, Z = 1, T = x*y
1726 // Note that num * den is a square iff num / den is a square
1727 // If num * den is not a square, the point was not on the curve.
1728 // From the above:
1729 // Let num = y^2 - 1
1730 // Let den = d*y^2 + 1
1731 // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1732 // x = sqrt(num / den)
1733 // x = sqrt(num^2 / (num * den))
1734 // x = num * sqrt(1 / (num * den))
1736 // Therefore, we can just compute:
1737 // num = y^2 - 1
1738 // den = d*y^2 + 1
1739 // isr = invsqrt(num * den) // abort if not square
1740 // x = num * isr
1741 // Finally, negate x if its sign is not as specified.
1742 static int ge_frombytes_neg_vartime(ge *h, const u8 s[32])
1744 fe_frombytes(h->Y, s);
1745 fe_1(h->Z);
1746 fe_sq (h->T, h->Y); // t = y^2
1747 fe_mul(h->X, h->T, d ); // x = d*y^2
1748 fe_sub(h->T, h->T, h->Z); // t = y^2 - 1
1749 fe_add(h->X, h->X, h->Z); // x = d*y^2 + 1
1750 fe_mul(h->X, h->T, h->X); // x = (y^2 - 1) * (d*y^2 + 1)
1751 int is_square = invsqrt(h->X, h->X);
1752 if (!is_square) {
1753 return -1; // Not on the curve, abort
1755 fe_mul(h->X, h->T, h->X); // x = sqrt((y^2 - 1) / (d*y^2 + 1))
1756 if (fe_isodd(h->X) == (s[31] >> 7)) {
1757 fe_neg(h->X, h->X);
1759 fe_mul(h->T, h->X, h->Y);
1760 return 0;
1763 static void ge_cache(ge_cached *c, const ge *p)
1765 fe_add (c->Yp, p->Y, p->X);
1766 fe_sub (c->Ym, p->Y, p->X);
1767 fe_copy(c->Z , p->Z );
1768 fe_mul (c->T2, p->T, D2 );
1771 // Internal buffers are not wiped! Inputs must not be secret!
1772 // => Use only to *check* signatures.
1773 static void ge_add(ge *s, const ge *p, const ge_cached *q)
1775 fe a, b;
1776 fe_add(a , p->Y, p->X );
1777 fe_sub(b , p->Y, p->X );
1778 fe_mul(a , a , q->Yp);
1779 fe_mul(b , b , q->Ym);
1780 fe_add(s->Y, a , b );
1781 fe_sub(s->X, a , b );
1783 fe_add(s->Z, p->Z, p->Z );
1784 fe_mul(s->Z, s->Z, q->Z );
1785 fe_mul(s->T, p->T, q->T2);
1786 fe_add(a , s->Z, s->T );
1787 fe_sub(b , s->Z, s->T );
1789 fe_mul(s->T, s->X, s->Y);
1790 fe_mul(s->X, s->X, b );
1791 fe_mul(s->Y, s->Y, a );
1792 fe_mul(s->Z, a , b );
1795 // Internal buffers are not wiped! Inputs must not be secret!
1796 // => Use only to *check* signatures.
1797 static void ge_sub(ge *s, const ge *p, const ge_cached *q)
1799 ge_cached neg;
1800 fe_copy(neg.Ym, q->Yp);
1801 fe_copy(neg.Yp, q->Ym);
1802 fe_copy(neg.Z , q->Z );
1803 fe_neg (neg.T2, q->T2);
1804 ge_add(s, p, &neg);
1807 static void ge_madd(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1809 fe_add(a , p->Y, p->X );
1810 fe_sub(b , p->Y, p->X );
1811 fe_mul(a , a , q->Yp);
1812 fe_mul(b , b , q->Ym);
1813 fe_add(s->Y, a , b );
1814 fe_sub(s->X, a , b );
1816 fe_add(s->Z, p->Z, p->Z );
1817 fe_mul(s->T, p->T, q->T2);
1818 fe_add(a , s->Z, s->T );
1819 fe_sub(b , s->Z, s->T );
1821 fe_mul(s->T, s->X, s->Y);
1822 fe_mul(s->X, s->X, b );
1823 fe_mul(s->Y, s->Y, a );
1824 fe_mul(s->Z, a , b );
1827 // Internal buffers are not wiped! Inputs must not be secret!
1828 // => Use only to *check* signatures.
1829 static void ge_msub(ge *s, const ge *p, const ge_precomp *q, fe a, fe b)
1831 ge_precomp neg;
1832 fe_copy(neg.Ym, q->Yp);
1833 fe_copy(neg.Yp, q->Ym);
1834 fe_neg (neg.T2, q->T2);
1835 ge_madd(s, p, &neg, a, b);
1838 static void ge_double(ge *s, const ge *p, ge *q)
1840 fe_sq (q->X, p->X);
1841 fe_sq (q->Y, p->Y);
1842 fe_sq (q->Z, p->Z); // qZ = pZ^2
1843 fe_mul_small(q->Z, q->Z, 2); // qZ = pZ^2 * 2
1844 fe_add(q->T, p->X, p->Y);
1845 fe_sq (s->T, q->T);
1846 fe_add(q->T, q->Y, q->X);
1847 fe_sub(q->Y, q->Y, q->X);
1848 fe_sub(q->X, s->T, q->T);
1849 fe_sub(q->Z, q->Z, q->Y);
1851 fe_mul(s->X, q->X , q->Z);
1852 fe_mul(s->Y, q->T , q->Y);
1853 fe_mul(s->Z, q->Y , q->Z);
1854 fe_mul(s->T, q->X , q->T);
1857 // 5-bit signed window in cached format (Niels coordinates, Z=1)
1858 static const ge_precomp b_window[8] = {
1859 {{25967493,-14356035,29566456,3660896,-12694345,
1860 4014787,27544626,-11754271,-6079156,2047605,},
1861 {-12545711,934262,-2722910,3049990,-727428,
1862 9406986,12720692,5043384,19500929,-15469378,},
1863 {-8738181,4489570,9688441,-14785194,10184609,
1864 -12363380,29287919,11864899,-24514362,-4438546,},},
1865 {{15636291,-9688557,24204773,-7912398,616977,
1866 -16685262,27787600,-14772189,28944400,-1550024,},
1867 {16568933,4717097,-11556148,-1102322,15682896,
1868 -11807043,16354577,-11775962,7689662,11199574,},
1869 {30464156,-5976125,-11779434,-15670865,23220365,
1870 15915852,7512774,10017326,-17749093,-9920357,},},
1871 {{10861363,11473154,27284546,1981175,-30064349,
1872 12577861,32867885,14515107,-15438304,10819380,},
1873 {4708026,6336745,20377586,9066809,-11272109,
1874 6594696,-25653668,12483688,-12668491,5581306,},
1875 {19563160,16186464,-29386857,4097519,10237984,
1876 -4348115,28542350,13850243,-23678021,-15815942,},},
1877 {{5153746,9909285,1723747,-2777874,30523605,
1878 5516873,19480852,5230134,-23952439,-15175766,},
1879 {-30269007,-3463509,7665486,10083793,28475525,
1880 1649722,20654025,16520125,30598449,7715701,},
1881 {28881845,14381568,9657904,3680757,-20181635,
1882 7843316,-31400660,1370708,29794553,-1409300,},},
1883 {{-22518993,-6692182,14201702,-8745502,-23510406,
1884 8844726,18474211,-1361450,-13062696,13821877,},
1885 {-6455177,-7839871,3374702,-4740862,-27098617,
1886 -10571707,31655028,-7212327,18853322,-14220951,},
1887 {4566830,-12963868,-28974889,-12240689,-7602672,
1888 -2830569,-8514358,-10431137,2207753,-3209784,},},
1889 {{-25154831,-4185821,29681144,7868801,-6854661,
1890 -9423865,-12437364,-663000,-31111463,-16132436,},
1891 {25576264,-2703214,7349804,-11814844,16472782,
1892 9300885,3844789,15725684,171356,6466918,},
1893 {23103977,13316479,9739013,-16149481,817875,
1894 -15038942,8965339,-14088058,-30714912,16193877,},},
1895 {{-33521811,3180713,-2394130,14003687,-16903474,
1896 -16270840,17238398,4729455,-18074513,9256800,},
1897 {-25182317,-4174131,32336398,5036987,-21236817,
1898 11360617,22616405,9761698,-19827198,630305,},
1899 {-13720693,2639453,-24237460,-7406481,9494427,
1900 -5774029,-6554551,-15960994,-2449256,-14291300,},},
1901 {{-3151181,-5046075,9282714,6866145,-31907062,
1902 -863023,-18940575,15033784,25105118,-7894876,},
1903 {-24326370,15950226,-31801215,-14592823,-11662737,
1904 -5090925,1573892,-2625887,2198790,-15804619,},
1905 {-3099351,10324967,-2241613,7453183,-5446979,
1906 -2735503,-13812022,-16236442,-32461234,-12290683,},},
1909 // Incremental sliding windows (left to right)
1910 // Based on Roberto Maria Avanzi[2005]
1911 typedef struct {
1912 i16 next_index; // position of the next signed digit
1913 i8 next_digit; // next signed digit (odd number below 2^window_width)
1914 u8 next_check; // point at which we must check for a new window
1915 } slide_ctx;
1917 static void slide_init(slide_ctx *ctx, const u8 scalar[32])
1919 // scalar is guaranteed to be below L, either because we checked (s),
1920 // or because we reduced it modulo L (h_ram). L is under 2^253, so
1921 // bits 253 to 255 are guaranteed to be zero. No need to test them.
1923 // Note however that L is very close to 2^252, so bit 252 is almost
1924 // always zero. If we were to start at bit 251, the tests wouldn't
1925 // catch the off-by-one error (constructing one that does would be
1926 // prohibitively expensive).
1928 // We should still check bit 252, though.
1929 int i = 252;
1930 while (i > 0 && scalar_bit(scalar, i) == 0) {
1931 i--;
1933 ctx->next_check = (u8)(i + 1);
1934 ctx->next_index = -1;
1935 ctx->next_digit = -1;
1938 static int slide_step(slide_ctx *ctx, int width, int i, const u8 scalar[32])
1940 if (i == ctx->next_check) {
1941 if (scalar_bit(scalar, i) == scalar_bit(scalar, i - 1)) {
1942 ctx->next_check--;
1943 } else {
1944 // compute digit of next window
1945 int w = MC_MIN(width, i + 1);
1946 int v = -(scalar_bit(scalar, i) << (w-1));
1947 FOR_T (int, j, 0, w-1) {
1948 v += scalar_bit(scalar, i-(w-1)+j) << j;
1950 v += scalar_bit(scalar, i-w);
1951 int lsb = v & (~v + 1); // smallest bit of v
1952 int s = // log2(lsb)
1953 (((lsb & 0xAA) != 0) << 0) |
1954 (((lsb & 0xCC) != 0) << 1) |
1955 (((lsb & 0xF0) != 0) << 2);
1956 ctx->next_index = (i16)(i-(w-1)+s);
1957 ctx->next_digit = (i8) (v >> s );
1958 ctx->next_check -= (u8) w;
1961 return i == ctx->next_index ? ctx->next_digit: 0;
1964 #define P_W_WIDTH 3 // Affects the size of the stack
1965 #define B_W_WIDTH 5 // Affects the size of the binary
1966 #define P_W_SIZE (1<<(P_W_WIDTH-2))
1968 int crypto_eddsa_check_equation(const u8 signature[64], const u8 public_key[32],
1969 const u8 h[32])
1971 ge minus_A; // -public_key
1972 ge minus_R; // -first_half_of_signature
1973 const u8 *s = signature + 32;
1975 // Check that A and R are on the curve
1976 // Check that 0 <= S < L (prevents malleability)
1977 // *Allow* non-canonical encoding for A and R
1979 u32 s32[8];
1980 load32_le_buf(s32, s, 8);
1981 if (ge_frombytes_neg_vartime(&minus_A, public_key) ||
1982 ge_frombytes_neg_vartime(&minus_R, signature) ||
1983 is_above_l(s32)) {
1984 return -1;
1988 // look-up table for minus_A
1989 ge_cached lutA[P_W_SIZE];
1991 ge minus_A2, tmp;
1992 ge_double(&minus_A2, &minus_A, &tmp);
1993 ge_cache(&lutA[0], &minus_A);
1994 FOR (i, 1, P_W_SIZE) {
1995 ge_add(&tmp, &minus_A2, &lutA[i-1]);
1996 ge_cache(&lutA[i], &tmp);
2000 // sum = [s]B - [h]A
2001 // Merged double and add ladder, fused with sliding
2002 slide_ctx h_slide; slide_init(&h_slide, h);
2003 slide_ctx s_slide; slide_init(&s_slide, s);
2004 int i = MC_MAX(h_slide.next_check, s_slide.next_check);
2005 ge *sum = &minus_A; // reuse minus_A for the sum
2006 ge_zero(sum);
2007 while (i >= 0) {
2008 ge tmp;
2009 ge_double(sum, sum, &tmp);
2010 int h_digit = slide_step(&h_slide, P_W_WIDTH, i, h);
2011 int s_digit = slide_step(&s_slide, B_W_WIDTH, i, s);
2012 if (h_digit > 0) { ge_add(sum, sum, &lutA[ h_digit / 2]); }
2013 if (h_digit < 0) { ge_sub(sum, sum, &lutA[-h_digit / 2]); }
2014 fe t1, t2;
2015 if (s_digit > 0) { ge_madd(sum, sum, b_window + s_digit/2, t1, t2); }
2016 if (s_digit < 0) { ge_msub(sum, sum, b_window + -s_digit/2, t1, t2); }
2017 i--;
2020 // Compare [8](sum-R) and the zero point
2021 // The multiplication by 8 eliminates any low-order component
2022 // and ensures consistency with batched verification.
2023 ge_cached cached;
2024 u8 check[32];
2025 static const u8 zero_point[32] = {1}; // Point of order 1
2026 ge_cache(&cached, &minus_R);
2027 ge_add(sum, sum, &cached);
2028 ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
2029 ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
2030 ge_double(sum, sum, &minus_R); // reuse minus_R as temporary
2031 ge_tobytes(check, sum);
2032 return crypto_verify32(check, zero_point);
2035 // 5-bit signed comb in cached format (Niels coordinates, Z=1)
2036 static const ge_precomp b_comb_low[8] = {
2037 {{-6816601,-2324159,-22559413,124364,18015490,
2038 8373481,19993724,1979872,-18549925,9085059,},
2039 {10306321,403248,14839893,9633706,8463310,
2040 -8354981,-14305673,14668847,26301366,2818560,},
2041 {-22701500,-3210264,-13831292,-2927732,-16326337,
2042 -14016360,12940910,177905,12165515,-2397893,},},
2043 {{-12282262,-7022066,9920413,-3064358,-32147467,
2044 2927790,22392436,-14852487,2719975,16402117,},
2045 {-7236961,-4729776,2685954,-6525055,-24242706,
2046 -15940211,-6238521,14082855,10047669,12228189,},
2047 {-30495588,-12893761,-11161261,3539405,-11502464,
2048 16491580,-27286798,-15030530,-7272871,-15934455,},},
2049 {{17650926,582297,-860412,-187745,-12072900,
2050 -10683391,-20352381,15557840,-31072141,-5019061,},
2051 {-6283632,-2259834,-4674247,-4598977,-4089240,
2052 12435688,-31278303,1060251,6256175,10480726,},
2053 {-13871026,2026300,-21928428,-2741605,-2406664,
2054 -8034988,7355518,15733500,-23379862,7489131,},},
2055 {{6883359,695140,23196907,9644202,-33430614,
2056 11354760,-20134606,6388313,-8263585,-8491918,},
2057 {-7716174,-13605463,-13646110,14757414,-19430591,
2058 -14967316,10359532,-11059670,-21935259,12082603,},
2059 {-11253345,-15943946,10046784,5414629,24840771,
2060 8086951,-6694742,9868723,15842692,-16224787,},},
2061 {{9639399,11810955,-24007778,-9320054,3912937,
2062 -9856959,996125,-8727907,-8919186,-14097242,},
2063 {7248867,14468564,25228636,-8795035,14346339,
2064 8224790,6388427,-7181107,6468218,-8720783,},
2065 {15513115,15439095,7342322,-10157390,18005294,
2066 -7265713,2186239,4884640,10826567,7135781,},},
2067 {{-14204238,5297536,-5862318,-6004934,28095835,
2068 4236101,-14203318,1958636,-16816875,3837147,},
2069 {-5511166,-13176782,-29588215,12339465,15325758,
2070 -15945770,-8813185,11075932,-19608050,-3776283,},
2071 {11728032,9603156,-4637821,-5304487,-7827751,
2072 2724948,31236191,-16760175,-7268616,14799772,},},
2073 {{-28842672,4840636,-12047946,-9101456,-1445464,
2074 381905,-30977094,-16523389,1290540,12798615,},
2075 {27246947,-10320914,14792098,-14518944,5302070,
2076 -8746152,-3403974,-4149637,-27061213,10749585,},
2077 {25572375,-6270368,-15353037,16037944,1146292,
2078 32198,23487090,9585613,24714571,-1418265,},},
2079 {{19844825,282124,-17583147,11004019,-32004269,
2080 -2716035,6105106,-1711007,-21010044,14338445,},
2081 {8027505,8191102,-18504907,-12335737,25173494,
2082 -5923905,15446145,7483684,-30440441,10009108,},
2083 {-14134701,-4174411,10246585,-14677495,33553567,
2084 -14012935,23366126,15080531,-7969992,7663473,},},
2087 static const ge_precomp b_comb_high[8] = {
2088 {{33055887,-4431773,-521787,6654165,951411,
2089 -6266464,-5158124,6995613,-5397442,-6985227,},
2090 {4014062,6967095,-11977872,3960002,8001989,
2091 5130302,-2154812,-1899602,-31954493,-16173976,},
2092 {16271757,-9212948,23792794,731486,-25808309,
2093 -3546396,6964344,-4767590,10976593,10050757,},},
2094 {{2533007,-4288439,-24467768,-12387405,-13450051,
2095 14542280,12876301,13893535,15067764,8594792,},
2096 {20073501,-11623621,3165391,-13119866,13188608,
2097 -11540496,-10751437,-13482671,29588810,2197295,},
2098 {-1084082,11831693,6031797,14062724,14748428,
2099 -8159962,-20721760,11742548,31368706,13161200,},},
2100 {{2050412,-6457589,15321215,5273360,25484180,
2101 124590,-18187548,-7097255,-6691621,-14604792,},
2102 {9938196,2162889,-6158074,-1711248,4278932,
2103 -2598531,-22865792,-7168500,-24323168,11746309,},
2104 {-22691768,-14268164,5965485,9383325,20443693,
2105 5854192,28250679,-1381811,-10837134,13717818,},},
2106 {{-8495530,16382250,9548884,-4971523,-4491811,
2107 -3902147,6182256,-12832479,26628081,10395408,},
2108 {27329048,-15853735,7715764,8717446,-9215518,
2109 -14633480,28982250,-5668414,4227628,242148,},
2110 {-13279943,-7986904,-7100016,8764468,-27276630,
2111 3096719,29678419,-9141299,3906709,11265498,},},
2112 {{11918285,15686328,-17757323,-11217300,-27548967,
2113 4853165,-27168827,6807359,6871949,-1075745,},
2114 {-29002610,13984323,-27111812,-2713442,28107359,
2115 -13266203,6155126,15104658,3538727,-7513788,},
2116 {14103158,11233913,-33165269,9279850,31014152,
2117 4335090,-1827936,4590951,13960841,12787712,},},
2118 {{1469134,-16738009,33411928,13942824,8092558,
2119 -8778224,-11165065,1437842,22521552,-2792954,},
2120 {31352705,-4807352,-25327300,3962447,12541566,
2121 -9399651,-27425693,7964818,-23829869,5541287,},
2122 {-25732021,-6864887,23848984,3039395,-9147354,
2123 6022816,-27421653,10590137,25309915,-1584678,},},
2124 {{-22951376,5048948,31139401,-190316,-19542447,
2125 -626310,-17486305,-16511925,-18851313,-12985140,},
2126 {-9684890,14681754,30487568,7717771,-10829709,
2127 9630497,30290549,-10531496,-27798994,-13812825,},
2128 {5827835,16097107,-24501327,12094619,7413972,
2129 11447087,28057551,-1793987,-14056981,4359312,},},
2130 {{26323183,2342588,-21887793,-1623758,-6062284,
2131 2107090,-28724907,9036464,-19618351,-13055189,},
2132 {-29697200,14829398,-4596333,14220089,-30022969,
2133 2955645,12094100,-13693652,-5941445,7047569,},
2134 {-3201977,14413268,-12058324,-16417589,-9035655,
2135 -7224648,9258160,1399236,30397584,-5684634,},},
2138 static void lookup_add(ge *p, ge_precomp *tmp_c, fe tmp_a, fe tmp_b,
2139 const ge_precomp comb[8], const u8 scalar[32], int i)
2141 u8 teeth = (u8)((scalar_bit(scalar, i) ) +
2142 (scalar_bit(scalar, i + 32) << 1) +
2143 (scalar_bit(scalar, i + 64) << 2) +
2144 (scalar_bit(scalar, i + 96) << 3));
2145 u8 high = teeth >> 3;
2146 u8 index = (teeth ^ (high - 1)) & 7;
2147 FOR (j, 0, 8) {
2148 i32 select = 1 & (((j ^ index) - 1) >> 8);
2149 fe_ccopy(tmp_c->Yp, comb[j].Yp, select);
2150 fe_ccopy(tmp_c->Ym, comb[j].Ym, select);
2151 fe_ccopy(tmp_c->T2, comb[j].T2, select);
2153 fe_neg(tmp_a, tmp_c->T2);
2154 fe_cswap(tmp_c->T2, tmp_a , high ^ 1);
2155 fe_cswap(tmp_c->Yp, tmp_c->Ym, high ^ 1);
2156 ge_madd(p, p, tmp_c, tmp_a, tmp_b);
2159 // p = [scalar]B, where B is the base point
2160 static void ge_scalarmult_base(ge *p, const u8 scalar[32])
2162 // twin 4-bit signed combs, from Mike Hamburg's
2163 // Fast and compact elliptic-curve cryptography (2012)
2164 // 1 / 2 modulo L
2165 static const u8 half_mod_L[32] = {
2166 247,233,122,46,141,49,9,44,107,206,123,81,239,124,111,10,
2167 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,
2169 // (2^256 - 1) / 2 modulo L
2170 static const u8 half_ones[32] = {
2171 142,74,204,70,186,24,118,107,184,231,190,57,250,173,119,99,
2172 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,7,
2175 // All bits set form: 1 means 1, 0 means -1
2176 u8 s_scalar[32];
2177 crypto_eddsa_mul_add(s_scalar, scalar, half_mod_L, half_ones);
2179 // Double and add ladder
2180 fe tmp_a, tmp_b; // temporaries for addition
2181 ge_precomp tmp_c; // temporary for comb lookup
2182 ge tmp_d; // temporary for doubling
2183 fe_1(tmp_c.Yp);
2184 fe_1(tmp_c.Ym);
2185 fe_0(tmp_c.T2);
2187 // Save a double on the first iteration
2188 ge_zero(p);
2189 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, 31);
2190 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, 31+128);
2191 // Regular double & add for the rest
2192 for (int i = 30; i >= 0; i--) {
2193 ge_double(p, p, &tmp_d);
2194 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_low , s_scalar, i);
2195 lookup_add(p, &tmp_c, tmp_a, tmp_b, b_comb_high, s_scalar, i+128);
2197 // Note: we could save one addition at the end if we assumed the
2198 // scalar fits in 252 bits, which it does in practice when it is
2199 // selected at random. However, non-random, non-hashed scalars
2200 // *can* overflow 252 bits in practice, so we account for that
2201 // rather than leave that kind of subtle corner case unhandled.
2203 WIPE_BUFFER(tmp_a); WIPE_CTX(&tmp_d);
2204 WIPE_BUFFER(tmp_b); WIPE_CTX(&tmp_c);
2205 WIPE_BUFFER(s_scalar);
2208 void crypto_eddsa_scalarbase(u8 point[32], const u8 scalar[32])
2210 ge P;
2211 ge_scalarmult_base(&P, scalar);
2212 ge_tobytes(point, &P);
2213 WIPE_CTX(&P);
2216 void crypto_eddsa_key_pair(u8 secret_key[64], u8 public_key[32], u8 seed[32])
2218 // To allow overlaps, observable writes happen in this order:
2219 // 1. seed
2220 // 2. secret_key
2221 // 3. public_key
2222 u8 a[64];
2223 COPY(a, seed, 32);
2224 crypto_wipe(seed, 32);
2225 COPY(secret_key, a, 32);
2226 crypto_blake2b(a, 64, a, 32);
2227 crypto_eddsa_trim_scalar(a, a);
2228 crypto_eddsa_scalarbase(secret_key + 32, a);
2229 COPY(public_key, secret_key + 32, 32);
2230 WIPE_BUFFER(a);
2233 static void hash_reduce(u8 h[32],
2234 const u8 *a, size_t a_size,
2235 const u8 *b, size_t b_size,
2236 const u8 *c, size_t c_size)
2238 u8 hash[64];
2239 crypto_blake2b_ctx ctx;
2240 crypto_blake2b_init (&ctx, 64);
2241 crypto_blake2b_update(&ctx, a, a_size);
2242 crypto_blake2b_update(&ctx, b, b_size);
2243 crypto_blake2b_update(&ctx, c, c_size);
2244 crypto_blake2b_final (&ctx, hash);
2245 crypto_eddsa_reduce(h, hash);
2248 // Digital signature of a message with a secret key.
2250 // The secret key comprises two parts:
2251 // - The seed that generates the key (secret_key[ 0..31])
2252 // - The public key (secret_key[32..63])
2254 // The seed and the public key are bundled together to make sure users
2255 // don't use mismatched seeds and public keys, which would instantly
2256 // leak the secret scalar and allow forgeries (allowing this to happen
2257 // has resulted in critical vulnerabilities in the wild).
2259 // The seed is hashed to derive the secret scalar and a secret prefix.
2260 // The sole purpose of the prefix is to generate a secret random nonce.
2261 // The properties of that nonce must be as follows:
2262 // - Unique: we need a different one for each message.
2263 // - Secret: third parties must not be able to predict it.
2264 // - Random: any detectable bias would break all security.
2266 // There are two ways to achieve these properties. The obvious one is
2267 // to simply generate a random number. Here that would be a parameter
2268 // (Monocypher doesn't have an RNG). It works, but then users may reuse
2269 // the nonce by accident, which _also_ leaks the secret scalar and
2270 // allows forgeries. This has happened in the wild too.
2272 // This is no good, so instead we generate that nonce deterministically
2273 // by reducing modulo L a hash of the secret prefix and the message.
2274 // The secret prefix makes the nonce unpredictable, the message makes it
2275 // unique, and the hash/reduce removes all bias.
2277 // The cost of that safety is hashing the message twice. If that cost
2278 // is unacceptable, there are two alternatives:
2280 // - Signing a hash of the message instead of the message itself. This
2281 // is fine as long as the hash is collision resistant. It is not
2282 // compatible with existing "pure" signatures, but at least it's safe.
2284 // - Using a random nonce. Please exercise **EXTREME CAUTION** if you
2285 // ever do that. It is absolutely **critical** that the nonce is
2286 // really an unbiased random number between 0 and L-1, never reused,
2287 // and wiped immediately.
2289 // To lower the likelihood of complete catastrophe if the RNG is
2290 // either flawed or misused, you can hash the RNG output together with
2291 // the secret prefix and the beginning of the message, and use the
2292 // reduction of that hash instead of the RNG output itself. It's not
2293 // foolproof (you'd need to hash the whole message) but it helps.
2295 // Signing a message involves the following operations:
2297 // a, prefix = HASH(secret_key[0..31])     (a is the secret scalar)
2298 // r = HASH(prefix || message) % L
2299 // R = [r]B
2300 // h = HASH(R || public_key || message) % L
2301 // S = ((h * a) + r) % L
2302 // signature = R || S
2303 void crypto_eddsa_sign(u8 signature [64], const u8 secret_key[64],
2304 const u8 *message, size_t message_size)
2306 u8 a[64]; // secret scalar and prefix
2307 u8 r[32]; // secret deterministic "random" nonce
2308 u8 h[32]; // publicly verifiable hash of the message (not wiped)
2309 u8 R[32]; // first half of the signature (allows overlapping inputs)
2311 crypto_blake2b(a, 64, secret_key, 32);
2312 crypto_eddsa_trim_scalar(a, a);
2313 hash_reduce(r, a + 32, 32, message, message_size, 0, 0);
2314 crypto_eddsa_scalarbase(R, r);
2315 hash_reduce(h, R, 32, secret_key + 32, 32, message, message_size);
2316 COPY(signature, R, 32);
2317 crypto_eddsa_mul_add(signature + 32, h, a, r);
2319 WIPE_BUFFER(a);
2320 WIPE_BUFFER(r);
2323 // To check the signature R, S of the message M with the public key A,
2324 // there are 3 steps:
2326 // compute h = HASH(R || A || message) % L
2327 // check that A is on the curve.
2328 // check that R == [S]B - [h]A
2330 // The last two steps are done in crypto_eddsa_check_equation()
2331 int crypto_eddsa_check(const u8 signature[64], const u8 public_key[32],
2332 const u8 *message, size_t message_size)
2334 u8 h[32];
2335 hash_reduce(h, signature, 32, public_key, 32, message, message_size);
2336 return crypto_eddsa_check_equation(signature, public_key, h);
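//
// Usage sketch (illustrative, not part of the library): a full sign/verify
// round trip with the functions above. `random_buffer()` is a hypothetical
// stand-in for a CSPRNG; msg/msg_size are the message to sign.
//
//     u8 seed[32];  random_buffer(seed, 32);
//     u8 sk[64], pk[32], sig[64];
//     crypto_eddsa_key_pair(sk, pk, seed);        // wipes the seed
//     crypto_eddsa_sign(sig, sk, msg, msg_size);
//     if (crypto_eddsa_check(sig, pk, msg, msg_size)) {
//         // corrupted or forged message, reject it
//     }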
2339 /////////////////////////
2340 /// EdDSA <--> X25519 ///
2341 /////////////////////////
2342 void crypto_eddsa_to_x25519(u8 x25519[32], const u8 eddsa[32])
2344 // (u, v) = ((1+y)/(1-y), sqrt(-486664)*u/x)
2345 // Only converting y to u, the sign of x is ignored.
2346 fe t1, t2;
2347 fe_frombytes(t2, eddsa);
2348 fe_add(t1, fe_one, t2);
2349 fe_sub(t2, fe_one, t2);
2350 fe_invert(t2, t2);
2351 fe_mul(t1, t1, t2);
2352 fe_tobytes(x25519, t1);
2353 WIPE_BUFFER(t1);
2354 WIPE_BUFFER(t2);
2357 void crypto_x25519_to_eddsa(u8 eddsa[32], const u8 x25519[32])
2359 // (x, y) = (sqrt(-486664)*u/v, (u-1)/(u+1))
2360 // Only converting u to y, x is assumed positive.
2361 fe t1, t2;
2362 fe_frombytes(t2, x25519);
2363 fe_sub(t1, t2, fe_one);
2364 fe_add(t2, t2, fe_one);
2365 fe_invert(t2, t2);
2366 fe_mul(t1, t1, t2);
2367 fe_tobytes(eddsa, t1);
2368 WIPE_BUFFER(t1);
2369 WIPE_BUFFER(t2);
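//
// Sketch (illustrative): converting an Ed25519 public key so it can be used
// as an X25519 public key, for instance as the last argument of
// crypto_x25519(). `eddsa_pk` is assumed to come from
// crypto_eddsa_key_pair() above.
//
//     u8 x25519_pk[32];
//     crypto_eddsa_to_x25519(x25519_pk, eddsa_pk);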
2372 /////////////////////////////////////////////
2373 /// Dirty ephemeral public key generation ///
2374 /////////////////////////////////////////////
2376 // These functions generate a public key, *without* clearing the
2377 // cofactor. Sending that key over the network leaks 3 bits of the
2378 // private key. Use only to generate ephemeral keys that will be hidden
2379 // with crypto_elligator_rev() (defined below).
2381 // The public key is otherwise compatible with crypto_x25519(), which
2382 // properly clears the cofactor.
2384 // Note that the distribution of the resulting public keys is almost
2385 // uniform. Flipping the sign of the v coordinate (not provided by this
2386 // function) covers the entire key space almost perfectly, where
2387 // "almost" means a 2^-128 bias (undetectable). This uniformity is
2388 // needed to ensure the proper randomness of the resulting
2389 // representatives (once we apply crypto_elligator_rev()).
2391 // Recall that Curve25519 has order C = 2^255 + e, with e < 2^128 (not
2392 // to be confused with the prime order of the main subgroup, L, which is
2393 // 8 times less than that).
2395 // Generating all points would require us to multiply a point of order C
2396 // (the base point plus any point of order 8) by all scalars from 0 to
2397 // C-1. Clamping limits us to scalars between 2^254 and 2^255 - 1. But
2398 // by negating the resulting point at random, we also cover scalars from
2399 // -2^255 + 1 to -2^254 (which modulo C is congruent to e+1 to 2^254 + e).
2401 // In practice:
2402 // - Scalars from 0 to e + 1 are never generated
2403 // - Scalars from 2^255 to 2^255 + e are never generated
2404 // - Scalars from 2^254 + 1 to 2^254 + e are generated twice
2406 // Since e < 2^128, detecting this bias requires observing over 2^100
2407 // representatives from a given source (this will never happen), *and*
2408 // recovering enough of the private key to determine that they do, or do
2409 // not, belong to the biased set (this practically requires solving
2410 // discrete logarithm, which is conjecturally intractable).
2412 // In practice, this means the bias is impossible to detect.
2414 // s + (x*L) % 8*L
2415 // Guaranteed to fit in 256 bits iff s fits in 255 bits.
2416 // L < 2^253
2417 // x%8 < 2^3
2418 // L * (x%8) < 2^255
2419 // s < 2^255
2420 // s + L * (x%8) < 2^256
2421 static void add_xl(u8 s[32], u8 x)
2423 u64 mod8 = x & 7;
2424 u64 carry = 0;
2425 FOR (i , 0, 8) {
2426 carry = carry + load32_le(s + 4*i) + L[i] * mod8;
2427 store32_le(s + 4*i, (u32)carry);
2428 carry >>= 32;
2432 // "Small" dirty ephemeral key.
2433 // Use if you need to shrink the size of the binary, and can afford to
2434 // slow down by a factor of two (compared to the fast version)
2436 // This version works by decoupling the cofactor from the main factor.
2438 // - The trimmed scalar determines the main factor
2439 // - The clamped bits of the scalar determine the cofactor.
2441 // Cofactor and main factor are combined into a single scalar, which is
2442 // then multiplied by a point of order 8*L (unlike the base point, which
2443 // has prime order). That "dirty" base point is the addition of the
2444 // regular base point (9), and a point of order 8.
2445 void crypto_x25519_dirty_small(u8 public_key[32], const u8 secret_key[32])
2447 // Base point of order 8*L
2448 // Raw scalar multiplication with it does not clear the cofactor,
2449 // and the resulting public key will reveal 3 bits of the scalar.
2451 // The low order component of this base point has been chosen
2452 // to yield the same results as crypto_x25519_dirty_fast().
2453 static const u8 dirty_base_point[32] = {
2454 0xd8, 0x86, 0x1a, 0xa2, 0x78, 0x7a, 0xd9, 0x26,
2455 0x8b, 0x74, 0x74, 0xb6, 0x82, 0xe3, 0xbe, 0xc3,
2456 0xce, 0x36, 0x9a, 0x1e, 0x5e, 0x31, 0x47, 0xa2,
2457 0x6d, 0x37, 0x7c, 0xfd, 0x20, 0xb5, 0xdf, 0x75,
2459 // separate the main factor & the cofactor of the scalar
2460 u8 scalar[32];
2461 crypto_eddsa_trim_scalar(scalar, secret_key);
2463 // Separate the main factor and the cofactor
2465 // The scalar is trimmed, so its cofactor is cleared. The three
2466 // least significant bits however still have a main factor. We must
2467 // remove it for X25519 compatibility.
2469 // cofactor = lsb * L (modulo 8*L)
2470 // combined = scalar + cofactor (modulo 8*L)
2471 add_xl(scalar, secret_key[0]);
2472 scalarmult(public_key, scalar, dirty_base_point, 256);
2473 WIPE_BUFFER(scalar);
2476 // Select low order point
2477 // We're computing the [cofactor]lop scalar multiplication, where:
2479 // cofactor = tweak & 7.
2480 // lop = (lop_x, lop_y)
2481 // lop_x = sqrt((sqrt(d + 1) + 1) / d)
2482 // lop_y = -lop_x * sqrtm1
2484 // The low order point has order 8. There are 4 such points. We've
2485 // chosen the one whose coordinates are both positive (below p/2).
2486 // The 8 low order points are as follows:
2488 // [0]lop = ( 0 , 1 )
2489 // [1]lop = ( lop_x , lop_y)
2490 // [2]lop = ( sqrt(-1), -0 )
2491 // [3]lop = ( lop_x , -lop_y)
2492 // [4]lop = (-0 , -1 )
2493 // [5]lop = (-lop_x , -lop_y)
2494 // [6]lop = (-sqrt(-1), 0 )
2495 // [7]lop = (-lop_x , lop_y)
2497 // The x coordinate is either 0, sqrt(-1), lop_x, or their opposite.
2498 // The y coordinate is either 0, -1 , lop_y, or their opposite.
2499 // The pattern for both is the same, except for a rotation of 2 (modulo 8)
2501 // This helper function captures the pattern, and we can use it thus:
2503 // select_lop(x, lop_x, sqrtm1, cofactor);
2504 // select_lop(y, lop_y, fe_one, cofactor + 2);
2506 // This is faster than an actual scalar multiplication,
2507 // and requires less code than naive constant time look up.
2508 static void select_lop(fe out, const fe x, const fe k, u8 cofactor)
2510 fe tmp;
2511 fe_0(out);
2512 fe_ccopy(out, k , (cofactor >> 1) & 1); // bit 1
2513 fe_ccopy(out, x , (cofactor >> 0) & 1); // bit 0
2514 fe_neg (tmp, out);
2515 fe_ccopy(out, tmp, (cofactor >> 2) & 1); // bit 2
2516 WIPE_BUFFER(tmp);
2519 // "Fast" dirty ephemeral key
2520 // We use this one by default.
2522 // This version works by performing a regular scalar multiplication,
2523 // then adding a low order point. The scalar multiplication is done in
2524 // Edwards space for more speed (*2 compared to the "small" version).
2525 // The cost is a bigger binary for programs that don't also sign messages.
2526 void crypto_x25519_dirty_fast(u8 public_key[32], const u8 secret_key[32])
2528 // Compute clean scalar multiplication
2529 u8 scalar[32];
2530 ge pk;
2531 crypto_eddsa_trim_scalar(scalar, secret_key);
2532 ge_scalarmult_base(&pk, scalar);
2534 // Compute low order point
2535 fe t1, t2;
2536 select_lop(t1, lop_x, sqrtm1, secret_key[0]);
2537 select_lop(t2, lop_y, fe_one, secret_key[0] + 2);
2538 ge_precomp low_order_point;
2539 fe_add(low_order_point.Yp, t2, t1);
2540 fe_sub(low_order_point.Ym, t2, t1);
2541 fe_mul(low_order_point.T2, t2, t1);
2542 fe_mul(low_order_point.T2, low_order_point.T2, D2);
2544 // Add low order point to the public key
2545 ge_madd(&pk, &pk, &low_order_point, t1, t2);
2547 // Convert to Montgomery u coordinate (we ignore the sign)
2548 fe_add(t1, pk.Z, pk.Y);
2549 fe_sub(t2, pk.Z, pk.Y);
2550 fe_invert(t2, t2);
2551 fe_mul(t1, t1, t2);
2553 fe_tobytes(public_key, t1);
2555 WIPE_BUFFER(t1); WIPE_CTX(&pk);
2556 WIPE_BUFFER(t2); WIPE_CTX(&low_order_point);
2557 WIPE_BUFFER(scalar);
2560 ///////////////////
2561 /// Elligator 2 ///
2562 ///////////////////
2563 static const fe A = {486662};
2565 // Elligator direct map
2567 // Computes the point corresponding to a representative, encoded in 32
2568 // bytes (little endian). Since positive representatives fit in 254
2569 // bits, the two most significant bits are ignored.
2571 // From the paper:
2572 // w = -A / (fe(1) + non_square * r^2)
2573 // e = chi(w^3 + A*w^2 + w)
2574 // u = e*w - (fe(1)-e)*(A//2)
2575 // v = -e * sqrt(u^3 + A*u^2 + u)
2577 // We ignore v because we don't need it for X25519 (the Montgomery
2578 // ladder only uses u).
2580 // Note that e is either 0, 1 or -1
2581 // if e = 0 u = 0 and v = 0
2582 // if e = 1 u = w
2583 // if e = -1 u = -w - A = w * non_square * r^2
2585 // Let r1 = non_square * r^2
2586 // Let r2 = 1 + r1
2587 // Note that r2 cannot be zero, -1/non_square is not a square.
2588 // We can (tediously) verify that:
2589 // w^3 + A*w^2 + w = (A^2*r1 - r2^2) * A / r2^3
2590 // Therefore:
2591 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3))
2592 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * 1
2593 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3)) * chi(r2^6)
2594 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * (A / r2^3) * r2^6)
2595 // chi(w^3 + A*w^2 + w) = chi((A^2*r1 - r2^2) * A * r2^3)
2596 // Corollary:
2597 // e = 1 if (A^2*r1 - r2^2) * A * r2^3 is a non-zero square
2598 // e = -1 if (A^2*r1 - r2^2) * A * r2^3 is not a square
2599 // Note that w^3 + A*w^2 + w (and therefore e) can never be zero:
2600 // w^3 + A*w^2 + w = w * (w^2 + A*w + 1)
2601 // w^3 + A*w^2 + w = w * (w^2 + A*w + A^2/4 - A^2/4 + 1)
2602 // w^3 + A*w^2 + w = w * ((w + A/2)^2 - A^2/4 + 1)
2603 // which is zero only if:
2604 // w = 0 (impossible)
2605 // (w + A/2)^2 = A^2/4 - 1 (impossible, because A^2/4-1 is not a square)
2607 // Let isr = invsqrt((A^2*r1 - r2^2) * A * r2^3)
2608 // isr = sqrt(1 / ((A^2*r1 - r2^2) * A * r2^3)) if e = 1
2609 // isr = sqrt(sqrt(-1) / ((A^2*r1 - r2^2) * A * r2^3)) if e = -1
2611 // if e = 1
2612 // let u1 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2
2613 // u1 = w
2614 // u1 = u
2616 // if e = -1
2617 // let ufactor = -non_square * sqrt(-1) * r^2
2618 // let vfactor = sqrt(ufactor)
2619 // let u2 = -A * (A^2*r1 - r2^2) * A * r2^2 * isr^2 * ufactor
2620 // u2 = w * -1 * -non_square * r^2
2621 // u2 = w * non_square * r^2
2622 // u2 = u
2623 void crypto_elligator_map(u8 curve[32], const u8 hidden[32])
2625 fe r, u, t1, t2, t3;
2626 fe_frombytes_mask(r, hidden, 2); // r is encoded in 254 bits.
2627 fe_sq(r, r);
2628 fe_add(t1, r, r);
2629 fe_add(u, t1, fe_one);
2630 fe_sq (t2, u);
2631 fe_mul(t3, A2, t1);
2632 fe_sub(t3, t3, t2);
2633 fe_mul(t3, t3, A);
2634 fe_mul(t1, t2, u);
2635 fe_mul(t1, t3, t1);
2636 int is_square = invsqrt(t1, t1);
2637 fe_mul(u, r, ufactor);
2638 fe_ccopy(u, fe_one, is_square);
2639 fe_sq (t1, t1);
2640 fe_mul(u, u, A);
2641 fe_mul(u, u, t3);
2642 fe_mul(u, u, t2);
2643 fe_mul(u, u, t1);
2644 fe_neg(u, u);
2645 fe_tobytes(curve, u);
2647 WIPE_BUFFER(t1); WIPE_BUFFER(r);
2648 WIPE_BUFFER(t2); WIPE_BUFFER(u);
2649 WIPE_BUFFER(t3);
2652 // Elligator inverse map
2654 // Computes the representative of a point, if possible. If not, it does
2655 // nothing and returns -1. Note that the success of the operation
2656 // depends only on the point (more precisely its u coordinate). The
2657 // tweak parameter is used only upon success
2659 // The tweak should be a random byte. Beyond that, its contents are an
2660 // implementation detail. Currently, the tweak comprises:
2661 // - Bit 1 : sign of the v coordinate (0 if positive, 1 if negative)
2662 // - Bit 2-5: not used
2663 // - Bits 6-7: random padding
2665 // From the paper:
2666 // Let sq = -non_square * u * (u+A)
2667 // if sq is not a square, or u = -A, there is no mapping
2668 // Assuming there is a mapping:
2669 // if v is positive: r = sqrt(-u / (non_square * (u+A)))
2670 // if v is negative: r = sqrt(-(u+A) / (non_square * u ))
2672 // We compute isr = invsqrt(-non_square * u * (u+A))
2673 // if it wasn't a square, abort.
2674 // else, isr = sqrt(-1 / (non_square * u * (u+A)))
2676 // If v is positive, we return isr * u:
2677 // isr * u = sqrt(-1 / (non_square * u * (u+A))) * u
2678 // isr * u = sqrt(-u / (non_square * (u+A)))
2680 // If v is negative, we return isr * (u+A):
2681 // isr * (u+A) = sqrt(-1 / (non_square * u * (u+A))) * (u+A)
2682 // isr * (u+A) = sqrt(-(u+A) / (non_square * u))
2683 int crypto_elligator_rev(u8 hidden[32], const u8 public_key[32], u8 tweak)
2685 fe t1, t2, t3;
2686 fe_frombytes(t1, public_key); // t1 = u
2688 fe_add(t2, t1, A); // t2 = u + A
2689 fe_mul(t3, t1, t2);
2690 fe_mul_small(t3, t3, -2);
2691 int is_square = invsqrt(t3, t3); // t3 = sqrt(-1 / (non_square * u * (u+A)))
2692 if (is_square) {
2693 // The only variable time bit. This ultimately reveals how many
2694 // tries it took us to find a representable key.
2695 // This does not affect security as long as we try keys at random.
2697 fe_ccopy (t1, t2, tweak & 1); // multiply by u if v is positive,
2698 fe_mul (t3, t1, t3); // multiply by u+A otherwise
2699 fe_mul_small(t1, t3, 2);
2700 fe_neg (t2, t3);
2701 fe_ccopy (t3, t2, fe_isodd(t1));
2702 fe_tobytes(hidden, t3);
2704 // Pad with two random bits
2705 hidden[31] |= tweak & 0xc0;
2708 WIPE_BUFFER(t1);
2709 WIPE_BUFFER(t2);
2710 WIPE_BUFFER(t3);
2711 return is_square - 1;
2714 void crypto_elligator_key_pair(u8 hidden[32], u8 secret_key[32], u8 seed[32])
2716 u8 pk [32]; // public key
2717 u8 buf[64]; // seed + representative
2718 COPY(buf + 32, seed, 32);
2719 do {
2720 crypto_chacha20_djb(buf, 0, 64, buf+32, zero, 0);
2721 crypto_x25519_dirty_fast(pk, buf); // or the "small" version
2722 } while(crypto_elligator_rev(buf+32, pk, buf[32]));
2723 // Note that the return value of crypto_elligator_rev() is
2724 // independent from its tweak parameter.
2725 // Therefore, buf[32] is not actually reused. Either we loop one
2726 // more time and buf[32] is used for the new seed, or we succeeded,
2727 // and buf[32] becomes the tweak parameter.
2729 crypto_wipe(seed, 32);
2730 COPY(hidden , buf + 32, 32);
2731 COPY(secret_key, buf , 32);
2732 WIPE_BUFFER(buf);
2733 WIPE_BUFFER(pk);
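//
// Usage sketch (illustrative): a hidden ephemeral key exchange.
// `random_buffer()` is a hypothetical CSPRNG stand-in and `their_pk` is the
// recipient's X25519 public key.
//
//     u8 seed[32];  random_buffer(seed, 32);
//     u8 hidden[32], esk[32], shared[32];
//     crypto_elligator_key_pair(hidden, esk, seed); // send `hidden` only
//     crypto_x25519(shared, esk, their_pk);
//
// The recipient recovers the ephemeral public key with
// crypto_elligator_map(epk, hidden), then computes the same shared point
// with crypto_x25519(shared, their_sk, epk).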
2736 ///////////////////////
2737 /// Scalar division ///
2738 ///////////////////////
2740 // Montgomery reduction.
2741 // Divides x by (2^256), and reduces the result modulo L
2743 // Precondition:
2744 // x < L * 2^256
2745 // Constants:
2746 // r = 2^256 (makes division by r trivial)
2747 // k = (r * (1/r) - 1) // L (1/r is computed modulo L )
2748 // Algorithm:
2749 // s = (x * k) % r
2750 // t = x + s*L (t is always a multiple of r)
2751 // u = (t/r) % L (u is always below 2*L, conditional subtraction is enough)
2752 static void redc(u32 u[8], u32 x[16])
2754 static const u32 k[8] = {
2755 0x12547e1b, 0xd2b51da3, 0xfdba84ff, 0xb1a206f2,
2756 0xffa36bea, 0x14e75438, 0x6fe91836, 0x9db6c6f2,
2759 // s = x * k (modulo 2^256)
2760 // This is cheaper than the full multiplication.
2761 u32 s[8] = {0};
2762 FOR (i, 0, 8) {
2763 u64 carry = 0;
2764 FOR (j, 0, 8-i) {
2765 carry += s[i+j] + (u64)x[i] * k[j];
2766 s[i+j] = (u32)carry;
2767 carry >>= 32;
2770 u32 t[16] = {0};
2771 multiply(t, s, L);
2773 // t = t + x
2774 u64 carry = 0;
2775 FOR (i, 0, 16) {
2776 carry += (u64)t[i] + x[i];
2777 t[i] = (u32)carry;
2778 carry >>= 32;
2781 // u = (t / 2^256) % L
2782 // Note that t / 2^256 is always below 2*L,
2783 // So a constant time conditional subtraction is enough
2784 remove_l(u, t+8);
2786 WIPE_BUFFER(s);
2787 WIPE_BUFFER(t);
2790 void crypto_x25519_inverse(u8 blind_salt [32], const u8 private_key[32],
2791 const u8 curve_point[32])
2793 static const u8 Lm2[32] = { // L - 2
2794 0xeb, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58,
2795 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14,
2796 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
2797 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
2799 // 1 in Montgomery form
2800 u32 m_inv [8] = {
2801 0x8d98951d, 0xd6ec3174, 0x737dcf70, 0xc6ef5bf4,
2802 0xfffffffe, 0xffffffff, 0xffffffff, 0x0fffffff,
2805 u8 scalar[32];
2806 crypto_eddsa_trim_scalar(scalar, private_key);
2808 // Convert the scalar to Montgomery form
2809 // m_scl = scalar * 2^256 (modulo L)
2810 u32 m_scl[8];
2812 u32 tmp[16];
2813 ZERO(tmp, 8);
2814 load32_le_buf(tmp+8, scalar, 8);
2815 mod_l(scalar, tmp);
2816 load32_le_buf(m_scl, scalar, 8);
2817 WIPE_BUFFER(tmp); // Wipe ASAP to save stack space
2820 // Compute the inverse
2821 u32 product[16];
2822 for (int i = 252; i >= 0; i--) {
2823 ZERO(product, 16);
2824 multiply(product, m_inv, m_inv);
2825 redc(m_inv, product);
2826 if (scalar_bit(Lm2, i)) {
2827 ZERO(product, 16);
2828 multiply(product, m_inv, m_scl);
2829 redc(m_inv, product);
2832 // Convert the inverse *out* of Montgomery form
2833 // scalar = m_inv / 2^256 (modulo L)
2834 COPY(product, m_inv, 8);
2835 ZERO(product + 8, 8);
2836 redc(m_inv, product);
2837 store32_le_buf(scalar, m_inv, 8); // the *inverse* of the scalar
2839 // Clear the cofactor of scalar:
2840 // cleared = scalar * (3*L + 1) (modulo 8*L)
2841 // cleared = scalar + scalar * 3 * L (modulo 8*L)
2842 // Note that (scalar * 3) is reduced modulo 8, so we only need the
2843 // first byte.
2844 add_xl(scalar, scalar[0] * 3);
2846 // Recall that 8*L < 2^256. However it is also very close to
2847 // 2^255. If we spanned the ladder over 255 bits, random tests
2848 // wouldn't catch the off-by-one error.
2849 scalarmult(blind_salt, scalar, curve_point, 256);
2851 WIPE_BUFFER(scalar); WIPE_BUFFER(m_scl);
2852 WIPE_BUFFER(product); WIPE_BUFFER(m_inv);
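//
// Sketch (illustrative): crypto_x25519_inverse() undoes a scalar
// multiplication by the same private key, which is what exponential
// blinding needs. `P` is assumed to be a proper X25519 public key (no
// low-order component) and `sk` a 32-byte secret scalar.
//
//     u8 blinded[32], unblinded[32];
//     crypto_x25519(blinded, sk, P);                 // [k]P
//     crypto_x25519_inverse(unblinded, sk, blinded); // [1/k][k]P == P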
2855 ////////////////////////////////
2856 /// Authenticated encryption ///
2857 ////////////////////////////////
2858 static void lock_auth(u8 mac[16], const u8 auth_key[32],
2859 const u8 *ad , size_t ad_size,
2860 const u8 *cipher_text, size_t text_size)
2862 u8 sizes[16]; // Not secret, not wiped
2863 store64_le(sizes + 0, ad_size);
2864 store64_le(sizes + 8, text_size);
2865 crypto_poly1305_ctx poly_ctx; // auto wiped...
2866 crypto_poly1305_init (&poly_ctx, auth_key);
2867 crypto_poly1305_update(&poly_ctx, ad , ad_size);
2868 crypto_poly1305_update(&poly_ctx, zero , gap(ad_size, 16));
2869 crypto_poly1305_update(&poly_ctx, cipher_text, text_size);
2870 crypto_poly1305_update(&poly_ctx, zero , gap(text_size, 16));
2871 crypto_poly1305_update(&poly_ctx, sizes , 16);
2872 crypto_poly1305_final (&poly_ctx, mac); // ...here
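// The layout above (associated data padded to a 16-byte boundary, then the
// ciphertext padded likewise, then both lengths as little-endian 64-bit
// words) matches the usual ChaCha20-Poly1305 construction (RFC 8439).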
2875 void crypto_aead_init_x(crypto_aead_ctx *ctx,
2876 const u8 key[32], const u8 nonce[24])
2878 crypto_chacha20_h(ctx->key, key, nonce);
2879 COPY(ctx->nonce, nonce + 16, 8);
2880 ctx->counter = 0;
2883 void crypto_aead_init_djb(crypto_aead_ctx *ctx,
2884 const u8 key[32], const u8 nonce[8])
2886 COPY(ctx->key , key , 32);
2887 COPY(ctx->nonce, nonce, 8);
2888 ctx->counter = 0;
2891 void crypto_aead_init_ietf(crypto_aead_ctx *ctx,
2892 const u8 key[32], const u8 nonce[12])
2894 COPY(ctx->key , key , 32);
2895 COPY(ctx->nonce, nonce + 4, 8);
2896 ctx->counter = (u64)load32_le(nonce) << 32;
2899 void crypto_aead_write(crypto_aead_ctx *ctx, u8 *cipher_text, u8 mac[16],
2900 const u8 *ad, size_t ad_size,
2901 const u8 *plain_text, size_t text_size)
2903 u8 auth_key[64]; // the last 32 bytes are used for rekeying.
2904 crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter);
2905 crypto_chacha20_djb(cipher_text, plain_text, text_size,
2906 ctx->key, ctx->nonce, ctx->counter + 1);
2907 lock_auth(mac, auth_key, ad, ad_size, cipher_text, text_size);
2908 COPY(ctx->key, auth_key + 32, 32);
2909 WIPE_BUFFER(auth_key);
2912 int crypto_aead_read(crypto_aead_ctx *ctx, u8 *plain_text, const u8 mac[16],
2913 const u8 *ad, size_t ad_size,
2914 const u8 *cipher_text, size_t text_size)
2916 u8 auth_key[64]; // the last 32 bytes are used for rekeying.
2917 u8 real_mac[16];
2918 crypto_chacha20_djb(auth_key, 0, 64, ctx->key, ctx->nonce, ctx->counter);
2919 lock_auth(real_mac, auth_key, ad, ad_size, cipher_text, text_size);
2920 int mismatch = crypto_verify16(mac, real_mac);
2921 if (!mismatch) {
2922 crypto_chacha20_djb(plain_text, cipher_text, text_size,
2923 ctx->key, ctx->nonce, ctx->counter + 1);
2924 COPY(ctx->key, auth_key + 32, 32);
2926 WIPE_BUFFER(auth_key);
2927 WIPE_BUFFER(real_mac);
2928 return mismatch;
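// Note that each successful crypto_aead_write()/crypto_aead_read() call
// above replaces ctx->key with fresh key material, so an incremental
// session behaves like a ratchet: both sides must process the messages in
// the same order.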
2931 void crypto_aead_lock(u8 *cipher_text, u8 mac[16], const u8 key[32],
2932 const u8 nonce[24], const u8 *ad, size_t ad_size,
2933 const u8 *plain_text, size_t text_size)
2935 crypto_aead_ctx ctx;
2936 crypto_aead_init_x(&ctx, key, nonce);
2937 crypto_aead_write(&ctx, cipher_text, mac, ad, ad_size,
2938 plain_text, text_size);
2939 crypto_wipe(&ctx, sizeof(ctx));
2942 int crypto_aead_unlock(u8 *plain_text, const u8 mac[16], const u8 key[32],
2943 const u8 nonce[24], const u8 *ad, size_t ad_size,
2944 const u8 *cipher_text, size_t text_size)
2946 crypto_aead_ctx ctx;
2947 crypto_aead_init_x(&ctx, key, nonce);
2948 int mismatch = crypto_aead_read(&ctx, plain_text, mac, ad, ad_size,
2949 cipher_text, text_size);
2950 crypto_wipe(&ctx, sizeof(ctx));
2951 return mismatch;
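//
// Usage sketch (illustrative, not part of the library): one-shot
// authenticated encryption with the two functions above. `key` is a 32-byte
// secret, `nonce` a 24-byte value never reused with the same key (random
// nonces are fine at this size), and `plain`/`cipher` are buffers of
// `size` bytes. Passing 0, 0 means no associated data.
//
//     u8 mac[16];
//     crypto_aead_lock(cipher, mac, key, nonce, 0, 0, plain, size);
//     ...
//     if (crypto_aead_unlock(plain, mac, key, nonce, 0, 0, cipher, size)) {
//         // corrupted or forged message, reject it
//     }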
2954 #ifdef MONOCYPHER_CPP_NAMESPACE
2955 }
2956 #endif