module/icp/algs/skein/skein_block.c

   1 /*
   2  * Implementation of the Skein block functions.
   3  * Source code author: Doug Whiting, 2008.
   4  * This algorithm and source code is released to the public domain.
   5  * Compile-time switches:
   6  *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
   7  *                    versions use ASM code for block processing
   8  *                    [default: use C for all block sizes]
   9  */
  10 /* Copyright 2013 Doug Whiting. This code is released to the public domain. */
  11
  12 #include <sys/skein.h>
  13 #include "skein_impl.h"
  14 #include <sys/isa_defs.h>       /* for _ILP32 */
  15
  16 #ifndef SKEIN_USE_ASM
  17 #define SKEIN_USE_ASM   (0)     /* default is all C code (no ASM) */
  18 #endif
  19
  20 #ifndef SKEIN_LOOP
  21 /*
  22  * The low-level checksum routines use a lot of stack space. On systems where
  23  * small stack frames are enforced (like 32-bit kernel builds), do not unroll
  24  * checksum calculations to save stack space.
  25  *
  26  * Even with no loops unrolled, we still can exceed the 1k stack frame limit
  27  * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32).  We can
  28  * safely ignore it though, since the checksum functions will be called
  29  * from a worker thread that won't be using much stack.  That's why we have
  30  * the #pragma here to ignore the warning.
  31  */
  32 #if defined(_ILP32) || defined(__powerpc)       /* Assume small stack */
  33 #if defined(__GNUC__) && !defined(__clang__)
  34 #pragma GCC diagnostic ignored "-Wframe-larger-than="
  35 #endif
  36 /*
  37  * We're running on 32-bit, don't unroll loops to save stack frame space
  38  *
  39  * Due to the ways the calculations on SKEIN_LOOP are done in
  40  * Skein_*_Process_Block(), a value of 111 disables unrolling loops
  41  * in any of those functions.
      *
      * The decimal digits of SKEIN_LOOP are read as: hundreds = Skein_256,
      * tens = Skein_512, ones = Skein1024.  A digit of 0 selects the fully
      * unrolled code for that block size; a nonzero digit is the number of
      * 8-round groups emitted per loop iteration.
  42  */
  43 #define SKEIN_LOOP 111
  44 #else
  45 /* We're compiling with large stacks */
  46 #define SKEIN_LOOP 001          /* default: unroll 256 and 512, but not 1024 */
     /* (digits 0, 0, 1: only the Skein1024 digit is nonzero) */
  47 #endif
  48 #endif
  49
  50 /* some useful definitions for code here */
     /*
      * BLK_BITS, ks and ts are only usable inside the Process_Block functions
      * below: BLK_BITS relies on the WCNT enum constant and ks/ts point into
      * the kw[] array, both of which each of those functions declares locally.
      * ts starts at kw[KW_TWK_BASE] and ks at kw[KW_KEY_BASE], so the two
      * views share one array.
      */
  51 #define BLK_BITS        (WCNT*64)
  52 #define KW_TWK_BASE     (0)
  53 #define KW_KEY_BASE     (3)
  54 #define ks              (kw + KW_KEY_BASE)
  55 #define ts              (kw + KW_TWK_BASE)
  56
  57 /* no debugging in Illumos version: DebugSaveTweak() expands to nothing */
  58 #define DebugSaveTweak(ctx)
  59
  60 /* Skein_256 */
  61 #if     !(SKEIN_USE_ASM & 256)
  62 void
  63 Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
  64     size_t blkCnt, size_t byteCntAdd)
  65 {
             /*
              * Run the Skein-256 block function over blkCnt consecutive blocks
              * of SKEIN_256_BLOCK_BYTES bytes each, starting at blkPtr.
              *
              *   ctx         chaining words ctx->X[0..3] are read and replaced
              *               for every block; tweak words ctx->h.T[0..1] are
              *               read once on entry and stored back on return
              *   blkPtr      input data, loaded as little-endian 64-bit words
              *   blkCnt      number of blocks; must not be 0 (the do/while
              *               loop below always executes at least once)
              *   byteCntAdd  added to tweak word T[0] before each block
              */
  66         enum {
  67                 WCNT = SKEIN_256_STATE_WORDS
  68         };
     /* RCNT: number of 8-round groups (one R256_8_rounds() each) per block */
  69 #undef  RCNT
  70 #define RCNT  (SKEIN_256_ROUNDS_TOTAL / 8)
  71
  72 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
  73 #define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
  74 #else
  75 #define SKEIN_UNROLL_256 (0)
  76 #endif
     /*
      * SKEIN_UNROLL_256 == 0 selects the fully unrolled code (no loop, no r);
      * otherwise it is the number of 8-round groups per loop iteration and
      * must divide RCNT evenly.
      */
  77
  78 #if     SKEIN_UNROLL_256
  79 #if     (RCNT % SKEIN_UNROLL_256)
  80 #error "Invalid SKEIN_UNROLL_256"       /* sanity check on unroll count */
  81 #endif
  82         size_t r;
  83         /* key schedule words : chaining vars + tweak + "rotation" */
             /*
              * The looping I256() stores one extra key word and one extra
              * tweak word past the ones it just used on every injection,
              * hence the additional RCNT * 2 slots.
              */
  84         uint64_t kw[WCNT + 4 + RCNT * 2];
  85 #else
  86         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
  87 #endif
  88         /* local copy of context vars, for speed */
  89         uint64_t X0, X1, X2, X3;
  90         uint64_t w[WCNT];               /* local copy of input block */
  91 #ifdef  SKEIN_DEBUG
  92         /* use for debugging (help compiler put Xn in registers) */
  93         const uint64_t *Xptr[4];
  94         Xptr[0] = &X0;
  95         Xptr[1] = &X1;
  96         Xptr[2] = &X2;
  97         Xptr[3] = &X3;
  98 #endif
             /*
              * NOTE(review): Xptr is declared only under SKEIN_DEBUG, yet the
              * Skein_Show_*() calls below name it unconditionally; presumably
              * those macros expand to nothing in non-debug builds -- confirm
              * in skein_impl.h.
              */
  99         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* work on a local copy of the tweak; stored back after the loop */
 100         ts[0] = ctx->h.T[0];
 101         ts[1] = ctx->h.T[1];
 102         do {
 103                 /*
 104                  * this implementation only supports 2**64 input bytes
 105                  * (no carry out here)
 106                  */
 107                 ts[0] += byteCntAdd;    /* update processed length */
 108
 109                 /* precompute the key schedule for this block */
 110                 ks[0] = ctx->X[0];
 111                 ks[1] = ctx->X[1];
 112                 ks[2] = ctx->X[2];
 113                 ks[3] = ctx->X[3];
 114                 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
 115
 116                 ts[2] = ts[0] ^ ts[1];
 117
 118                 /* get input block in little-endian format */
 119                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 120                 DebugSaveTweak(ctx);
 121                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 122
 123                 X0 = w[0] + ks[0];      /* do the first full key injection */
 124                 X1 = w[1] + ks[1] + ts[0];
 125                 X2 = w[2] + ks[2] + ts[1];
 126                 X3 = w[3] + ks[3];
 127
 128                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 129                     Xptr);      /* show starting state values */
 130
                     /* w[] already holds this block; step to the next one */
 131                 blkPtr += SKEIN_256_BLOCK_BYTES;
 132
 133                 /* run the rounds */
 134
 135 #define Round256(p0, p1, p2, p3, ROT, rNum)                          \
 136         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
 137         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
 138
 139 #if     SKEIN_UNROLL_256 == 0
 140 #define R256(p0, p1, p2, p3, ROT, rNum)         /* fully unrolled */    \
 141         Round256(p0, p1, p2, p3, ROT, rNum)             \
 142         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 143
 144 #define I256(R)                                                         \
 145         X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
 146         X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];                    \
 147         X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];                    \
 148         X3 += ks[((R) + 4) % 5] + (R) + 1;                      \
 149         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 150 #else                           /* looping version */
 151 #define R256(p0, p1, p2, p3, ROT, rNum)                             \
 152         Round256(p0, p1, p2, p3, ROT, rNum)                             \
 153         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 154
 155 #define I256(R)                                                         \
 156         X0 += ks[r + (R) + 0];  /* inject the key schedule value */     \
 157         X1 += ks[r + (R) + 1] + ts[r + (R) + 0];                        \
 158         X2 += ks[r + (R) + 2] + ts[r + (R) + 1];                        \
 159         X3 += ks[r + (R) + 3] + r + (R);                                \
 160         ks[r + (R) + 4] = ks[r + (R) - 1];   /* rotate key schedule */  \
 161         ts[r + (R) + 2] = ts[r + (R) - 1];                      \
 162         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 163
 164                 /* loop through it */
                     /*
                      * r starts at 1 and advances by two per 8-round group
                      * (each group performs two key injections), so it stays
                      * odd and the loop body runs RCNT / SKEIN_UNROLL_256 times.
                      */
 165                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
 166 #endif
 167                 {
 168 #define R256_8_rounds(R)                         \
 169         R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);  \
 170         R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);  \
 171         R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);  \
 172         R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);  \
 173         I256(2 * (R));                           \
 174         R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);  \
 175         R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);  \
 176         R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);  \
 177         R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);  \
 178         I256(2 * (R) + 1);
 179
 180                         R256_8_rounds(0);
 181
     /*
      * R256_Unroll_R(NN) is true when 8-round group NN has to be emitted:
      * fully unrolled builds emit every group below the total group count,
      * looping builds emit the groups below the per-iteration unroll count.
      */
 182 #define R256_Unroll_R(NN) \
 183         ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
 184         (SKEIN_UNROLL_256 > (NN)))
 185
 186 #if     R256_Unroll_R(1)
 187                         R256_8_rounds(1);
 188 #endif
 189 #if     R256_Unroll_R(2)
 190                         R256_8_rounds(2);
 191 #endif
 192 #if     R256_Unroll_R(3)
 193                         R256_8_rounds(3);
 194 #endif
 195 #if     R256_Unroll_R(4)
 196                         R256_8_rounds(4);
 197 #endif
 198 #if     R256_Unroll_R(5)
 199                         R256_8_rounds(5);
 200 #endif
 201 #if     R256_Unroll_R(6)
 202                         R256_8_rounds(6);
 203 #endif
 204 #if     R256_Unroll_R(7)
 205                         R256_8_rounds(7);
 206 #endif
 207 #if     R256_Unroll_R(8)
 208                         R256_8_rounds(8);
 209 #endif
 210 #if     R256_Unroll_R(9)
 211                         R256_8_rounds(9);
 212 #endif
 213 #if     R256_Unroll_R(10)
 214                         R256_8_rounds(10);
 215 #endif
 216 #if     R256_Unroll_R(11)
 217                         R256_8_rounds(11);
 218 #endif
 219 #if     R256_Unroll_R(12)
 220                         R256_8_rounds(12);
 221 #endif
 222 #if     R256_Unroll_R(13)
 223                         R256_8_rounds(13);
 224 #endif
 225 #if     R256_Unroll_R(14)
 226                         R256_8_rounds(14);
 227 #endif
 228 #if     (SKEIN_UNROLL_256 > 14)
 229 #error  "need more unrolling in Skein_256_Process_Block"
 230 #endif
 231                 }
 232                 /*
 233                  * do the final "feedforward" xor, update context chaining vars
 234                  */
 235                 ctx->X[0] = X0 ^ w[0];
 236                 ctx->X[1] = X1 ^ w[1];
 237                 ctx->X[2] = X2 ^ w[2];
 238                 ctx->X[3] = X3 ^ w[3];
 239
 240                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 241
                     /* only the first block of this call may carry the flag */
 242                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 243         } while (--blkCnt);
 244         ctx->h.T[0] = ts[0];
 245         ctx->h.T[1] = ts[1];
 246 }
 247
 248 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 249 size_t
 250 Skein_256_Process_Block_CodeSize(void)
 251 {
             /*
              * Returns the byte distance from the address of
              * Skein_256_Process_Block to the address of this function.
              * NOTE(review): presumably meant as a code-size estimate, which
              * assumes the two functions are laid out back to back in that
              * order -- confirm for the toolchain in use.
              */
 252         return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
 253             ((uint8_t *)Skein_256_Process_Block);
 254 }
 255
 256 uint_t
 257 Skein_256_Unroll_Cnt(void)
 258 {
             /*
              * Reports SKEIN_UNROLL_256, the value taken from the hundreds
              * digit of SKEIN_LOOP when Skein_256_Process_Block was compiled
              * (0 means the fully unrolled code path was selected).
              */
 259         return (SKEIN_UNROLL_256);
 260 }
 261 #endif
 262 #endif
 263
 264 /* Skein_512 */
 265 #if     !(SKEIN_USE_ASM & 512)
 266 void
 267 Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
 268     size_t blkCnt, size_t byteCntAdd)
 269 {
             /*
              * Run the Skein-512 block function over blkCnt consecutive blocks
              * of SKEIN_512_BLOCK_BYTES bytes each, starting at blkPtr.
              *
              *   ctx         chaining words ctx->X[0..7] are read and replaced
              *               for every block; tweak words ctx->h.T[0..1] are
              *               read once on entry and stored back on return
              *   blkPtr      input data, loaded as little-endian 64-bit words
              *   blkCnt      number of blocks; must not be 0 (the do/while
              *               loop below always executes at least once)
              *   byteCntAdd  added to tweak word T[0] before each block
              */
 270         enum {
 271                 WCNT = SKEIN_512_STATE_WORDS
 272         };
     /* RCNT: number of 8-round groups (one R512_8_rounds() each) per block */
 273 #undef  RCNT
 274 #define RCNT  (SKEIN_512_ROUNDS_TOTAL / 8)
 275
 276 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
 277 #define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
 278 #else
 279 #define SKEIN_UNROLL_512 (0)
 280 #endif
     /*
      * SKEIN_UNROLL_512 == 0 selects the fully unrolled code (no loop, no r);
      * otherwise it is the number of 8-round groups per loop iteration and
      * must divide RCNT evenly.
      */
 281
 282 #if     SKEIN_UNROLL_512
 283 #if     (RCNT % SKEIN_UNROLL_512)
 284 #error "Invalid SKEIN_UNROLL_512"       /* sanity check on unroll count */
 285 #endif
 286         size_t r;
 287         /* key schedule words : chaining vars + tweak + "rotation" */
             /*
              * The looping I512() stores one extra key word and one extra
              * tweak word past the ones it just used on every injection,
              * hence the additional RCNT * 2 slots.
              */
 288         uint64_t kw[WCNT + 4 + RCNT * 2];
 289 #else
 290         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
 291 #endif
 292         /* local copy of vars, for speed */
 293         uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
 294         uint64_t w[WCNT];               /* local copy of input block */
 295 #ifdef  SKEIN_DEBUG
 296         /* use for debugging (help compiler put Xn in registers) */
 297         const uint64_t *Xptr[8];
 298         Xptr[0] = &X0;
 299         Xptr[1] = &X1;
 300         Xptr[2] = &X2;
 301         Xptr[3] = &X3;
 302         Xptr[4] = &X4;
 303         Xptr[5] = &X5;
 304         Xptr[6] = &X6;
 305         Xptr[7] = &X7;
 306 #endif
             /*
              * NOTE(review): Xptr is declared only under SKEIN_DEBUG, yet the
              * Skein_Show_*() calls below name it unconditionally; presumably
              * those macros expand to nothing in non-debug builds -- confirm
              * in skein_impl.h.
              */
 307
 308         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* work on a local copy of the tweak; stored back after the loop */
 309         ts[0] = ctx->h.T[0];
 310         ts[1] = ctx->h.T[1];
 311         do {
 312                 /*
 313                  * this implementation only supports 2**64 input bytes
 314                  * (no carry out here)
 315                  */
 316                 ts[0] += byteCntAdd;    /* update processed length */
 317
 318                 /* precompute the key schedule for this block */
 319                 ks[0] = ctx->X[0];
 320                 ks[1] = ctx->X[1];
 321                 ks[2] = ctx->X[2];
 322                 ks[3] = ctx->X[3];
 323                 ks[4] = ctx->X[4];
 324                 ks[5] = ctx->X[5];
 325                 ks[6] = ctx->X[6];
 326                 ks[7] = ctx->X[7];
 327                 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 328                     ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 329
 330                 ts[2] = ts[0] ^ ts[1];
 331
 332                 /* get input block in little-endian format */
 333                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 334                 DebugSaveTweak(ctx);
 335                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 336
 337                 X0 = w[0] + ks[0];      /* do the first full key injection */
 338                 X1 = w[1] + ks[1];
 339                 X2 = w[2] + ks[2];
 340                 X3 = w[3] + ks[3];
 341                 X4 = w[4] + ks[4];
 342                 X5 = w[5] + ks[5] + ts[0];
 343                 X6 = w[6] + ks[6] + ts[1];
 344                 X7 = w[7] + ks[7];
 345
                     /* w[] already holds this block; step to the next one */
 346                 blkPtr += SKEIN_512_BLOCK_BYTES;
 347
 348                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 349                     Xptr);
 350                 /* run the rounds */
 351 #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 352         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
 353         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
 354         X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
 355         X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
 356
 357 #if     SKEIN_UNROLL_512 == 0
 358 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */  \
 359         Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 360         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 361
 362 #define I512(R)                                                         \
 363         X0 += ks[((R) + 1) % 9];        /* inject the key schedule value */\
 364         X1 += ks[((R) + 2) % 9];                                        \
 365         X2 += ks[((R) + 3) % 9];                                        \
 366         X3 += ks[((R) + 4) % 9];                                        \
 367         X4 += ks[((R) + 5) % 9];                                        \
 368         X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];                    \
 369         X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];                    \
 370         X7 += ks[((R) + 8) % 9] + (R) + 1;                              \
 371         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 372 #else                           /* looping version */
 373 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                 \
 374         Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 375         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 376
 377 #define I512(R)                                                         \
 378         X0 += ks[r + (R) + 0];  /* inject the key schedule value */     \
 379         X1 += ks[r + (R) + 1];                                          \
 380         X2 += ks[r + (R) + 2];                                          \
 381         X3 += ks[r + (R) + 3];                                          \
 382         X4 += ks[r + (R) + 4];                                          \
 383         X5 += ks[r + (R) + 5] + ts[r + (R) + 0];                        \
 384         X6 += ks[r + (R) + 6] + ts[r + (R) + 1];                        \
 385         X7 += ks[r + (R) + 7] + r + (R);                                \
 386         ks[r + (R)+8] = ks[r + (R) - 1];        /* rotate key schedule */\
 387         ts[r + (R)+2] = ts[r + (R) - 1];                                \
 388         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 389
 390                 /* loop through it */
                     /*
                      * r starts at 1 and advances by two per 8-round group
                      * (each group performs two key injections), so it stays
                      * odd and the loop body runs RCNT / SKEIN_UNROLL_512 times.
                      */
 391                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
 392 #endif                          /* end of looped code definitions */
 393                 {
 394 #define R512_8_rounds(R)        /* do 8 full rounds */                  \
 395         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);             \
 396         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);             \
 397         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);             \
 398         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);             \
 399         I512(2 * (R));                                                  \
 400         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);             \
 401         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);             \
 402         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);             \
 403         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);             \
 404         I512(2*(R) + 1);                /* and key injection */
 405
 406                         R512_8_rounds(0);
 407
     /*
      * R512_Unroll_R(NN) is true when 8-round group NN has to be emitted:
      * fully unrolled builds emit every group below the total group count,
      * looping builds emit the groups below the per-iteration unroll count.
      */
 408 #define R512_Unroll_R(NN) \
 409         ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
 410         (SKEIN_UNROLL_512 > (NN)))
 411
 412 #if     R512_Unroll_R(1)
 413                         R512_8_rounds(1);
 414 #endif
 415 #if     R512_Unroll_R(2)
 416                         R512_8_rounds(2);
 417 #endif
 418 #if     R512_Unroll_R(3)
 419                         R512_8_rounds(3);
 420 #endif
 421 #if     R512_Unroll_R(4)
 422                         R512_8_rounds(4);
 423 #endif
 424 #if     R512_Unroll_R(5)
 425                         R512_8_rounds(5);
 426 #endif
 427 #if     R512_Unroll_R(6)
 428                         R512_8_rounds(6);
 429 #endif
 430 #if     R512_Unroll_R(7)
 431                         R512_8_rounds(7);
 432 #endif
 433 #if     R512_Unroll_R(8)
 434                         R512_8_rounds(8);
 435 #endif
 436 #if     R512_Unroll_R(9)
 437                         R512_8_rounds(9);
 438 #endif
 439 #if     R512_Unroll_R(10)
 440                         R512_8_rounds(10);
 441 #endif
 442 #if     R512_Unroll_R(11)
 443                         R512_8_rounds(11);
 444 #endif
 445 #if     R512_Unroll_R(12)
 446                         R512_8_rounds(12);
 447 #endif
 448 #if     R512_Unroll_R(13)
 449                         R512_8_rounds(13);
 450 #endif
 451 #if     R512_Unroll_R(14)
 452                         R512_8_rounds(14);
 453 #endif
 454 #if     (SKEIN_UNROLL_512 > 14)
 455 #error "need more unrolling in Skein_512_Process_Block"
 456 #endif
 457                 }
 458
 459                 /*
 460                  * do the final "feedforward" xor, update context chaining vars
 461                  */
 462                 ctx->X[0] = X0 ^ w[0];
 463                 ctx->X[1] = X1 ^ w[1];
 464                 ctx->X[2] = X2 ^ w[2];
 465                 ctx->X[3] = X3 ^ w[3];
 466                 ctx->X[4] = X4 ^ w[4];
 467                 ctx->X[5] = X5 ^ w[5];
 468                 ctx->X[6] = X6 ^ w[6];
 469                 ctx->X[7] = X7 ^ w[7];
 470                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 471
                     /* only the first block of this call may carry the flag */
 472                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 473         } while (--blkCnt);
 474         ctx->h.T[0] = ts[0];
 475         ctx->h.T[1] = ts[1];
 476 }
 477
 478 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 479 size_t
 480 Skein_512_Process_Block_CodeSize(void)
 481 {
             /*
              * Returns the byte distance from the address of
              * Skein_512_Process_Block to the address of this function.
              * NOTE(review): presumably meant as a code-size estimate, which
              * assumes the two functions are laid out back to back in that
              * order -- confirm for the toolchain in use.
              */
 482         return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
 483             ((uint8_t *)Skein_512_Process_Block);
 484 }
 485
 486 uint_t
 487 Skein_512_Unroll_Cnt(void)
 488 {
             /*
              * Reports SKEIN_UNROLL_512, the value taken from the tens digit
              * of SKEIN_LOOP when Skein_512_Process_Block was compiled
              * (0 means the fully unrolled code path was selected).
              */
 489         return (SKEIN_UNROLL_512);
 490 }
 491 #endif
 492 #endif
 493
 494 /*  Skein1024 */
 495 #if     !(SKEIN_USE_ASM & 1024)
 496 void
 497 Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
 498     size_t blkCnt, size_t byteCntAdd)
 499 {
 500         /* do it in C, always looping (unrolled is bigger AND slower!) */
             /*
              * Run the Skein-1024 block function over blkCnt consecutive
              * blocks of SKEIN1024_BLOCK_BYTES bytes each, starting at blkPtr.
              *
              *   ctx         chaining words ctx->X[0..15] are read and replaced
              *               for every block; tweak words ctx->h.T[0..1] are
              *               read once on entry and stored back on return
              *   blkPtr      input data, loaded as little-endian 64-bit words
              *   blkCnt      number of blocks; must not be 0 (the do/while
              *               loop below always executes at least once)
              *   byteCntAdd  added to tweak word T[0] before each block
              *
              * NOTE(review): "always looping" holds for the SKEIN_LOOP values
              * this file defines (ones digit 1); a SKEIN_LOOP supplied from
              * outside with a ones digit of 0 would still select the fully
              * unrolled branch below.
              */
 501         enum {
 502                 WCNT = SKEIN1024_STATE_WORDS
 503         };
     /* RCNT: number of 8-round groups (one R1024_8_rounds() each) per block */
 504 #undef  RCNT
 505 #define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
 506
 507 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
 508 #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
 509 #else
 510 #define SKEIN_UNROLL_1024 (0)
 511 #endif
     /*
      * SKEIN_UNROLL_1024 == 0 selects the fully unrolled code (no loop, no r);
      * otherwise it is the number of 8-round groups per loop iteration and
      * must divide RCNT evenly.
      */
 512
 513 #if     (SKEIN_UNROLL_1024 != 0)
 514 #if     (RCNT % SKEIN_UNROLL_1024)
 515 #error "Invalid SKEIN_UNROLL_1024"      /* sanity check on unroll count */
 516 #endif
 517         size_t r;
 518         /* key schedule words : chaining vars + tweak + "rotation" */
             /*
              * The looping I1024() stores one extra key word and one extra
              * tweak word past the ones it just used on every injection,
              * hence the additional RCNT * 2 slots.
              */
 519         uint64_t kw[WCNT + 4 + RCNT * 2];
 520 #else
 521         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
 522 #endif
 523
 524         /* local copy of vars, for speed */
 525         uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
 526             X12, X13, X14, X15;
 527         uint64_t w[WCNT];               /* local copy of input block */
 528 #ifdef  SKEIN_DEBUG
 529         /* use for debugging (help compiler put Xn in registers) */
 530         const uint64_t *Xptr[16];
 531         Xptr[0] = &X00;
 532         Xptr[1] = &X01;
 533         Xptr[2] = &X02;
 534         Xptr[3] = &X03;
 535         Xptr[4] = &X04;
 536         Xptr[5] = &X05;
 537         Xptr[6] = &X06;
 538         Xptr[7] = &X07;
 539         Xptr[8] = &X08;
 540         Xptr[9] = &X09;
 541         Xptr[10] = &X10;
 542         Xptr[11] = &X11;
 543         Xptr[12] = &X12;
 544         Xptr[13] = &X13;
 545         Xptr[14] = &X14;
 546         Xptr[15] = &X15;
 547 #endif
             /*
              * NOTE(review): Xptr is declared only under SKEIN_DEBUG, yet the
              * Skein_Show_*() calls below name it unconditionally; presumably
              * those macros expand to nothing in non-debug builds -- confirm
              * in skein_impl.h.
              */
 548
 549         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* work on a local copy of the tweak; stored back after the loop */
 550         ts[0] = ctx->h.T[0];
 551         ts[1] = ctx->h.T[1];
 552         do {
 553                 /*
 554                  * this implementation only supports 2**64 input bytes
 555                  * (no carry out here)
 556                  */
 557                 ts[0] += byteCntAdd;    /* update processed length */
 558
 559                 /* precompute the key schedule for this block */
 560                 ks[0] = ctx->X[0];
 561                 ks[1] = ctx->X[1];
 562                 ks[2] = ctx->X[2];
 563                 ks[3] = ctx->X[3];
 564                 ks[4] = ctx->X[4];
 565                 ks[5] = ctx->X[5];
 566                 ks[6] = ctx->X[6];
 567                 ks[7] = ctx->X[7];
 568                 ks[8] = ctx->X[8];
 569                 ks[9] = ctx->X[9];
 570                 ks[10] = ctx->X[10];
 571                 ks[11] = ctx->X[11];
 572                 ks[12] = ctx->X[12];
 573                 ks[13] = ctx->X[13];
 574                 ks[14] = ctx->X[14];
 575                 ks[15] = ctx->X[15];
 576                 ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 577                     ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
 578                     ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
 579                     ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
 580
 581                 ts[2] = ts[0] ^ ts[1];
 582
 583                 /* get input block in little-endian format */
 584                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 585                 DebugSaveTweak(ctx);
 586                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 587
 588                 X00 = w[0] + ks[0];     /* do the first full key injection */
 589                 X01 = w[1] + ks[1];
 590                 X02 = w[2] + ks[2];
 591                 X03 = w[3] + ks[3];
 592                 X04 = w[4] + ks[4];
 593                 X05 = w[5] + ks[5];
 594                 X06 = w[6] + ks[6];
 595                 X07 = w[7] + ks[7];
 596                 X08 = w[8] + ks[8];
 597                 X09 = w[9] + ks[9];
 598                 X10 = w[10] + ks[10];
 599                 X11 = w[11] + ks[11];
 600                 X12 = w[12] + ks[12];
 601                 X13 = w[13] + ks[13] + ts[0];
 602                 X14 = w[14] + ks[14] + ts[1];
 603                 X15 = w[15] + ks[15];
 604
 605                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 606                     Xptr);
 607
 608 #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 609         pD, pE, pF, ROT, rNum)                                          \
 610         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
 611         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
 612         X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
 613         X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
 614         X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
 615         X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
 616         X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
 617         X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
 618
 619 #if     SKEIN_UNROLL_1024 == 0
 620 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,   \
 621         pE, pF, ROT, rn)                                                \
 622         Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 623         pD, pE, pF, ROT, rn)                                            \
 624         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
 625
 626 #define I1024(R)                                                        \
 627         X00 += ks[((R) + 1) % 17];      /* inject the key schedule value */\
 628         X01 += ks[((R) + 2) % 17];                                      \
 629         X02 += ks[((R) + 3) % 17];                                      \
 630         X03 += ks[((R) + 4) % 17];                                      \
 631         X04 += ks[((R) + 5) % 17];                                      \
 632         X05 += ks[((R) + 6) % 17];                                      \
 633         X06 += ks[((R) + 7) % 17];                                      \
 634         X07 += ks[((R) + 8) % 17];                                      \
 635         X08 += ks[((R) + 9) % 17];                                      \
 636         X09 += ks[((R) + 10) % 17];                                     \
 637         X10 += ks[((R) + 11) % 17];                                     \
 638         X11 += ks[((R) + 12) % 17];                                     \
 639         X12 += ks[((R) + 13) % 17];                                     \
 640         X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];                 \
 641         X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];                 \
 642         X15 += ks[((R) + 16) % 17] + (R) +1;                            \
 643         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 644 #else                           /* looping version */
 645 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,   \
 646         pE, pF, ROT, rn)                                                \
 647         Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 648         pD, pE, pF, ROT, rn)                                            \
 649         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
 650
 651 #define I1024(R)                                                        \
 652         X00 += ks[r + (R) + 0]; /* inject the key schedule value */     \
 653         X01 += ks[r + (R) + 1];                                         \
 654         X02 += ks[r + (R) + 2];                                         \
 655         X03 += ks[r + (R) + 3];                                         \
 656         X04 += ks[r + (R) + 4];                                         \
 657         X05 += ks[r + (R) + 5];                                         \
 658         X06 += ks[r + (R) + 6];                                         \
 659         X07 += ks[r + (R) + 7];                                         \
 660         X08 += ks[r + (R) + 8];                                         \
 661         X09 += ks[r + (R) + 9];                                         \
 662         X10 += ks[r + (R) + 10];                                        \
 663         X11 += ks[r + (R) + 11];                                        \
 664         X12 += ks[r + (R) + 12];                                        \
 665         X13 += ks[r + (R) + 13] + ts[r + (R) + 0];                      \
 666         X14 += ks[r + (R) + 14] + ts[r + (R) + 1];                      \
 667         X15 += ks[r + (R) + 15] +  r + (R);                             \
 668         ks[r + (R) + 16] = ks[r + (R) - 1];     /* rotate key schedule */\
 669         ts[r + (R) + 2] = ts[r + (R) - 1];                              \
 670         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 671
 672                 /* loop through it */
                     /*
                      * r starts at 1 and advances by two per 8-round group
                      * (each group performs two key injections), so it stays
                      * odd; "<=" therefore stops at the same point as the
                      * "<" used by the 256 and 512 variants.
                      */
 673                 for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
 674 #endif
 675                 {
 676 #define R1024_8_rounds(R)       /* do 8 full rounds */                  \
 677         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,   \
 678             14, 15, R1024_0, 8 * (R) + 1);                              \
 679         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,   \
 680             08, 01, R1024_1, 8 * (R) + 2);                              \
 681         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,   \
 682             10, 09, R1024_2, 8 * (R) + 3);                              \
 683         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,   \
 684             12, 07, R1024_3, 8 * (R) + 4);                              \
 685         I1024(2 * (R));                                                 \
 686         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,   \
 687             14, 15, R1024_4, 8 * (R) + 5);                              \
 688         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,   \
 689             08, 01, R1024_5, 8 * (R) + 6);                              \
 690         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,   \
 691             10, 09, R1024_6, 8 * (R) + 7);                              \
 692         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,   \
 693             12, 07, R1024_7, 8 * (R) + 8);                              \
 694         I1024(2 * (R) + 1);
 695
 696                         R1024_8_rounds(0);
 697
     /*
      * R1024_Unroll_R(NN) is true when 8-round group NN has to be emitted:
      * fully unrolled builds emit every group below the total group count,
      * looping builds emit the groups below the per-iteration unroll count.
      */
 698 #define R1024_Unroll_R(NN)                                              \
 699         ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \
 700         (SKEIN_UNROLL_1024 > (NN)))
 701
 702 #if     R1024_Unroll_R(1)
 703                         R1024_8_rounds(1);
 704 #endif
 705 #if     R1024_Unroll_R(2)
 706                         R1024_8_rounds(2);
 707 #endif
 708 #if     R1024_Unroll_R(3)
 709                         R1024_8_rounds(3);
 710 #endif
 711 #if     R1024_Unroll_R(4)
 712                         R1024_8_rounds(4);
 713 #endif
 714 #if     R1024_Unroll_R(5)
 715                         R1024_8_rounds(5);
 716 #endif
 717 #if     R1024_Unroll_R(6)
 718                         R1024_8_rounds(6);
 719 #endif
 720 #if     R1024_Unroll_R(7)
 721                         R1024_8_rounds(7);
 722 #endif
 723 #if     R1024_Unroll_R(8)
 724                         R1024_8_rounds(8);
 725 #endif
 726 #if     R1024_Unroll_R(9)
 727                         R1024_8_rounds(9);
 728 #endif
 729 #if     R1024_Unroll_R(10)
 730                         R1024_8_rounds(10);
 731 #endif
 732 #if     R1024_Unroll_R(11)
 733                         R1024_8_rounds(11);
 734 #endif
 735 #if     R1024_Unroll_R(12)
 736                         R1024_8_rounds(12);
 737 #endif
 738 #if     R1024_Unroll_R(13)
 739                         R1024_8_rounds(13);
 740 #endif
 741 #if     R1024_Unroll_R(14)
 742                         R1024_8_rounds(14);
 743 #endif
 744 #if     (SKEIN_UNROLL_1024 > 14)
 745 #error  "need more unrolling in Skein_1024_Process_Block"
 746 #endif
 747                 }
 748                 /*
 749                  * do the final "feedforward" xor, update context chaining vars
 750                  */
 751
 752                 ctx->X[0] = X00 ^ w[0];
 753                 ctx->X[1] = X01 ^ w[1];
 754                 ctx->X[2] = X02 ^ w[2];
 755                 ctx->X[3] = X03 ^ w[3];
 756                 ctx->X[4] = X04 ^ w[4];
 757                 ctx->X[5] = X05 ^ w[5];
 758                 ctx->X[6] = X06 ^ w[6];
 759                 ctx->X[7] = X07 ^ w[7];
 760                 ctx->X[8] = X08 ^ w[8];
 761                 ctx->X[9] = X09 ^ w[9];
 762                 ctx->X[10] = X10 ^ w[10];
 763                 ctx->X[11] = X11 ^ w[11];
 764                 ctx->X[12] = X12 ^ w[12];
 765                 ctx->X[13] = X13 ^ w[13];
 766                 ctx->X[14] = X14 ^ w[14];
 767                 ctx->X[15] = X15 ^ w[15];
 768
 769                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 770
                     /* only the first block of this call may carry the flag */
 771                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
                     /* unlike the 256/512 variants, the pointer moves last */
 772                 blkPtr += SKEIN1024_BLOCK_BYTES;
 773         } while (--blkCnt);
 774         ctx->h.T[0] = ts[0];
 775         ctx->h.T[1] = ts[1];
 776 }
 777
 778 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
 779 size_t
 780 Skein1024_Process_Block_CodeSize(void)
 781 {
             /*
              * Returns the byte distance from the address of
              * Skein1024_Process_Block to the address of this function.
              * NOTE(review): presumably meant as a code-size estimate, which
              * assumes the two functions are laid out back to back in that
              * order -- confirm for the toolchain in use.
              */
 782         return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
 783             ((uint8_t *)Skein1024_Process_Block);
 784 }
 785
 786 uint_t
 787 Skein1024_Unroll_Cnt(void)
 788 {
             /*
              * Reports SKEIN_UNROLL_1024, the value taken from the ones digit
              * of SKEIN_LOOP when Skein1024_Process_Block was compiled
              * (0 means the fully unrolled code path was selected).
              */
 789         return (SKEIN_UNROLL_1024);
 790 }
 791 #endif
 792 #endif