usr/src/common/crypto/arcfour/sun4v/arcfour_crypt.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include "../arcfour.h"

/* Initialize the key stream 'key' using the key value */
void
arcfour_key_init(ARCFour_key *key, uchar_t *keyval, int keyvallen)
{
	uchar_t ext_keyval[256];
	uchar_t tmp;
	int i, j;

	for (i = j = 0; i < 256; i++, j++) {
		if (j == keyvallen)
			j = 0;
		ext_keyval[i] = keyval[j];
	}
	for (i = 0; i < 256; i++)
		key->arr[i] = (uchar_t)i;

	j = 0;
	for (i = 0; i < 256; i++) {
		j = (j + key->arr[i] + ext_keyval[i]) % 256;
		tmp = key->arr[i];
		key->arr[i] = key->arr[j];
		key->arr[j] = tmp;
	}
	key->i = 0;
	key->j = 0;
}
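
/*
 * The second pair of loops above is the classic RC4 key-scheduling
 * algorithm (KSA). A minimal standalone sketch of the same schedule,
 * assuming nothing but a 256-byte state array (the ARCFOUR_KSA_SKETCH
 * guard is hypothetical and never defined in the build):
 */
#ifdef ARCFOUR_KSA_SKETCH
static void
ksa_sketch(unsigned char s[256], const unsigned char *k, int klen)
{
	int i, j = 0;
	unsigned char t;

	/* identity permutation */
	for (i = 0; i < 256; i++)
		s[i] = (unsigned char)i;
	/* key-driven swaps; k[i % klen] plays the role of ext_keyval[i] */
	for (i = 0; i < 256; i++) {
		j = (j + s[i] + k[i % klen]) & 0xff;
		t = s[i];
		s[i] = s[j];
		s[j] = t;
	}
}
#endif	/* ARCFOUR_KSA_SKETCH */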

/*
 * Encipher 'in' using 'key'.
 * in and out can point to the same location
 */
void
arcfour_crypt(ARCFour_key *key, uchar_t *in, uchar_t *out, size_t len)
{
	size_t ii;
	unsigned long long in0, merge = 0, merge0 = 0, merge1, mask = 0;
	uchar_t i, j, *base, jj, *base1, tmp;
	unsigned int tmp0, tmp1, i_accum, shift = 0, i1;

	int index;

	base = key->arr;

	index = (((uintptr_t)in) & 0x7);

	/* Get the 'in' on an 8-byte alignment */
	if (index > 0) {
		i = key->i;
		j = key->j;

		for (index = 8 - index; (index-- > 0) && len > 0;
		    len--, in++, out++) {

			i = i + 1;
			j = j + key->arr[i];
			tmp = key->arr[i];
			key->arr[i] = key->arr[j];
			key->arr[j] = tmp;
			tmp = key->arr[i] + key->arr[j];
			*out = *in ^ key->arr[tmp];
		}
		key->i = i;
		key->j = j;
	}

	if (len == 0)
		return;
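
	/*
	 * Example: for in == 0x1005, index above was 5, so the prologue
	 * consumed 8 - 5 = 3 bytes one at a time and 'in' now sits on an
	 * 8-byte boundary for the block-oriented path below.
	 */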

	/* See if we're fortunate and 'out' got aligned as well */

	/*
	 * Niagara optimized version for
	 * the cases where the input and output buffers are aligned on
	 * a multiple of 8-byte boundary.
	 */
#ifdef	sun4v
	if ((((uintptr_t)out) & 7) != 0) {
#endif	/* sun4v */
		i = key->i;
		j = key->j;
		for (ii = 0; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];
			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;
			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
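
		/*
		 * The loop above is the standard byte-at-a-time RC4 PRGA:
		 *   i = (i + 1) mod 256; j = (j + S[i]) mod 256;
		 *   swap(S[i], S[j]); out = in ^ S[(S[i] + S[j]) mod 256]
		 * (the uchar_t arithmetic on i and j supplies the mod 256).
		 */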
#ifdef	sun4v
	} else {
		i = key->i;
		j = key->j;

		/*
		 * Want to align base[i] on a 2B boundary -- allows updates
		 * via [i] to be performed in 2B chunks (reducing # of stores).
		 * Requires appropriate alias detection.
		 */
		if (((i+1) % 2) != 0) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];

			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;

			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;

			merge0 = (unsigned long long)(base[tmp0]) << 56;
			shift = 8; mask = 0xff;
		}
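
		/*
		 * Bookkeeping for the odd pre-step above: merge0 holds that
		 * single keystream byte in its top 8 bits, while shift/mask
		 * make each 8-byte 'merge' below be emitted one byte late;
		 * the final leftover byte is flushed after the main loop.
		 */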

		/*
		 * Note - in and out may now be misaligned -
		 * so updating [out] in 8B chunks needs to handle this
		 * possibility. Also could have a 1B overrun.
		 * Need to drop out of loop early as a result.
		 */
		for (ii = 0, i1 = i; ii < ((len-1) & (~7));
		    ii += 8, i1 = i1&0xff) {

			/*
			 * If i1 is less than 248, it won't wrap around
			 * (i % 256), so don't need to bother with masking i
			 * after each increment
			 */
			if (i1 < 248) {

				/* BYTE 0 */
				i1 = (i1 + 1);

				/*
				 * Creating this base pointer reduces
				 * subsequent arithmetic ops required
				 * to load [i]
				 *
				 * N.B. don't need to check if [j] aliases.
				 * [i] and [j] end up with the same values
				 * anyway.
				 */
				base1 = &base[i1];

				tmp0 = base1[0];
				j = j + tmp0;

				tmp1 = base[j];
				/*
				 * Don't store [i] yet
				 */
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				/*
				 * Check [tmp0] doesn't alias with [i]
				 */

				/*
				 * Updating [out] in 8B chunks
				 */
				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0]) <<
					    56;
				}

				/* BYTE 1 */
				tmp0 = base1[1];

				j = j + tmp0;

				/*
				 * [j] can now alias with [i] and [i-1]
				 * If alias abort speculation
				 */
				if ((i1 ^ j) < 2) {
					base1[0] = (uchar_t)i_accum;

					tmp1 = base[j];

					base1[1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |= (unsigned long long)
					    (base[tmp0]) << 48;
				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/*
					 * Speculation succeeded! Update [i]
					 * in 2B chunk
					 */
					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				}

				/*
				 * Too expensive to perform [i] speculation for
				 * every byte. Just need to reduce frequency
				 * of stores until store buffer full stalls
				 * are not the bottleneck.
				 */

				/* BYTE 2 */
				tmp0 = base1[2];
				j = j + tmp0;
				tmp1 = base[j];
				base1[2] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp1 += tmp0;
				tmp1 = tmp1 & 0xff;
				merge |= (unsigned long long)(base[tmp1]) << 40;

				/* BYTE 3 */
				tmp0 = base1[3];
				j = j + tmp0;
				tmp1 = base[j];
				base1[3] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				tmp0 = base1[4];
				j = j + tmp0;
				tmp1 = base[j];
				base1[4] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base1[5];
				j = j + tmp0;
				tmp1 = base[j];
				base1[5] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				i1 = (i1+6);
				tmp0 = base1[6];
				j = j + tmp0;
				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    8;
				}

				/* BYTE 7 */
				tmp0 = base1[7];

				/*
				 * Perform [i] speculation again. Identical
				 * to that performed for BYTE0 and BYTE1.
				 */
				j = j + tmp0;
				if ((i1 ^ j) < 2) {
					base1[6] = (uchar_t)i_accum;
					tmp1 = base[j];

					base1[7] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]);

				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[i1]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]);
				}
				i1++;
			} else {
				/*
				 * i is too close to wrap-around to allow
				 * masking to be disregarded
				 */

				/*
				 * Same old speculation for BYTE 0 and BYTE 1
				 */

				/* BYTE 0 */
				i1 = (i1 + 1) & 0xff;
				jj = (uchar_t)i1;

				tmp0 = base[i1];
				j = j + tmp0;

				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge =
					    (unsigned long long)(i_accum) << 56;
				} else {
					merge =
					    (unsigned long long)(base[tmp0]) <<
					    56;
				}

				/* BYTE 1 */
				tmp0 = base[i1+1];

				j = j + tmp0;

				if ((jj ^ j) < 2) {
					base[jj] = (uchar_t)i_accum;

					tmp1 = base[j];

					base[i1+1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    48;
				}

				/* BYTE 2 */
				/*
				 * Since i must be even on loop entry (to
				 * satisfy alignment), it can only wrap around
				 * on the even bytes, so the mask only needs
				 * to be applied every 2nd byte
				 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 40;

				/* BYTE 3 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 32;

				/* BYTE 4 */
				i1 = (i1 + 2) & 0xff;
				tmp0 = base[i1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 24;

				/* BYTE 5 */
				tmp0 = base[i1+1];
				j = j + tmp0;
				tmp1 = base[j];
				base[i1+1] = (uchar_t)tmp1;
				base[j] = (uchar_t)tmp0;
				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;
				merge |= (unsigned long long)(base[tmp0]) << 16;

				/* BYTE 6 */
				i1 = (i1 + 2) & 0xff;
				jj = (uchar_t)i1;
				tmp0 = base[i1];

				j = j + tmp0;

				tmp1 = base[j];
				i_accum = tmp1;
				base[j] = (uchar_t)tmp0;

				tmp0 += tmp1;
				tmp0 = tmp0 & 0xff;

				if (i1 == tmp0) {
					merge |=
					    (unsigned long long)(i_accum) << 8;
				} else {
					merge |=
					    (unsigned long long)(base[tmp0]) <<
					    8;
				}

				/* BYTE 7 */
				i1++;
				tmp0 = base[i1];

				j = j + tmp0;
				if ((jj ^ j) < 2) {
					base[jj] = (uchar_t)i_accum;
					tmp1 = base[j];

					base[i1] = (uchar_t)tmp1;
					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					merge |=
					    (unsigned long long)(base[tmp0]);

				} else {
					tmp1 = base[j];

					i_accum = i_accum << 8;
					i_accum |= tmp1;

					base[j] = (uchar_t)tmp0;

					tmp0 += tmp1;
					tmp0 = tmp0 & 0xff;

					/* LINTED E_BAD_PTR_CAST_ALIGN */
					*((unsigned short *) &base[jj]) =
					    i_accum;

					merge |=
					    (unsigned long long)(base[tmp0]);
				}
			}

			/*
			 * Perform update to [out]
			 * Remember could be alignment issues
			 */
			/* LINTED E_BAD_PTR_CAST_ALIGN */
			in0 = *((unsigned long long *) (&in[ii]));

			merge1 = merge0 | (merge >> shift);

			merge0 = (merge & mask) << 56;

			in0 = in0 ^ merge1;

			/* LINTED E_BAD_PTR_CAST_ALIGN */
			*((unsigned long long *) (&out[ii])) = in0;
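
			/*
			 * With shift == 8, merge1 starts with the leftover
			 * byte carried in merge0, the top 7 bytes of 'merge'
			 * fill the rest, and the low byte of 'merge' is saved
			 * in merge0 for the next iteration (or the overrun
			 * flush below). With shift == 0 the mask is 0 and
			 * 'merge' passes through unchanged.
			 */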
		}

		i = (uchar_t)i1;

		/*
		 * Handle any overrun
		 */
		if (shift) {
			out[ii] = in[ii] ^ (merge0 >> 56);
			ii++;
		}

		/*
		 * Handle final few bytes
		 */
		for (; ii < len; ii++) {
			i = i + 1;
			tmp0 = base[i];
			j = j + tmp0;
			tmp1 = base[j];

			base[i] = (uchar_t)tmp1;
			base[j] = (uchar_t)tmp0;

			tmp0 += tmp1;
			tmp0 = tmp0 & 0xff;
			out[ii] = in[ii] ^ base[tmp0];
		}
		key->i = i;
		key->j = j;
	}
#endif	/* sun4v */
}
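
/*
 * A minimal known-answer sketch for the two routines above, assuming the
 * ARCFour_key layout from ../arcfour.h. The ARCFOUR_KAT_EXAMPLE guard is
 * hypothetical and not defined anywhere in the build; RC4 with the key
 * "Key" enciphers "Plaintext" to bb f3 16 e8 d9 40 af 0a d3.
 */
#ifdef ARCFOUR_KAT_EXAMPLE
#include <stdio.h>
#include <string.h>

int
main(void)
{
	ARCFour_key k;
	uchar_t kv[3] = { 'K', 'e', 'y' };
	uchar_t pt[9];
	uchar_t ct[9];
	int n;

	(void) memcpy(pt, "Plaintext", sizeof (pt));
	arcfour_key_init(&k, kv, sizeof (kv));
	arcfour_crypt(&k, pt, ct, sizeof (ct));

	for (n = 0; n < 9; n++)
		(void) printf("%02x ", ct[n]);
	(void) printf("\n");	/* expect: bb f3 16 e8 d9 40 af 0a d3 */
	return (0);
}
#endif	/* ARCFOUR_KAT_EXAMPLE */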