/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
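/*
 * <wmmintrin.h> supplies the AES-NI and PCLMULQDQ intrinsics;
 * <emmintrin.h> and <smmintrin.h> supply the SSE2/SSE4.1 helpers used below.
 *
 * m128icmp() compares two 128-bit values for equality: _mm_movemask_epi8()
 * returns 0xffff only when every byte of the lane-wise comparison matched.
 */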
static int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}
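/*
 * _mm_insert_epi64() is only available as a compiler intrinsic on 64-bit
 * targets, so a replacement built from two 32-bit inserts is provided here,
 * presumably for the i386 build; ndx selects the low (0) or high (1)
 * 64-bit half of the vector.
 */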
#ifdef __i386__
static __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif
/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
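/*
 * gfmul() multiplies two 128-bit field elements with four PCLMULQDQ
 * carry-less multiplies, then reduces the 256-bit product modulo the GCM
 * polynomial x^128 + x^7 + x^2 + x + 1.  The shift-left-by-one step
 * accounts for GCM's bit-reflected representation of field elements.
 */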
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	/* 128x128 -> 256 bit carry-less multiply (four PCLMULQDQs) */
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	/* fold the two middle products into the low/high halves */
	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	/* shift the 256-bit result left by one (bit-reflected convention) */
	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	/* reduce modulo x^128 + x^7 + x^2 + x + 1 */
	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
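/*
 * reduce4() folds four blocks into the GHASH state at once: it computes
 * X1*H1 ^ X2*H2 ^ X3*H3 ^ X4*H4, where H1..H4 are increasing powers of the
 * hash key.  The middle Karatsuba terms come from the
 * _mm_shuffle_epi32(..., 78) half-swaps, and a single reduction modulo the
 * GCM polynomial is done for the combined 256-bit result.
 */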
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	/* low halves of the four products */
	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	/* high halves of the four products */
	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	/* Karatsuba middle terms: (Hi.lo ^ Hi.hi) * (Xi.lo ^ Xi.hi) */
	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	/* scatter the combined middle term across the low/high halves */
	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	/* single reduction of the aggregated 256-bit result (as in gfmul) */
	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Reduction Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), i.e. at most
 * 2^36-32 bytes (2^32-2 16-byte blocks).
 */
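/*
 * Parameter conventions, as used by the code below: in and out point to
 * nbytes of plaintext/ciphertext, addt is abytes of additional
 * authenticated data, ivec is an ibytes-long IV (ibytes == 12 takes the
 * fast path), tag is the 16-byte authentication tag (written by encrypt,
 * checked by decrypt), and key points to the expanded AES key schedule of
 * nr+1 round keys, with nr = 10/12/14 rounds.
 */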
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
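	/*
	 * Derive the hash key H = E(K, 0^128) and the pre-counter block Y0.
	 * A 96-bit IV uses the fast path Y0 = IV || 0^31 || 1; any other IV
	 * length is run through GHASH together with its length.
	 */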
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* GHASH the full 16-byte blocks of the IV */
		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		/* then any partial block, zero padded */
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		/* and finally the 64-bit IV length (in bits) */
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}
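	/*
	 * Precompute H^2, H^3 and H^4 so that reduce4() can fold four
	 * blocks per reduction.
	 */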
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* hash the additional authenticated data, four blocks at a time */
	for (i=0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	/* any remaining whole blocks of AAD, one at a time */
	for (i=i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* and a zero-padded partial AAD block, if any */
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
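	/*
	 * Build eight consecutive counter blocks starting at Y0+1.  Each
	 * 64-bit half is byte swapped into host order so _mm_add_epi64()
	 * can do the increments; the blocks are swapped back with
	 * BSWAP_EPI64 right before they are encrypted.
	 */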
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);
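	/*
	 * Main loop: encrypt eight counter blocks per iteration, XOR them
	 * with the plaintext, store the ciphertext, and fold the eight
	 * ciphertext blocks into the GHASH state with two reduce4() calls.
	 */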
	for (i=0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		/* run the eight AES pipelines in lockstep */
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* GHASH the eight ciphertext blocks just produced */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	/* handle remaining whole blocks one at a time */
	for (k=i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* if an incomplete block remains, encrypt and hash it zero padded */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
		/* zero the keystream tail so only ciphertext bytes are hashed */
		for ((void)j; j < 16; j++)
			((unsigned char *)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
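	/*
	 * Finish GHASH with the lengths block (bit lengths of the
	 * ciphertext and the AAD), then tag = GHASH ^ E(K, Y0).
	 */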
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i *)tag, T);
}
/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);
	/* hash the additional authenticated data, four blocks at a time */
	for (i=0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* This is where we validate the cipher text before decrypting it */
	for (i=0; i < nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i < nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	if (!m128icmp(T, _mm_loadu_si128((const __m128i *)tag)))
		return 0; /* in case the authentication failed */
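	/*
	 * The tag verified over the AAD and ciphertext, so it is now safe
	 * to generate the plaintext; nothing has been written to out yet.
	 */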
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);
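	/*
	 * The CTR pass below mirrors the one in AES_GCM_encrypt(); the
	 * GHASH work was already done over the ciphertext above, so the
	 * loop only has to generate keystream and XOR it with the input.
	 */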
	for (i=0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* results of these byte swaps are not used further here */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
	}
	/* if an incomplete block remains, decrypt just its nbytes%16 bytes */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
	}

	return 1; /* when successful, returns 1 */
}