unleashed/tickless.git: lib/libcrypto/modes/gcm128.c
1 /* $OpenBSD: gcm128.c,v 1.20 2017/09/03 13:07:34 inoguchi Exp $ */
2 /* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
51 #define OPENSSL_FIPSAPI
53 #include <openssl/crypto.h>
54 #include "modes_lcl.h"
55 #include <string.h>
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 # define NDEBUG
60 # endif
61 #endif
63 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef GETU32
66 #define GETU32(p) BSWAP4(*(const u32 *)(p))
67 #undef PUTU32
68 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
69 #endif
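/*
 * For reference, the generic GETU32/PUTU32 fall-back that the BSWAP4-based
 * redefinition above replaces amounts to a big-endian 32-bit load/store
 * assembled byte by byte.  A minimal sketch of that semantics, assuming
 * <stdint.h> (kept out of the build):
 */
#if 0
static uint32_t
getu32_ref(const unsigned char *p)
{
	/* most significant byte first, as GHASH/CTR expect */
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	    ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static void
putu32_ref(unsigned char *p, uint32_t v)
{
	p[0] = (unsigned char)(v >> 24);
	p[1] = (unsigned char)(v >> 16);
	p[2] = (unsigned char)(v >> 8);
	p[3] = (unsigned char)v;
}
#endif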
71 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V) \
73 do { \
74 if (sizeof(size_t)==8) { \
75 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76 V.lo = (V.hi<<63)|(V.lo>>1); \
77 V.hi = (V.hi>>1 )^T; \
78 } else { \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
82 } \
83 } while(0)
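/*
 * REDUCE1BIT above multiplies by x (one bit position) in GF(2^128) as used
 * by GHASH: in GCM's reflected bit order that is a one-bit right shift of
 * the 128-bit value, and if the bit shifted out was set, the reduction
 * polynomial x^128 = x^7 + x^2 + x + 1 is folded back in as the constant
 * 0xe1 in the most significant byte.  A minimal standalone sketch of the
 * same step, assuming <stdint.h> (kept out of the build):
 */
#if 0
typedef struct { uint64_t hi, lo; } gf128_sketch;

static void
gf128_reduce1bit_sketch(gf128_sketch *v)
{
	uint64_t carry = v->lo & 1;		/* bit that falls off the end */

	v->lo = (v->hi << 63) | (v->lo >> 1);	/* 128-bit right shift by 1 */
	v->hi >>= 1;
	if (carry)
		v->hi ^= 0xe100000000000000ULL;	/* fold in reduction poly */
}
#endif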
86 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
87 * should never be set to 8; 8 is effectively reserved for testing purposes.
88 * TABLE_BITS>1 selects the lookup-table-driven implementations referred
89 * to as "Shoup's" in the GCM specification; in other words, OpenSSL does
90 * not cover the whole spectrum of possible table-driven implementations.
91 * Why? In the non-"Shoup's" case the memory access pattern is segmented in
92 * such a way that cache-timing information can trivially reveal a fair
93 * portion of the intermediate hash value. Given that the ciphertext is
94 * always available to an attacker, the attacker could attempt to deduce
95 * the secret parameter H and, if successful, tamper with messages [which
96 * is utterly trivial in CTR mode]. In the "Shoup's" case this is not as
97 * easy, but there is no reason to believe it is resistant to cache-timing
98 * attacks either. The catch with the "8-bit" implementation is that it
99 * consumes 16 (sixteen) times more memory, 4KB per individual key + 1KB
100 * shared. On the plus side it should be about twice as fast as the
101 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit" version
102 * was observed to run ~75% faster, closer to 100% for commercial
103 * compilers... Yet the "4-bit" procedure is preferred, because it is
104 * believed to provide a better security/performance balance and adequate
105 * all-round performance. "All-round" refers to things like:
107 * - shorter setup time, which effectively improves overall timing for
108 *   handling short messages;
109 * - larger table allocations can become unbearable because of VM
110 *   subsystem penalties (for example, on Windows a large enough free
111 *   results in VM working-set trimming, meaning that a subsequent
112 *   malloc would immediately incur working-set expansion);
113 * - a larger table has a larger cache footprint, which can affect the
114 *   performance of other code paths (not necessarily even in the same
115 *   thread in a Hyper-Threading world);
117 * A value of 1 is not appropriate for performance reasons.
119 #if TABLE_BITS==8
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
123 int i, j;
124 u128 V;
126 Htable[0].hi = 0;
127 Htable[0].lo = 0;
128 V.hi = H[0];
129 V.lo = H[1];
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
132 REDUCE1BIT(V);
133 Htable[i] = V;
136 for (i=2; i<256; i<<=1) {
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
147 u128 Z = { 0, 0};
148 const u8 *xi = (const u8 *)Xi+15;
149 size_t rem, n = *xi;
150 static const size_t rem_8bit[256] = {
151 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216 while (1) {
217 Z.hi ^= Htable[n].hi;
218 Z.lo ^= Htable[n].lo;
220 if ((u8 *)Xi==xi) break;
222 n = *(--xi);
224 rem = (size_t)Z.lo&0xff;
225 Z.lo = (Z.hi<<56)|(Z.lo>>8);
226 Z.hi = (Z.hi>>8);
227 #if SIZE_MAX == 0xffffffffffffffff
228 Z.hi ^= rem_8bit[rem];
229 #else
230 Z.hi ^= (u64)rem_8bit[rem]<<32;
231 #endif
234 #if BYTE_ORDER == LITTLE_ENDIAN
235 #ifdef BSWAP8
236 Xi[0] = BSWAP8(Z.hi);
237 Xi[1] = BSWAP8(Z.lo);
238 #else
239 u8 *p = (u8 *)Xi;
240 u32 v;
241 v = (u32)(Z.hi>>32); PUTU32(p,v);
242 v = (u32)(Z.hi); PUTU32(p+4,v);
243 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
244 v = (u32)(Z.lo); PUTU32(p+12,v);
245 #endif
246 #else /* BIG_ENDIAN */
247 Xi[0] = Z.hi;
248 Xi[1] = Z.lo;
249 #endif
251 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253 #elif TABLE_BITS==4
255 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 u128 V;
258 #if defined(OPENSSL_SMALL_FOOTPRINT)
259 int i;
260 #endif
262 Htable[0].hi = 0;
263 Htable[0].lo = 0;
264 V.hi = H[0];
265 V.lo = H[1];
267 #if defined(OPENSSL_SMALL_FOOTPRINT)
268 for (Htable[8]=V, i=4; i>0; i>>=1) {
269 REDUCE1BIT(V);
270 Htable[i] = V;
273 for (i=2; i<16; i<<=1) {
274 u128 *Hi = Htable+i;
275 int j;
276 for (V=*Hi, j=1; j<i; ++j) {
277 Hi[j].hi = V.hi^Htable[j].hi;
278 Hi[j].lo = V.lo^Htable[j].lo;
281 #else
282 Htable[8] = V;
283 REDUCE1BIT(V);
284 Htable[4] = V;
285 REDUCE1BIT(V);
286 Htable[2] = V;
287 REDUCE1BIT(V);
288 Htable[1] = V;
289 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
290 V=Htable[4];
291 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
292 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
293 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
294 V=Htable[8];
295 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
296 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302 #endif
303 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305 * ARM assembler expects specific dword order in Htable.
308 int j;
309 #if BYTE_ORDER == LITTLE_ENDIAN
310 for (j=0;j<16;++j) {
311 V = Htable[j];
312 Htable[j].hi = V.lo;
313 Htable[j].lo = V.hi;
315 #else /* BIG_ENDIAN */
316 for (j=0;j<16;++j) {
317 V = Htable[j];
318 Htable[j].hi = V.lo<<32|V.lo>>32;
319 Htable[j].lo = V.hi<<32|V.hi>>32;
321 #endif
323 #endif
326 #ifndef GHASH_ASM
327 static const size_t rem_4bit[16] = {
328 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
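/*
 * rem_4bit[r] is the reduction contribution of the four coefficient bits r
 * that drop off the right end during one 4-bit shift, pre-shifted into the
 * top 16 bits by PACK().  One way the 16-bit constants fed to PACK() can be
 * derived, shown as a sketch assuming <stdint.h> (kept out of the build):
 */
#if 0
static void
rem_4bit_sketch(uint16_t tab[16])
{
	int r, j;

	for (r = 0; r < 16; r++) {
		uint16_t v = 0;

		for (j = 0; j < 4; j++)
			if (r & (1 << j))
				v ^= (uint16_t)(0xe100 >> (3 - j));
		tab[r] = v;	/* e.g. tab[1] == 0x1C20, tab[8] == 0xE100 */
	}
}
#endif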
333 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
335 u128 Z;
336 int cnt = 15;
337 size_t rem, nlo, nhi;
339 nlo = ((const u8 *)Xi)[15];
340 nhi = nlo>>4;
341 nlo &= 0xf;
343 Z.hi = Htable[nlo].hi;
344 Z.lo = Htable[nlo].lo;
346 while (1) {
347 rem = (size_t)Z.lo&0xf;
348 Z.lo = (Z.hi<<60)|(Z.lo>>4);
349 Z.hi = (Z.hi>>4);
350 #if SIZE_MAX == 0xffffffffffffffff
351 Z.hi ^= rem_4bit[rem];
352 #else
353 Z.hi ^= (u64)rem_4bit[rem]<<32;
354 #endif
355 Z.hi ^= Htable[nhi].hi;
356 Z.lo ^= Htable[nhi].lo;
358 if (--cnt<0) break;
360 nlo = ((const u8 *)Xi)[cnt];
361 nhi = nlo>>4;
362 nlo &= 0xf;
364 rem = (size_t)Z.lo&0xf;
365 Z.lo = (Z.hi<<60)|(Z.lo>>4);
366 Z.hi = (Z.hi>>4);
367 #if SIZE_MAX == 0xffffffffffffffff
368 Z.hi ^= rem_4bit[rem];
369 #else
370 Z.hi ^= (u64)rem_4bit[rem]<<32;
371 #endif
372 Z.hi ^= Htable[nlo].hi;
373 Z.lo ^= Htable[nlo].lo;
376 #if BYTE_ORDER == LITTLE_ENDIAN
377 #ifdef BSWAP8
378 Xi[0] = BSWAP8(Z.hi);
379 Xi[1] = BSWAP8(Z.lo);
380 #else
381 u8 *p = (u8 *)Xi;
382 u32 v;
383 v = (u32)(Z.hi>>32); PUTU32(p,v);
384 v = (u32)(Z.hi); PUTU32(p+4,v);
385 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
386 v = (u32)(Z.lo); PUTU32(p+12,v);
387 #endif
388 #else /* BIG_ENDIAN */
389 Xi[0] = Z.hi;
390 Xi[1] = Z.lo;
391 #endif
394 #if !defined(OPENSSL_SMALL_FOOTPRINT)
396 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
397 * details... Compiler-generated code doesn't seem to give any
398 * performance improvement, at least not on x86[_64]. It's here
399 * mostly as reference and a placeholder for possible future
400 * non-trivial optimization[s]...
402 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
403 const u8 *inp,size_t len)
405 u128 Z;
406 int cnt;
407 size_t rem, nlo, nhi;
409 #if 1
410 do {
411 cnt = 15;
412 nlo = ((const u8 *)Xi)[15];
413 nlo ^= inp[15];
414 nhi = nlo>>4;
415 nlo &= 0xf;
417 Z.hi = Htable[nlo].hi;
418 Z.lo = Htable[nlo].lo;
420 while (1) {
421 rem = (size_t)Z.lo&0xf;
422 Z.lo = (Z.hi<<60)|(Z.lo>>4);
423 Z.hi = (Z.hi>>4);
424 #if SIZE_MAX == 0xffffffffffffffff
425 Z.hi ^= rem_4bit[rem];
426 #else
427 Z.hi ^= (u64)rem_4bit[rem]<<32;
428 #endif
429 Z.hi ^= Htable[nhi].hi;
430 Z.lo ^= Htable[nhi].lo;
432 if (--cnt<0) break;
434 nlo = ((const u8 *)Xi)[cnt];
435 nlo ^= inp[cnt];
436 nhi = nlo>>4;
437 nlo &= 0xf;
439 rem = (size_t)Z.lo&0xf;
440 Z.lo = (Z.hi<<60)|(Z.lo>>4);
441 Z.hi = (Z.hi>>4);
442 #if SIZE_MAX == 0xffffffffffffffff
443 Z.hi ^= rem_4bit[rem];
444 #else
445 Z.hi ^= (u64)rem_4bit[rem]<<32;
446 #endif
447 Z.hi ^= Htable[nlo].hi;
448 Z.lo ^= Htable[nlo].lo;
450 #else
452 * Extra 256+16 bytes per-key plus 512 bytes shared tables
453 * [should] give ~50% improvement... One could have PACK()-ed
454 * the rem_8bit even here, but the priority is to minimize
455 * cache footprint...
457 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
458 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
459 static const unsigned short rem_8bit[256] = {
460 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
461 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
462 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
463 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
464 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
465 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
466 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
467 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
468 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
469 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
470 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
471 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
472 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
473 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
474 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
475 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
476 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
477 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
478 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
479 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
480 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
481 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
482 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
483 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
484 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
485 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
486 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
487 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
488 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
489 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
490 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
491 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
493 * This pre-processing phase slows the procedure down by approximately
494 * the same amount of time as it speeds up each loop iteration. In other
495 * words, single-block performance is approximately the same as with the
496 * straightforward "4-bit" implementation, and from there it only gets faster...
498 for (cnt=0; cnt<16; ++cnt) {
499 Z.hi = Htable[cnt].hi;
500 Z.lo = Htable[cnt].lo;
501 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
502 Hshr4[cnt].hi = (Z.hi>>4);
503 Hshl4[cnt] = (u8)(Z.lo<<4);
506 do {
507 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
508 nlo = ((const u8 *)Xi)[cnt];
509 nlo ^= inp[cnt];
510 nhi = nlo>>4;
511 nlo &= 0xf;
513 Z.hi ^= Htable[nlo].hi;
514 Z.lo ^= Htable[nlo].lo;
516 rem = (size_t)Z.lo&0xff;
518 Z.lo = (Z.hi<<56)|(Z.lo>>8);
519 Z.hi = (Z.hi>>8);
521 Z.hi ^= Hshr4[nhi].hi;
522 Z.lo ^= Hshr4[nhi].lo;
523 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
526 nlo = ((const u8 *)Xi)[0];
527 nlo ^= inp[0];
528 nhi = nlo>>4;
529 nlo &= 0xf;
531 Z.hi ^= Htable[nlo].hi;
532 Z.lo ^= Htable[nlo].lo;
534 rem = (size_t)Z.lo&0xf;
536 Z.lo = (Z.hi<<60)|(Z.lo>>4);
537 Z.hi = (Z.hi>>4);
539 Z.hi ^= Htable[nhi].hi;
540 Z.lo ^= Htable[nhi].lo;
541 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
542 #endif
544 #if BYTE_ORDER == LITTLE_ENDIAN
545 #ifdef BSWAP8
546 Xi[0] = BSWAP8(Z.hi);
547 Xi[1] = BSWAP8(Z.lo);
548 #else
549 u8 *p = (u8 *)Xi;
550 u32 v;
551 v = (u32)(Z.hi>>32); PUTU32(p,v);
552 v = (u32)(Z.hi); PUTU32(p+4,v);
553 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
554 v = (u32)(Z.lo); PUTU32(p+12,v);
555 #endif
556 #else /* BIG_ENDIAN */
557 Xi[0] = Z.hi;
558 Xi[1] = Z.lo;
559 #endif
560 } while (inp+=16, len-=16);
562 #endif
563 #else
564 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
565 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
566 #endif
568 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
569 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
570 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
571 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache-
572 * thrashing effect. In other words, the idea is to hash data while it is
573 * still in the L1 cache after the encryption pass... */
574 #define GHASH_CHUNK (3*1024)
575 #endif
577 #else /* TABLE_BITS */
579 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
581 u128 V,Z = { 0,0 };
582 long X;
583 int i,j;
584 const long *xi = (const long *)Xi;
586 V.hi = H[0]; /* H is in host byte order, no byte swapping */
587 V.lo = H[1];
589 for (j=0; j<16/sizeof(long); ++j) {
590 #if BYTE_ORDER == LITTLE_ENDIAN
591 #if SIZE_MAX == 0xffffffffffffffff
592 #ifdef BSWAP8
593 X = (long)(BSWAP8(xi[j]));
594 #else
595 const u8 *p = (const u8 *)(xi+j);
596 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
597 #endif
598 #else
599 const u8 *p = (const u8 *)(xi+j);
600 X = (long)GETU32(p);
601 #endif
602 #else /* BIG_ENDIAN */
603 X = xi[j];
604 #endif
606 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
607 u64 M = (u64)(X>>(8*sizeof(long)-1));
608 Z.hi ^= V.hi&M;
609 Z.lo ^= V.lo&M;
611 REDUCE1BIT(V);
615 #if BYTE_ORDER == LITTLE_ENDIAN
616 #ifdef BSWAP8
617 Xi[0] = BSWAP8(Z.hi);
618 Xi[1] = BSWAP8(Z.lo);
619 #else
620 u8 *p = (u8 *)Xi;
621 u32 v;
622 v = (u32)(Z.hi>>32); PUTU32(p,v);
623 v = (u32)(Z.hi); PUTU32(p+4,v);
624 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
625 v = (u32)(Z.lo); PUTU32(p+12,v);
626 #endif
627 #else /* BIG_ENDIAN */
628 Xi[0] = Z.hi;
629 Xi[1] = Z.lo;
630 #endif
632 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
634 #endif
636 #if defined(GHASH_ASM) && \
637 (defined(__i386) || defined(__i386__) || \
638 defined(__x86_64) || defined(__x86_64__) || \
639 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
640 #include "x86_arch.h"
641 #endif
643 #if TABLE_BITS==4 && defined(GHASH_ASM)
644 # if (defined(__i386) || defined(__i386__) || \
645 defined(__x86_64) || defined(__x86_64__) || \
646 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
647 # define GHASH_ASM_X86_OR_64
648 # define GCM_FUNCREF_4BIT
650 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
651 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
652 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
654 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
655 # define GHASH_ASM_X86
656 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
657 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
659 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
660 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 # endif
662 # elif defined(__arm__) || defined(__arm)
663 # include "arm_arch.h"
664 # if __ARM_ARCH__>=7
665 # define GHASH_ASM_ARM
666 # define GCM_FUNCREF_4BIT
667 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
668 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
669 # endif
670 # endif
671 #endif
673 #ifdef GCM_FUNCREF_4BIT
674 # undef GCM_MUL
675 # define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
676 # ifdef GHASH
677 # undef GHASH
678 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
679 # endif
680 #endif
682 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
684 memset(ctx,0,sizeof(*ctx));
685 ctx->block = block;
686 ctx->key = key;
688 (*block)(ctx->H.c,ctx->H.c,key);
690 #if BYTE_ORDER == LITTLE_ENDIAN
691 /* H is stored in host byte order */
692 #ifdef BSWAP8
693 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
694 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
695 #else
696 u8 *p = ctx->H.c;
697 u64 hi,lo;
698 hi = (u64)GETU32(p) <<32|GETU32(p+4);
699 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
700 ctx->H.u[0] = hi;
701 ctx->H.u[1] = lo;
702 #endif
703 #endif
705 #if TABLE_BITS==8
706 gcm_init_8bit(ctx->Htable,ctx->H.u);
707 #elif TABLE_BITS==4
708 # if defined(GHASH_ASM_X86_OR_64)
709 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
710 /* check FXSR and PCLMULQDQ bits */
711 if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
712 (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
713 gcm_init_clmul(ctx->Htable,ctx->H.u);
714 ctx->gmult = gcm_gmult_clmul;
715 ctx->ghash = gcm_ghash_clmul;
716 return;
718 # endif
719 gcm_init_4bit(ctx->Htable,ctx->H.u);
720 # if defined(GHASH_ASM_X86) /* x86 only */
721 # if defined(OPENSSL_IA32_SSE2)
722 if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) { /* check SSE bit */
723 # else
724 if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) { /* check MMX bit */
725 # endif
726 ctx->gmult = gcm_gmult_4bit_mmx;
727 ctx->ghash = gcm_ghash_4bit_mmx;
728 } else {
729 ctx->gmult = gcm_gmult_4bit_x86;
730 ctx->ghash = gcm_ghash_4bit_x86;
732 # else
733 ctx->gmult = gcm_gmult_4bit;
734 ctx->ghash = gcm_ghash_4bit;
735 # endif
736 # elif defined(GHASH_ASM_ARM)
737 if (OPENSSL_armcap_P & ARMV7_NEON) {
738 ctx->gmult = gcm_gmult_neon;
739 ctx->ghash = gcm_ghash_neon;
740 } else {
741 gcm_init_4bit(ctx->Htable,ctx->H.u);
742 ctx->gmult = gcm_gmult_4bit;
743 ctx->ghash = gcm_ghash_4bit;
745 # else
746 gcm_init_4bit(ctx->Htable,ctx->H.u);
747 # endif
748 #endif
751 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
753 unsigned int ctr;
754 #ifdef GCM_FUNCREF_4BIT
755 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
756 #endif
758 ctx->Yi.u[0] = 0;
759 ctx->Yi.u[1] = 0;
760 ctx->Xi.u[0] = 0;
761 ctx->Xi.u[1] = 0;
762 ctx->len.u[0] = 0; /* AAD length */
763 ctx->len.u[1] = 0; /* message length */
764 ctx->ares = 0;
765 ctx->mres = 0;
767 if (len==12) {
768 memcpy(ctx->Yi.c,iv,12);
769 ctx->Yi.c[15]=1;
770 ctr=1;
772 else {
773 size_t i;
774 u64 len0 = len;
776 while (len>=16) {
777 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
778 GCM_MUL(ctx,Yi);
779 iv += 16;
780 len -= 16;
782 if (len) {
783 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
784 GCM_MUL(ctx,Yi);
786 len0 <<= 3;
787 #if BYTE_ORDER == LITTLE_ENDIAN
788 #ifdef BSWAP8
789 ctx->Yi.u[1] ^= BSWAP8(len0);
790 #else
791 ctx->Yi.c[8] ^= (u8)(len0>>56);
792 ctx->Yi.c[9] ^= (u8)(len0>>48);
793 ctx->Yi.c[10] ^= (u8)(len0>>40);
794 ctx->Yi.c[11] ^= (u8)(len0>>32);
795 ctx->Yi.c[12] ^= (u8)(len0>>24);
796 ctx->Yi.c[13] ^= (u8)(len0>>16);
797 ctx->Yi.c[14] ^= (u8)(len0>>8);
798 ctx->Yi.c[15] ^= (u8)(len0);
799 #endif
800 #else /* BIG_ENDIAN */
801 ctx->Yi.u[1] ^= len0;
802 #endif
804 GCM_MUL(ctx,Yi);
806 #if BYTE_ORDER == LITTLE_ENDIAN
807 #ifdef BSWAP4
808 ctr = BSWAP4(ctx->Yi.d[3]);
809 #else
810 ctr = GETU32(ctx->Yi.c+12);
811 #endif
812 #else /* BIG_ENDIAN */
813 ctr = ctx->Yi.d[3];
814 #endif
817 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
818 ++ctr;
819 #if BYTE_ORDER == LITTLE_ENDIAN
820 #ifdef BSWAP4
821 ctx->Yi.d[3] = BSWAP4(ctr);
822 #else
823 PUTU32(ctx->Yi.c+12,ctr);
824 #endif
825 #else /* BIG_ENDIAN */
826 ctx->Yi.d[3] = ctr;
827 #endif
830 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
832 size_t i;
833 unsigned int n;
834 u64 alen = ctx->len.u[0];
835 #ifdef GCM_FUNCREF_4BIT
836 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
837 # ifdef GHASH
838 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
839 const u8 *inp,size_t len) = ctx->ghash;
840 # endif
841 #endif
843 if (ctx->len.u[1]) return -2;
845 alen += len;
846 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
847 return -1;
848 ctx->len.u[0] = alen;
850 n = ctx->ares;
851 if (n) {
852 while (n && len) {
853 ctx->Xi.c[n] ^= *(aad++);
854 --len;
855 n = (n+1)%16;
857 if (n==0) GCM_MUL(ctx,Xi);
858 else {
859 ctx->ares = n;
860 return 0;
864 #ifdef GHASH
865 if ((i = (len&(size_t)-16))) {
866 GHASH(ctx,aad,i);
867 aad += i;
868 len -= i;
870 #else
871 while (len>=16) {
872 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
873 GCM_MUL(ctx,Xi);
874 aad += 16;
875 len -= 16;
877 #endif
878 if (len) {
879 n = (unsigned int)len;
880 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
883 ctx->ares = n;
884 return 0;
887 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
888 const unsigned char *in, unsigned char *out,
889 size_t len)
891 unsigned int n, ctr;
892 size_t i;
893 u64 mlen = ctx->len.u[1];
894 block128_f block = ctx->block;
895 void *key = ctx->key;
896 #ifdef GCM_FUNCREF_4BIT
897 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
898 # ifdef GHASH
899 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
900 const u8 *inp,size_t len) = ctx->ghash;
901 # endif
902 #endif
904 mlen += len;
905 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
906 return -1;
907 ctx->len.u[1] = mlen;
909 if (ctx->ares) {
910 /* First call to encrypt finalizes GHASH(AAD) */
911 GCM_MUL(ctx,Xi);
912 ctx->ares = 0;
915 #if BYTE_ORDER == LITTLE_ENDIAN
916 #ifdef BSWAP4
917 ctr = BSWAP4(ctx->Yi.d[3]);
918 #else
919 ctr = GETU32(ctx->Yi.c+12);
920 #endif
921 #else /* BIG_ENDIAN */
922 ctr = ctx->Yi.d[3];
923 #endif
925 n = ctx->mres;
926 #if !defined(OPENSSL_SMALL_FOOTPRINT)
927 if (16%sizeof(size_t) == 0) do { /* always true actually */
928 if (n) {
929 while (n && len) {
930 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
931 --len;
932 n = (n+1)%16;
934 if (n==0) GCM_MUL(ctx,Xi);
935 else {
936 ctx->mres = n;
937 return 0;
940 #ifdef __STRICT_ALIGNMENT
941 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
942 break;
943 #endif
944 #if defined(GHASH) && defined(GHASH_CHUNK)
945 while (len>=GHASH_CHUNK) {
946 size_t j=GHASH_CHUNK;
948 while (j) {
949 size_t *out_t=(size_t *)out;
950 const size_t *in_t=(const size_t *)in;
952 (*block)(ctx->Yi.c,ctx->EKi.c,key);
953 ++ctr;
954 #if BYTE_ORDER == LITTLE_ENDIAN
955 #ifdef BSWAP4
956 ctx->Yi.d[3] = BSWAP4(ctr);
957 #else
958 PUTU32(ctx->Yi.c+12,ctr);
959 #endif
960 #else /* BIG_ENDIAN */
961 ctx->Yi.d[3] = ctr;
962 #endif
963 for (i=0; i<16/sizeof(size_t); ++i)
964 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
965 out += 16;
966 in += 16;
967 j -= 16;
969 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
970 len -= GHASH_CHUNK;
972 if ((i = (len&(size_t)-16))) {
973 size_t j=i;
975 while (len>=16) {
976 size_t *out_t=(size_t *)out;
977 const size_t *in_t=(const size_t *)in;
979 (*block)(ctx->Yi.c,ctx->EKi.c,key);
980 ++ctr;
981 #if BYTE_ORDER == LITTLE_ENDIAN
982 #ifdef BSWAP4
983 ctx->Yi.d[3] = BSWAP4(ctr);
984 #else
985 PUTU32(ctx->Yi.c+12,ctr);
986 #endif
987 #else /* BIG_ENDIAN */
988 ctx->Yi.d[3] = ctr;
989 #endif
990 for (i=0; i<16/sizeof(size_t); ++i)
991 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
992 out += 16;
993 in += 16;
994 len -= 16;
996 GHASH(ctx,out-j,j);
998 #else
999 while (len>=16) {
1000 size_t *out_t=(size_t *)out;
1001 const size_t *in_t=(const size_t *)in;
1003 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1004 ++ctr;
1005 #if BYTE_ORDER == LITTLE_ENDIAN
1006 #ifdef BSWAP4
1007 ctx->Yi.d[3] = BSWAP4(ctr);
1008 #else
1009 PUTU32(ctx->Yi.c+12,ctr);
1010 #endif
1011 #else /* BIG_ENDIAN */
1012 ctx->Yi.d[3] = ctr;
1013 #endif
1014 for (i=0; i<16/sizeof(size_t); ++i)
1015 ctx->Xi.t[i] ^=
1016 out_t[i] = in_t[i]^ctx->EKi.t[i];
1017 GCM_MUL(ctx,Xi);
1018 out += 16;
1019 in += 16;
1020 len -= 16;
1022 #endif
1023 if (len) {
1024 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1025 ++ctr;
1026 #if BYTE_ORDER == LITTLE_ENDIAN
1027 #ifdef BSWAP4
1028 ctx->Yi.d[3] = BSWAP4(ctr);
1029 #else
1030 PUTU32(ctx->Yi.c+12,ctr);
1031 #endif
1032 #else /* BIG_ENDIAN */
1033 ctx->Yi.d[3] = ctr;
1034 #endif
1035 while (len--) {
1036 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1037 ++n;
1041 ctx->mres = n;
1042 return 0;
1043 } while(0);
1044 #endif
1045 for (i=0;i<len;++i) {
1046 if (n==0) {
1047 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1048 ++ctr;
1049 #if BYTE_ORDER == LITTLE_ENDIAN
1050 #ifdef BSWAP4
1051 ctx->Yi.d[3] = BSWAP4(ctr);
1052 #else
1053 PUTU32(ctx->Yi.c+12,ctr);
1054 #endif
1055 #else /* BIG_ENDIAN */
1056 ctx->Yi.d[3] = ctr;
1057 #endif
1059 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1060 n = (n+1)%16;
1061 if (n==0)
1062 GCM_MUL(ctx,Xi);
1065 ctx->mres = n;
1066 return 0;
1069 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1070 const unsigned char *in, unsigned char *out,
1071 size_t len)
1073 unsigned int n, ctr;
1074 size_t i;
1075 u64 mlen = ctx->len.u[1];
1076 block128_f block = ctx->block;
1077 void *key = ctx->key;
1078 #ifdef GCM_FUNCREF_4BIT
1079 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1080 # ifdef GHASH
1081 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1082 const u8 *inp,size_t len) = ctx->ghash;
1083 # endif
1084 #endif
1086 mlen += len;
1087 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1088 return -1;
1089 ctx->len.u[1] = mlen;
1091 if (ctx->ares) {
1092 /* First call to decrypt finalizes GHASH(AAD) */
1093 GCM_MUL(ctx,Xi);
1094 ctx->ares = 0;
1097 #if BYTE_ORDER == LITTLE_ENDIAN
1098 #ifdef BSWAP4
1099 ctr = BSWAP4(ctx->Yi.d[3]);
1100 #else
1101 ctr = GETU32(ctx->Yi.c+12);
1102 #endif
1103 #else /* BIG_ENDIAN */
1104 ctr = ctx->Yi.d[3];
1105 #endif
1107 n = ctx->mres;
1108 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1109 if (16%sizeof(size_t) == 0) do { /* always true actually */
1110 if (n) {
1111 while (n && len) {
1112 u8 c = *(in++);
1113 *(out++) = c^ctx->EKi.c[n];
1114 ctx->Xi.c[n] ^= c;
1115 --len;
1116 n = (n+1)%16;
1118 if (n==0) GCM_MUL (ctx,Xi);
1119 else {
1120 ctx->mres = n;
1121 return 0;
1124 #ifdef __STRICT_ALIGNMENT
1125 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1126 break;
1127 #endif
1128 #if defined(GHASH) && defined(GHASH_CHUNK)
1129 while (len>=GHASH_CHUNK) {
1130 size_t j=GHASH_CHUNK;
1132 GHASH(ctx,in,GHASH_CHUNK);
1133 while (j) {
1134 size_t *out_t=(size_t *)out;
1135 const size_t *in_t=(const size_t *)in;
1137 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1138 ++ctr;
1139 #if BYTE_ORDER == LITTLE_ENDIAN
1140 #ifdef BSWAP4
1141 ctx->Yi.d[3] = BSWAP4(ctr);
1142 #else
1143 PUTU32(ctx->Yi.c+12,ctr);
1144 #endif
1145 #else /* BIG_ENDIAN */
1146 ctx->Yi.d[3] = ctr;
1147 #endif
1148 for (i=0; i<16/sizeof(size_t); ++i)
1149 out_t[i] = in_t[i]^ctx->EKi.t[i];
1150 out += 16;
1151 in += 16;
1152 j -= 16;
1154 len -= GHASH_CHUNK;
1156 if ((i = (len&(size_t)-16))) {
1157 GHASH(ctx,in,i);
1158 while (len>=16) {
1159 size_t *out_t=(size_t *)out;
1160 const size_t *in_t=(const size_t *)in;
1162 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1163 ++ctr;
1164 #if BYTE_ORDER == LITTLE_ENDIAN
1165 #ifdef BSWAP4
1166 ctx->Yi.d[3] = BSWAP4(ctr);
1167 #else
1168 PUTU32(ctx->Yi.c+12,ctr);
1169 #endif
1170 #else /* BIG_ENDIAN */
1171 ctx->Yi.d[3] = ctr;
1172 #endif
1173 for (i=0; i<16/sizeof(size_t); ++i)
1174 out_t[i] = in_t[i]^ctx->EKi.t[i];
1175 out += 16;
1176 in += 16;
1177 len -= 16;
1180 #else
1181 while (len>=16) {
1182 size_t *out_t=(size_t *)out;
1183 const size_t *in_t=(const size_t *)in;
1185 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1186 ++ctr;
1187 #if BYTE_ORDER == LITTLE_ENDIAN
1188 #ifdef BSWAP4
1189 ctx->Yi.d[3] = BSWAP4(ctr);
1190 #else
1191 PUTU32(ctx->Yi.c+12,ctr);
1192 #endif
1193 #else /* BIG_ENDIAN */
1194 ctx->Yi.d[3] = ctr;
1195 #endif
1196 for (i=0; i<16/sizeof(size_t); ++i) {
1197 size_t c = in[i];
1198 out[i] = c^ctx->EKi.t[i];
1199 ctx->Xi.t[i] ^= c;
1201 GCM_MUL(ctx,Xi);
1202 out += 16;
1203 in += 16;
1204 len -= 16;
1206 #endif
1207 if (len) {
1208 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1209 ++ctr;
1210 #if BYTE_ORDER == LITTLE_ENDIAN
1211 #ifdef BSWAP4
1212 ctx->Yi.d[3] = BSWAP4(ctr);
1213 #else
1214 PUTU32(ctx->Yi.c+12,ctr);
1215 #endif
1216 #else /* BIG_ENDIAN */
1217 ctx->Yi.d[3] = ctr;
1218 #endif
1219 while (len--) {
1220 u8 c = in[n];
1221 ctx->Xi.c[n] ^= c;
1222 out[n] = c^ctx->EKi.c[n];
1223 ++n;
1227 ctx->mres = n;
1228 return 0;
1229 } while(0);
1230 #endif
1231 for (i=0;i<len;++i) {
1232 u8 c;
1233 if (n==0) {
1234 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1235 ++ctr;
1236 #if BYTE_ORDER == LITTLE_ENDIAN
1237 #ifdef BSWAP4
1238 ctx->Yi.d[3] = BSWAP4(ctr);
1239 #else
1240 PUTU32(ctx->Yi.c+12,ctr);
1241 #endif
1242 #else /* BIG_ENDIAN */
1243 ctx->Yi.d[3] = ctr;
1244 #endif
1246 c = in[i];
1247 out[i] = c^ctx->EKi.c[n];
1248 ctx->Xi.c[n] ^= c;
1249 n = (n+1)%16;
1250 if (n==0)
1251 GCM_MUL(ctx,Xi);
1254 ctx->mres = n;
1255 return 0;
1258 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1259 const unsigned char *in, unsigned char *out,
1260 size_t len, ctr128_f stream)
1262 unsigned int n, ctr;
1263 size_t i;
1264 u64 mlen = ctx->len.u[1];
1265 void *key = ctx->key;
1266 #ifdef GCM_FUNCREF_4BIT
1267 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1268 # ifdef GHASH
1269 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1270 const u8 *inp,size_t len) = ctx->ghash;
1271 # endif
1272 #endif
1274 mlen += len;
1275 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1276 return -1;
1277 ctx->len.u[1] = mlen;
1279 if (ctx->ares) {
1280 /* First call to encrypt finalizes GHASH(AAD) */
1281 GCM_MUL(ctx,Xi);
1282 ctx->ares = 0;
1285 #if BYTE_ORDER == LITTLE_ENDIAN
1286 #ifdef BSWAP4
1287 ctr = BSWAP4(ctx->Yi.d[3]);
1288 #else
1289 ctr = GETU32(ctx->Yi.c+12);
1290 #endif
1291 #else /* BIG_ENDIAN */
1292 ctr = ctx->Yi.d[3];
1293 #endif
1295 n = ctx->mres;
1296 if (n) {
1297 while (n && len) {
1298 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1299 --len;
1300 n = (n+1)%16;
1302 if (n==0) GCM_MUL(ctx,Xi);
1303 else {
1304 ctx->mres = n;
1305 return 0;
1308 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1309 while (len>=GHASH_CHUNK) {
1310 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1311 ctr += GHASH_CHUNK/16;
1312 #if BYTE_ORDER == LITTLE_ENDIAN
1313 #ifdef BSWAP4
1314 ctx->Yi.d[3] = BSWAP4(ctr);
1315 #else
1316 PUTU32(ctx->Yi.c+12,ctr);
1317 #endif
1318 #else /* BIG_ENDIAN */
1319 ctx->Yi.d[3] = ctr;
1320 #endif
1321 GHASH(ctx,out,GHASH_CHUNK);
1322 out += GHASH_CHUNK;
1323 in += GHASH_CHUNK;
1324 len -= GHASH_CHUNK;
1326 #endif
1327 if ((i = (len&(size_t)-16))) {
1328 size_t j=i/16;
1330 (*stream)(in,out,j,key,ctx->Yi.c);
1331 ctr += (unsigned int)j;
1332 #if BYTE_ORDER == LITTLE_ENDIAN
1333 #ifdef BSWAP4
1334 ctx->Yi.d[3] = BSWAP4(ctr);
1335 #else
1336 PUTU32(ctx->Yi.c+12,ctr);
1337 #endif
1338 #else /* BIG_ENDIAN */
1339 ctx->Yi.d[3] = ctr;
1340 #endif
1341 in += i;
1342 len -= i;
1343 #if defined(GHASH)
1344 GHASH(ctx,out,i);
1345 out += i;
1346 #else
1347 while (j--) {
1348 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1349 GCM_MUL(ctx,Xi);
1350 out += 16;
1352 #endif
1354 if (len) {
1355 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1356 ++ctr;
1357 #if BYTE_ORDER == LITTLE_ENDIAN
1358 #ifdef BSWAP4
1359 ctx->Yi.d[3] = BSWAP4(ctr);
1360 #else
1361 PUTU32(ctx->Yi.c+12,ctr);
1362 #endif
1363 #else /* BIG_ENDIAN */
1364 ctx->Yi.d[3] = ctr;
1365 #endif
1366 while (len--) {
1367 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1368 ++n;
1372 ctx->mres = n;
1373 return 0;
1376 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1377 const unsigned char *in, unsigned char *out,
1378 size_t len,ctr128_f stream)
1380 unsigned int n, ctr;
1381 size_t i;
1382 u64 mlen = ctx->len.u[1];
1383 void *key = ctx->key;
1384 #ifdef GCM_FUNCREF_4BIT
1385 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1386 # ifdef GHASH
1387 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1388 const u8 *inp,size_t len) = ctx->ghash;
1389 # endif
1390 #endif
1392 mlen += len;
1393 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1394 return -1;
1395 ctx->len.u[1] = mlen;
1397 if (ctx->ares) {
1398 /* First call to decrypt finalizes GHASH(AAD) */
1399 GCM_MUL(ctx,Xi);
1400 ctx->ares = 0;
1403 #if BYTE_ORDER == LITTLE_ENDIAN
1404 #ifdef BSWAP4
1405 ctr = BSWAP4(ctx->Yi.d[3]);
1406 #else
1407 ctr = GETU32(ctx->Yi.c+12);
1408 #endif
1409 #else /* BIG_ENDIAN */
1410 ctr = ctx->Yi.d[3];
1411 #endif
1413 n = ctx->mres;
1414 if (n) {
1415 while (n && len) {
1416 u8 c = *(in++);
1417 *(out++) = c^ctx->EKi.c[n];
1418 ctx->Xi.c[n] ^= c;
1419 --len;
1420 n = (n+1)%16;
1422 if (n==0) GCM_MUL (ctx,Xi);
1423 else {
1424 ctx->mres = n;
1425 return 0;
1428 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1429 while (len>=GHASH_CHUNK) {
1430 GHASH(ctx,in,GHASH_CHUNK);
1431 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1432 ctr += GHASH_CHUNK/16;
1433 #if BYTE_ORDER == LITTLE_ENDIAN
1434 #ifdef BSWAP4
1435 ctx->Yi.d[3] = BSWAP4(ctr);
1436 #else
1437 PUTU32(ctx->Yi.c+12,ctr);
1438 #endif
1439 #else /* BIG_ENDIAN */
1440 ctx->Yi.d[3] = ctr;
1441 #endif
1442 out += GHASH_CHUNK;
1443 in += GHASH_CHUNK;
1444 len -= GHASH_CHUNK;
1446 #endif
1447 if ((i = (len&(size_t)-16))) {
1448 size_t j=i/16;
1450 #if defined(GHASH)
1451 GHASH(ctx,in,i);
1452 #else
1453 while (j--) {
1454 size_t k;
1455 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1456 GCM_MUL(ctx,Xi);
1457 in += 16;
1459 j = i/16;
1460 in -= i;
1461 #endif
1462 (*stream)(in,out,j,key,ctx->Yi.c);
1463 ctr += (unsigned int)j;
1464 #if BYTE_ORDER == LITTLE_ENDIAN
1465 #ifdef BSWAP4
1466 ctx->Yi.d[3] = BSWAP4(ctr);
1467 #else
1468 PUTU32(ctx->Yi.c+12,ctr);
1469 #endif
1470 #else /* BIG_ENDIAN */
1471 ctx->Yi.d[3] = ctr;
1472 #endif
1473 out += i;
1474 in += i;
1475 len -= i;
1477 if (len) {
1478 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1479 ++ctr;
1480 #if BYTE_ORDER == LITTLE_ENDIAN
1481 #ifdef BSWAP4
1482 ctx->Yi.d[3] = BSWAP4(ctr);
1483 #else
1484 PUTU32(ctx->Yi.c+12,ctr);
1485 #endif
1486 #else /* BIG_ENDIAN */
1487 ctx->Yi.d[3] = ctr;
1488 #endif
1489 while (len--) {
1490 u8 c = in[n];
1491 ctx->Xi.c[n] ^= c;
1492 out[n] = c^ctx->EKi.c[n];
1493 ++n;
1497 ctx->mres = n;
1498 return 0;
1501 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1502 size_t len)
1504 u64 alen = ctx->len.u[0]<<3;
1505 u64 clen = ctx->len.u[1]<<3;
1506 #ifdef GCM_FUNCREF_4BIT
1507 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1508 #endif
1510 if (ctx->mres || ctx->ares)
1511 GCM_MUL(ctx,Xi);
1513 #if BYTE_ORDER == LITTLE_ENDIAN
1514 #ifdef BSWAP8
1515 alen = BSWAP8(alen);
1516 clen = BSWAP8(clen);
1517 #else
1518 u8 *p = ctx->len.c;
1520 ctx->len.u[0] = alen;
1521 ctx->len.u[1] = clen;
1523 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1524 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1525 #endif
1526 #endif
1528 ctx->Xi.u[0] ^= alen;
1529 ctx->Xi.u[1] ^= clen;
1530 GCM_MUL(ctx,Xi);
1532 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1533 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1535 if (tag && len<=sizeof(ctx->Xi))
1536 return memcmp(ctx->Xi.c,tag,len);
1537 else
1538 return -1;
1541 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1543 CRYPTO_gcm128_finish(ctx, NULL, 0);
1544 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1547 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1549 GCM128_CONTEXT *ret;
1551 if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1552 CRYPTO_gcm128_init(ret,key,block);
1554 return ret;
1557 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1559 freezero(ctx, sizeof(*ctx));
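/*
 * Typical call sequence for the API above: init (or new), setiv, aad,
 * encrypt/decrypt, then tag or finish, and finally release.  A minimal
 * sketch for AES-128-GCM encryption, assuming AES_set_encrypt_key() and
 * AES_encrypt() from <openssl/aes.h> as the underlying block cipher (kept
 * out of the build; error handling trimmed).  For decryption, use
 * CRYPTO_gcm128_decrypt() and check CRYPTO_gcm128_finish(&ctx, tag, 16) == 0
 * instead of calling CRYPTO_gcm128_tag().
 */
#if 0
#include <openssl/aes.h>

static int
gcm128_encrypt_sketch(const unsigned char key[16], const unsigned char iv[12],
    const unsigned char *aad, size_t aadlen,
    const unsigned char *in, unsigned char *out, size_t len,
    unsigned char tag[16])
{
	AES_KEY ks;
	GCM128_CONTEXT ctx;

	if (AES_set_encrypt_key(key, 128, &ks) != 0)
		return -1;
	CRYPTO_gcm128_init(&ctx, &ks, (block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&ctx, iv, 12);
	if (aadlen && CRYPTO_gcm128_aad(&ctx, aad, aadlen) != 0)
		return -1;
	if (CRYPTO_gcm128_encrypt(&ctx, in, out, len) != 0)
		return -1;
	CRYPTO_gcm128_tag(&ctx, tag, 16);	/* 16-byte authentication tag */
	explicit_bzero(&ks, sizeof(ks));
	explicit_bzero(&ctx, sizeof(ctx));
	return 0;
}
#endif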