unleashed/tickless.git: lib/libcrypto/modes/gcm128.c
1 /* $OpenBSD: gcm128.c,v 1.20 2017/09/03 13:07:34 inoguchi Exp $ */
2 /* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
51 #define OPENSSL_FIPSAPI
53 #include <openssl/crypto.h>
54 #include "modes_lcl.h"
55 #include <string.h>
57 #ifndef MODES_DEBUG
58 # ifndef NDEBUG
59 # define NDEBUG
60 # endif
61 #endif
63 #if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef GETU32
66 #define GETU32(p) BSWAP4(*(const u32 *)(p))
67 #undef PUTU32
68 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
69 #endif
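/*
 * For reference, the generic GETU32/PUTU32 fall-back that the BSWAP4-based
 * redefinition above replaces amounts to a big-endian 32-bit load/store
 * assembled byte by byte.  A minimal sketch of that semantics, assuming
 * <stdint.h> (kept out of the build):
 */
#if 0
static uint32_t
getu32_ref(const unsigned char *p)
{
	/* most significant byte first, as GHASH/CTR expect */
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	    ((uint32_t)p[2] << 8) | (uint32_t)p[3];
}

static void
putu32_ref(unsigned char *p, uint32_t v)
{
	p[0] = (unsigned char)(v >> 24);
	p[1] = (unsigned char)(v >> 16);
	p[2] = (unsigned char)(v >> 8);
	p[3] = (unsigned char)v;
}
#endif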
71 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V) \
73 do { \
74 if (sizeof(size_t)==8) { \
75 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76 V.lo = (V.hi<<63)|(V.lo>>1); \
77 V.hi = (V.hi>>1 )^T; \
78 } else { \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
82 } \
83 } while(0)
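/*
 * REDUCE1BIT above multiplies by x (one bit position) in GF(2^128) as used
 * by GHASH: in GCM's reflected bit order that is a one-bit right shift of
 * the 128-bit value, and if the bit shifted out was set, the reduction
 * polynomial x^128 = x^7 + x^2 + x + 1 is folded back in as the constant
 * 0xe1 in the most significant byte.  A minimal standalone sketch of the
 * same step, assuming <stdint.h> (kept out of the build):
 */
#if 0
typedef struct { uint64_t hi, lo; } gf128_sketch;

static void
gf128_reduce1bit_sketch(gf128_sketch *v)
{
	uint64_t carry = v->lo & 1;		/* bit that falls off the end */

	v->lo = (v->hi << 63) | (v->lo >> 1);	/* 128-bit right shift by 1 */
	v->hi >>= 1;
	if (carry)
		v->hi ^= 0xe100000000000000ULL;	/* fold in reduction poly */
}
#endif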
86 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
87 * should never be set to 8; 8 is effectively reserved for testing purposes.
88 * TABLE_BITS>1 selects the lookup-table-driven implementations referred
89 * to as "Shoup's" in the GCM specification; in other words, OpenSSL does
90 * not cover the whole spectrum of possible table-driven implementations.
91 * Why? In the non-"Shoup's" case the memory access pattern is segmented in
92 * such a way that cache-timing information can trivially reveal a fair
93 * portion of the intermediate hash value. Given that the ciphertext is
94 * always available to an attacker, the attacker could attempt to deduce
95 * the secret parameter H and, if successful, tamper with messages [which
96 * is utterly trivial in CTR mode]. In the "Shoup's" case this is not as
97 * easy, but there is no reason to believe it is resistant to cache-timing
98 * attacks either. The catch with the "8-bit" implementation is that it
99 * consumes 16 (sixteen) times more memory, 4KB per individual key + 1KB
100 * shared. On the plus side it should be about twice as fast as the
101 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit" version
102 * was observed to run ~75% faster, closer to 100% for commercial
103 * compilers... Yet the "4-bit" procedure is preferred, because it is
104 * believed to provide a better security/performance balance and adequate
105 * all-round performance. "All-round" refers to things like:
107 * - shorter setup time, which effectively improves overall timing for
108 *   handling short messages;
109 * - larger table allocations can become unbearable because of VM
110 *   subsystem penalties (for example, on Windows a large enough free
111 *   results in VM working-set trimming, meaning that a subsequent
112 *   malloc would immediately incur working-set expansion);
113 * - a larger table has a larger cache footprint, which can affect the
114 *   performance of other code paths (not necessarily even in the same
115 *   thread in a Hyper-Threading world);
117 * A value of 1 is not appropriate for performance reasons.
119 #if TABLE_BITS==8
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
123 int i, j;
124 u128 V;
126 Htable[0].hi = 0;
127 Htable[0].lo = 0;
128 V.hi = H[0];
129 V.lo = H[1];
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
132 REDUCE1BIT(V);
133 Htable[i] = V;
136 for (i=2; i<256; i<<=1) {
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
147 u128 Z = { 0, 0};
148 const u8 *xi = (const u8 *)Xi+15;
149 size_t rem, n = *xi;
150 static const size_t rem_8bit[256] = {
151 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216 while (1) {
217 Z.hi ^= Htable[n].hi;
218 Z.lo ^= Htable[n].lo;
220 if ((u8 *)Xi==xi) break;
222 n = *(--xi);
224 rem = (size_t)Z.lo&0xff;
225 Z.lo = (Z.hi<<56)|(Z.lo>>8);
226 Z.hi = (Z.hi>>8);
227 #if SIZE_MAX == 0xffffffffffffffff
228 Z.hi ^= rem_8bit[rem];
229 #else
230 Z.hi ^= (u64)rem_8bit[rem]<<32;
231 #endif
234 #if BYTE_ORDER == LITTLE_ENDIAN
235 #ifdef BSWAP8
236 Xi[0] = BSWAP8(Z.hi);
237 Xi[1] = BSWAP8(Z.lo);
238 #else
239 u8 *p = (u8 *)Xi;
240 u32 v;
241 v = (u32)(Z.hi>>32); PUTU32(p,v);
242 v = (u32)(Z.hi); PUTU32(p+4,v);
243 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
244 v = (u32)(Z.lo); PUTU32(p+12,v);
245 #endif
246 #else /* BIG_ENDIAN */
247 Xi[0] = Z.hi;
248 Xi[1] = Z.lo;
249 #endif
251 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253 #elif TABLE_BITS==4
255 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 u128 V;
258 #if defined(OPENSSL_SMALL_FOOTPRINT)
259 int i;
260 #endif
262 Htable[0].hi = 0;
263 Htable[0].lo = 0;
264 V.hi = H[0];
265 V.lo = H[1];
267 #if defined(OPENSSL_SMALL_FOOTPRINT)
268 for (Htable[8]=V, i=4; i>0; i>>=1) {
269 REDUCE1BIT(V);
270 Htable[i] = V;
273 for (i=2; i<16; i<<=1) {
274 u128 *Hi = Htable+i;
275 int j;
276 for (V=*Hi, j=1; j<i; ++j) {
277 Hi[j].hi = V.hi^Htable[j].hi;
278 Hi[j].lo = V.lo^Htable[j].lo;
281 #else
282 Htable[8] = V;
283 REDUCE1BIT(V);
284 Htable[4] = V;
285 REDUCE1BIT(V);
286 Htable[2] = V;
287 REDUCE1BIT(V);
288 Htable[1] = V;
289 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
290 V=Htable[4];
291 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
292 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
293 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
294 V=Htable[8];
295 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
296 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302 #endif
303 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305 * ARM assembler expects specific dword order in Htable.
308 int j;
309 #if BYTE_ORDER == LITTLE_ENDIAN
310 for (j=0;j<16;++j) {
311 V = Htable[j];
312 Htable[j].hi = V.lo;
313 Htable[j].lo = V.hi;
315 #else /* BIG_ENDIAN */
316 for (j=0;j<16;++j) {
317 V = Htable[j];
318 Htable[j].hi = V.lo<<32|V.lo>>32;
319 Htable[j].lo = V.hi<<32|V.hi>>32;
321 #endif
323 #endif
326 #ifndef GHASH_ASM
327 static const size_t rem_4bit[16] = {
328 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
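/*
 * rem_4bit[r] is the reduction contribution of the four coefficient bits r
 * that drop off the right end during one 4-bit shift, pre-shifted into the
 * top 16 bits by PACK().  One way the 16-bit constants fed to PACK() can be
 * derived, shown as a sketch assuming <stdint.h> (kept out of the build):
 */
#if 0
static void
rem_4bit_sketch(uint16_t tab[16])
{
	int r, j;

	for (r = 0; r < 16; r++) {
		uint16_t v = 0;

		for (j = 0; j < 4; j++)
			if (r & (1 << j))
				v ^= (uint16_t)(0xe100 >> (3 - j));
		tab[r] = v;	/* e.g. tab[1] == 0x1C20, tab[8] == 0xE100 */
	}
}
#endif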
333 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
335 u128 Z;
336 int cnt = 15;
337 size_t rem, nlo, nhi;
339 nlo = ((const u8 *)Xi)[15];
340 nhi = nlo>>4;
341 nlo &= 0xf;
343 Z.hi = Htable[nlo].hi;
344 Z.lo = Htable[nlo].lo;
346 while (1) {
347 rem = (size_t)Z.lo&0xf;
348 Z.lo = (Z.hi<<60)|(Z.lo>>4);
349 Z.hi = (Z.hi>>4);
350 #if SIZE_MAX == 0xffffffffffffffff
351 Z.hi ^= rem_4bit[rem];
352 #else
353 Z.hi ^= (u64)rem_4bit[rem]<<32;
354 #endif
355 Z.hi ^= Htable[nhi].hi;
356 Z.lo ^= Htable[nhi].lo;
358 if (--cnt<0) break;
360 nlo = ((const u8 *)Xi)[cnt];
361 nhi = nlo>>4;
362 nlo &= 0xf;
364 rem = (size_t)Z.lo&0xf;
365 Z.lo = (Z.hi<<60)|(Z.lo>>4);
366 Z.hi = (Z.hi>>4);
367 #if SIZE_MAX == 0xffffffffffffffff
368 Z.hi ^= rem_4bit[rem];
369 #else
370 Z.hi ^= (u64)rem_4bit[rem]<<32;
371 #endif
372 Z.hi ^= Htable[nlo].hi;
373 Z.lo ^= Htable[nlo].lo;
376 #if BYTE_ORDER == LITTLE_ENDIAN
377 #ifdef BSWAP8
378 Xi[0] = BSWAP8(Z.hi);
379 Xi[1] = BSWAP8(Z.lo);
380 #else
381 u8 *p = (u8 *)Xi;
382 u32 v;
383 v = (u32)(Z.hi>>32); PUTU32(p,v);
384 v = (u32)(Z.hi); PUTU32(p+4,v);
385 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
386 v = (u32)(Z.lo); PUTU32(p+12,v);
387 #endif
388 #else /* BIG_ENDIAN */
389 Xi[0] = Z.hi;
390 Xi[1] = Z.lo;
391 #endif
394 #if !defined(OPENSSL_SMALL_FOOTPRINT)
396 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
397 * details... Compiler-generated code doesn't seem to give any
398 * performance improvement, at least not on x86[_64]. It's here
399 * mostly as reference and a placeholder for possible future
400 * non-trivial optimization[s]...
402 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
403 const u8 *inp,size_t len)
405 u128 Z;
406 int cnt;
407 size_t rem, nlo, nhi;
409 #if 1
410 do {
411 cnt = 15;
412 nlo = ((const u8 *)Xi)[15];
413 nlo ^= inp[15];
414 nhi = nlo>>4;
415 nlo &= 0xf;
417 Z.hi = Htable[nlo].hi;
418 Z.lo = Htable[nlo].lo;
420 while (1) {
421 rem = (size_t)Z.lo&0xf;
422 Z.lo = (Z.hi<<60)|(Z.lo>>4);
423 Z.hi = (Z.hi>>4);
424 #if SIZE_MAX == 0xffffffffffffffff
425 Z.hi ^= rem_4bit[rem];
426 #else
427 Z.hi ^= (u64)rem_4bit[rem]<<32;
428 #endif
429 Z.hi ^= Htable[nhi].hi;
430 Z.lo ^= Htable[nhi].lo;
432 if (--cnt<0) break;
434 nlo = ((const u8 *)Xi)[cnt];
435 nlo ^= inp[cnt];
436 nhi = nlo>>4;
437 nlo &= 0xf;
439 rem = (size_t)Z.lo&0xf;
440 Z.lo = (Z.hi<<60)|(Z.lo>>4);
441 Z.hi = (Z.hi>>4);
442 #if SIZE_MAX == 0xffffffffffffffff
443 Z.hi ^= rem_4bit[rem];
444 #else
445 Z.hi ^= (u64)rem_4bit[rem]<<32;
446 #endif
447 Z.hi ^= Htable[nlo].hi;
448 Z.lo ^= Htable[nlo].lo;
450 #else
452 * Extra 256+16 bytes per-key plus 512 bytes shared tables
453 * [should] give ~50% improvement... One could have PACK()-ed
454 * the rem_8bit even here, but the priority is to minimize
455 * cache footprint...
457 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
458 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
459 static const unsigned short rem_8bit[256] = {
460 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
461 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
462 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
463 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
464 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
465 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
466 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
467 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
468 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
469 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
470 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
471 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
472 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
473 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
474 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
475 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
476 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
477 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
478 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
479 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
480 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
481 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
482 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
483 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
484 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
485 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
486 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
487 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
488 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
489 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
490 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
491 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
493 * This pre-processing phase slows the procedure down by approximately
494 * the same amount of time as it speeds up each loop iteration. In other
495 * words, single-block performance is approximately the same as with the
496 * straightforward "4-bit" implementation, and from there it only gets faster...
498 for (cnt=0; cnt<16; ++cnt) {
499 Z.hi = Htable[cnt].hi;
500 Z.lo = Htable[cnt].lo;
501 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
502 Hshr4[cnt].hi = (Z.hi>>4);
503 Hshl4[cnt] = (u8)(Z.lo<<4);
506 do {
507 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
508 nlo = ((const u8 *)Xi)[cnt];
509 nlo ^= inp[cnt];
510 nhi = nlo>>4;
511 nlo &= 0xf;
513 Z.hi ^= Htable[nlo].hi;
514 Z.lo ^= Htable[nlo].lo;
516 rem = (size_t)Z.lo&0xff;
518 Z.lo = (Z.hi<<56)|(Z.lo>>8);
519 Z.hi = (Z.hi>>8);
521 Z.hi ^= Hshr4[nhi].hi;
522 Z.lo ^= Hshr4[nhi].lo;
523 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
526 nlo = ((const u8 *)Xi)[0];
527 nlo ^= inp[0];
528 nhi = nlo>>4;
529 nlo &= 0xf;
531 Z.hi ^= Htable[nlo].hi;
532 Z.lo ^= Htable[nlo].lo;
534 rem = (size_t)Z.lo&0xf;
536 Z.lo = (Z.hi<<60)|(Z.lo>>4);
537 Z.hi = (Z.hi>>4);
539 Z.hi ^= Htable[nhi].hi;
540 Z.lo ^= Htable[nhi].lo;
541 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
542 #endif
544 #if BYTE_ORDER == LITTLE_ENDIAN
545 #ifdef BSWAP8
546 Xi[0] = BSWAP8(Z.hi);
547 Xi[1] = BSWAP8(Z.lo);
548 #else
549 u8 *p = (u8 *)Xi;
550 u32 v;
551 v = (u32)(Z.hi>>32); PUTU32(p,v);
552 v = (u32)(Z.hi); PUTU32(p+4,v);
553 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
554 v = (u32)(Z.lo); PUTU32(p+12,v);
555 #endif
556 #else /* BIG_ENDIAN */
557 Xi[0] = Z.hi;
558 Xi[1] = Z.lo;
559 #endif
560 } while (inp+=16, len-=16);
562 #endif
563 #else
564 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
565 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
566 #endif
568 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
569 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
570 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
571 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache-
572 * thrashing effect. In other words, the idea is to hash data while it is
573 * still in the L1 cache after the encryption pass... */
574 #define GHASH_CHUNK (3*1024)
575 #endif
577 #else /* TABLE_BITS */
579 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
581 u128 V,Z = { 0,0 };
582 long X;
583 int i,j;
584 const long *xi = (const long *)Xi;
586 V.hi = H[0]; /* H is in host byte order, no byte swapping */
587 V.lo = H[1];
589 for (j=0; j<16/sizeof(long); ++j) {
590 #if BYTE_ORDER == LITTLE_ENDIAN
591 #if SIZE_MAX == 0xffffffffffffffff
592 #ifdef BSWAP8
593 X = (long)(BSWAP8(xi[j]));
594 #else
595 const u8 *p = (const u8 *)(xi+j);
596 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
597 #endif
598 #else
599 const u8 *p = (const u8 *)(xi+j);
600 X = (long)GETU32(p);
601 #endif
602 #else /* BIG_ENDIAN */
603 X = xi[j];
604 #endif
606 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
607 u64 M = (u64)(X>>(8*sizeof(long)-1));
608 Z.hi ^= V.hi&M;
609 Z.lo ^= V.lo&M;
611 REDUCE1BIT(V);
615 #if BYTE_ORDER == LITTLE_ENDIAN
616 #ifdef BSWAP8
617 Xi[0] = BSWAP8(Z.hi);
618 Xi[1] = BSWAP8(Z.lo);
619 #else
620 u8 *p = (u8 *)Xi;
621 u32 v;
622 v = (u32)(Z.hi>>32); PUTU32(p,v);
623 v = (u32)(Z.hi); PUTU32(p+4,v);
624 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
625 v = (u32)(Z.lo); PUTU32(p+12,v);
626 #endif
627 #else /* BIG_ENDIAN */
628 Xi[0] = Z.hi;
629 Xi[1] = Z.lo;
630 #endif
632 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
634 #endif
636 #if defined(GHASH_ASM) && \
637 (defined(__i386) || defined(__i386__) || \
638 defined(__x86_64) || defined(__x86_64__) || \
639 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
640 #include "x86_arch.h"
641 #endif
643 #if TABLE_BITS==4 && defined(GHASH_ASM)
644 # if (defined(__i386) || defined(__i386__) || \
645 defined(__x86_64) || defined(__x86_64__) || \
646 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
647 # define GHASH_ASM_X86_OR_64
648 # define GCM_FUNCREF_4BIT
650 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
651 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
652 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
654 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
655 # define GHASH_ASM_X86
656 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
657 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
659 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
660 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 # endif
662 # elif defined(__arm__) || defined(__arm)
663 # include "arm_arch.h"
664 # if __ARM_ARCH__>=7
665 # define GHASH_ASM_ARM
666 # define GCM_FUNCREF_4BIT
667 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
668 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
669 # endif
670 # endif
671 #endif
673 #ifdef GCM_FUNCREF_4BIT
674 # undef GCM_MUL
675 # define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
676 # ifdef GHASH
677 # undef GHASH
678 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
679 # endif
680 #endif
682 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
684 memset(ctx,0,sizeof(*ctx));
685 ctx->block = block;
686 ctx->key = key;
688 (*block)(ctx->H.c,ctx->H.c,key);
690 #if BYTE_ORDER == LITTLE_ENDIAN
691 /* H is stored in host byte order */
692 #ifdef BSWAP8
693 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
694 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
695 #else
696 u8 *p = ctx->H.c;
697 u64 hi,lo;
698 hi = (u64)GETU32(p) <<32|GETU32(p+4);
699 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
700 ctx->H.u[0] = hi;
701 ctx->H.u[1] = lo;
702 #endif
703 #endif
705 #if TABLE_BITS==8
706 gcm_init_8bit(ctx->Htable,ctx->H.u);
707 #elif TABLE_BITS==4
708 # if defined(GHASH_ASM_X86_OR_64)
709 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
710 /* check FXSR and PCLMULQDQ bits */
711 if ((OPENSSL_cpu_caps() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
712 (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
713 gcm_init_clmul(ctx->Htable,ctx->H.u);
714 ctx->gmult = gcm_gmult_clmul;
715 ctx->ghash = gcm_ghash_clmul;
716 return;
718 # endif
719 gcm_init_4bit(ctx->Htable,ctx->H.u);
720 # if defined(GHASH_ASM_X86) /* x86 only */
721 # if defined(OPENSSL_IA32_SSE2)
722 if (OPENSSL_cpu_caps() & CPUCAP_MASK_SSE) { /* check SSE bit */
723 # else
724 if (OPENSSL_cpu_caps() & CPUCAP_MASK_MMX) { /* check MMX bit */
725 # endif
726 ctx->gmult = gcm_gmult_4bit_mmx;
727 ctx->ghash = gcm_ghash_4bit_mmx;
728 } else {
729 ctx->gmult = gcm_gmult_4bit_x86;
730 ctx->ghash = gcm_ghash_4bit_x86;
732 # else
733 ctx->gmult = gcm_gmult_4bit;
734 ctx->ghash = gcm_ghash_4bit;
735 # endif
736 # elif defined(GHASH_ASM_ARM)
737 if (OPENSSL_armcap_P & ARMV7_NEON) {
738 ctx->gmult = gcm_gmult_neon;
739 ctx->ghash = gcm_ghash_neon;
740 } else {
741 gcm_init_4bit(ctx->Htable,ctx->H.u);
742 ctx->gmult = gcm_gmult_4bit;
743 ctx->ghash = gcm_ghash_4bit;
745 # else
746 gcm_init_4bit(ctx->Htable,ctx->H.u);
747 # endif
748 #endif
751 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
753 unsigned int ctr;
754 #ifdef GCM_FUNCREF_4BIT
755 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
756 #endif
758 ctx->Yi.u[0] = 0;
759 ctx->Yi.u[1] = 0;
760 ctx->Xi.u[0] = 0;
761 ctx->Xi.u[1] = 0;
762 ctx->len.u[0] = 0; /* AAD length */
763 ctx->len.u[1] = 0; /* message length */
764 ctx->ares = 0;
765 ctx->mres = 0;
767 if (len==12) {
768 memcpy(ctx->Yi.c,iv,12);
769 ctx->Yi.c[15]=1;
770 ctr=1;
772 else {
773 size_t i;
774 u64 len0 = len;
776 while (len>=16) {
777 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
778 GCM_MUL(ctx,Yi);
779 iv += 16;
780 len -= 16;
782 if (len) {
783 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
784 GCM_MUL(ctx,Yi);
786 len0 <<= 3;
787 #if BYTE_ORDER == LITTLE_ENDIAN
788 #ifdef BSWAP8
789 ctx->Yi.u[1] ^= BSWAP8(len0);
790 #else
791 ctx->Yi.c[8] ^= (u8)(len0>>56);
792 ctx->Yi.c[9] ^= (u8)(len0>>48);
793 ctx->Yi.c[10] ^= (u8)(len0>>40);
794 ctx->Yi.c[11] ^= (u8)(len0>>32);
795 ctx->Yi.c[12] ^= (u8)(len0>>24);
796 ctx->Yi.c[13] ^= (u8)(len0>>16);
797 ctx->Yi.c[14] ^= (u8)(len0>>8);
798 ctx->Yi.c[15] ^= (u8)(len0);
799 #endif
800 #else /* BIG_ENDIAN */
801 ctx->Yi.u[1] ^= len0;
802 #endif
804 GCM_MUL(ctx,Yi);
806 #if BYTE_ORDER == LITTLE_ENDIAN
807 #ifdef BSWAP4
808 ctr = BSWAP4(ctx->Yi.d[3]);
809 #else
810 ctr = GETU32(ctx->Yi.c+12);
811 #endif
812 #else /* BIG_ENDIAN */
813 ctr = ctx->Yi.d[3];
814 #endif
817 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
818 ++ctr;
819 #if BYTE_ORDER == LITTLE_ENDIAN
820 #ifdef BSWAP4
821 ctx->Yi.d[3] = BSWAP4(ctr);
822 #else
823 PUTU32(ctx->Yi.c+12,ctr);
824 #endif
825 #else /* BIG_ENDIAN */
826 ctx->Yi.d[3] = ctr;
827 #endif
830 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
832 size_t i;
833 unsigned int n;
834 u64 alen = ctx->len.u[0];
835 #ifdef GCM_FUNCREF_4BIT
836 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
837 # ifdef GHASH
838 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
839 const u8 *inp,size_t len) = ctx->ghash;
840 # endif
841 #endif
843 if (ctx->len.u[1]) return -2;
845 alen += len;
846 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
847 return -1;
848 ctx->len.u[0] = alen;
850 n = ctx->ares;
851 if (n) {
852 while (n && len) {
853 ctx->Xi.c[n] ^= *(aad++);
854 --len;
855 n = (n+1)%16;
857 if (n==0) GCM_MUL(ctx,Xi);
858 else {
859 ctx->ares = n;
860 return 0;
864 #ifdef GHASH
865 if ((i = (len&(size_t)-16))) {
866 GHASH(ctx,aad,i);
867 aad += i;
868 len -= i;
870 #else
871 while (len>=16) {
872 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
873 GCM_MUL(ctx,Xi);
874 aad += 16;
875 len -= 16;
877 #endif
878 if (len) {
879 n = (unsigned int)len;
880 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
883 ctx->ares = n;
884 return 0;
887 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
888 const unsigned char *in, unsigned char *out,
889 size_t len)
891 unsigned int n, ctr;
892 size_t i;
893 u64 mlen = ctx->len.u[1];
894 block128_f block = ctx->block;
895 void *key = ctx->key;
896 #ifdef GCM_FUNCREF_4BIT
897 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
898 # ifdef GHASH
899 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
900 const u8 *inp,size_t len) = ctx->ghash;
901 # endif
902 #endif
904 mlen += len;
905 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
906 return -1;
907 ctx->len.u[1] = mlen;
909 if (ctx->ares) {
910 /* First call to encrypt finalizes GHASH(AAD) */
911 GCM_MUL(ctx,Xi);
912 ctx->ares = 0;
915 #if BYTE_ORDER == LITTLE_ENDIAN
916 #ifdef BSWAP4
917 ctr = BSWAP4(ctx->Yi.d[3]);
918 #else
919 ctr = GETU32(ctx->Yi.c+12);
920 #endif
921 #else /* BIG_ENDIAN */
922 ctr = ctx->Yi.d[3];
923 #endif
925 n = ctx->mres;
926 #if !defined(OPENSSL_SMALL_FOOTPRINT)
927 if (16%sizeof(size_t) == 0) do { /* always true actually */
928 if (n) {
929 while (n && len) {
930 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
931 --len;
932 n = (n+1)%16;
934 if (n==0) GCM_MUL(ctx,Xi);
935 else {
936 ctx->mres = n;
937 return 0;
940 #ifdef __STRICT_ALIGNMENT
941 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
942 break;
943 #endif
944 #if defined(GHASH) && defined(GHASH_CHUNK)
945 while (len>=GHASH_CHUNK) {
946 size_t j=GHASH_CHUNK;
948 while (j) {
949 size_t *out_t=(size_t *)out;
950 const size_t *in_t=(const size_t *)in;
952 (*block)(ctx->Yi.c,ctx->EKi.c,key);
953 ++ctr;
954 #if BYTE_ORDER == LITTLE_ENDIAN
955 #ifdef BSWAP4
956 ctx->Yi.d[3] = BSWAP4(ctr);
957 #else
958 PUTU32(ctx->Yi.c+12,ctr);
959 #endif
960 #else /* BIG_ENDIAN */
961 ctx->Yi.d[3] = ctr;
962 #endif
963 for (i=0; i<16/sizeof(size_t); ++i)
964 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
965 out += 16;
966 in += 16;
967 j -= 16;
969 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
970 len -= GHASH_CHUNK;
972 if ((i = (len&(size_t)-16))) {
973 size_t j=i;
975 while (len>=16) {
976 size_t *out_t=(size_t *)out;
977 const size_t *in_t=(const size_t *)in;
979 (*block)(ctx->Yi.c,ctx->EKi.c,key);
980 ++ctr;
981 #if BYTE_ORDER == LITTLE_ENDIAN
982 #ifdef BSWAP4
983 ctx->Yi.d[3] = BSWAP4(ctr);
984 #else
985 PUTU32(ctx->Yi.c+12,ctr);
986 #endif
987 #else /* BIG_ENDIAN */
988 ctx->Yi.d[3] = ctr;
989 #endif
990 for (i=0; i<16/sizeof(size_t); ++i)
991 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
992 out += 16;
993 in += 16;
994 len -= 16;
996 GHASH(ctx,out-j,j);
998 #else
999 while (len>=16) {
1000 size_t *out_t=(size_t *)out;
1001 const size_t *in_t=(const size_t *)in;
1003 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1004 ++ctr;
1005 #if BYTE_ORDER == LITTLE_ENDIAN
1006 #ifdef BSWAP4
1007 ctx->Yi.d[3] = BSWAP4(ctr);
1008 #else
1009 PUTU32(ctx->Yi.c+12,ctr);
1010 #endif
1011 #else /* BIG_ENDIAN */
1012 ctx->Yi.d[3] = ctr;
1013 #endif
1014 for (i=0; i<16/sizeof(size_t); ++i)
1015 ctx->Xi.t[i] ^=
1016 out_t[i] = in_t[i]^ctx->EKi.t[i];
1017 GCM_MUL(ctx,Xi);
1018 out += 16;
1019 in += 16;
1020 len -= 16;
1022 #endif
1023 if (len) {
1024 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1025 ++ctr;
1026 #if BYTE_ORDER == LITTLE_ENDIAN
1027 #ifdef BSWAP4
1028 ctx->Yi.d[3] = BSWAP4(ctr);
1029 #else
1030 PUTU32(ctx->Yi.c+12,ctr);
1031 #endif
1032 #else /* BIG_ENDIAN */
1033 ctx->Yi.d[3] = ctr;
1034 #endif
1035 while (len--) {
1036 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1037 ++n;
1041 ctx->mres = n;
1042 return 0;
1043 } while(0);
1044 #endif
1045 for (i=0;i<len;++i) {
1046 if (n==0) {
1047 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1048 ++ctr;
1049 #if BYTE_ORDER == LITTLE_ENDIAN
1050 #ifdef BSWAP4
1051 ctx->Yi.d[3] = BSWAP4(ctr);
1052 #else
1053 PUTU32(ctx->Yi.c+12,ctr);
1054 #endif
1055 #else /* BIG_ENDIAN */
1056 ctx->Yi.d[3] = ctr;
1057 #endif
1059 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1060 n = (n+1)%16;
1061 if (n==0)
1062 GCM_MUL(ctx,Xi);
1065 ctx->mres = n;
1066 return 0;
1069 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1070 const unsigned char *in, unsigned char *out,
1071 size_t len)
1073 unsigned int n, ctr;
1074 size_t i;
1075 u64 mlen = ctx->len.u[1];
1076 block128_f block = ctx->block;
1077 void *key = ctx->key;
1078 #ifdef GCM_FUNCREF_4BIT
1079 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1080 # ifdef GHASH
1081 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1082 const u8 *inp,size_t len) = ctx->ghash;
1083 # endif
1084 #endif
1086 mlen += len;
1087 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1088 return -1;
1089 ctx->len.u[1] = mlen;
1091 if (ctx->ares) {
1092 /* First call to decrypt finalizes GHASH(AAD) */
1093 GCM_MUL(ctx,Xi);
1094 ctx->ares = 0;
1097 #if BYTE_ORDER == LITTLE_ENDIAN
1098 #ifdef BSWAP4
1099 ctr = BSWAP4(ctx->Yi.d[3]);
1100 #else
1101 ctr = GETU32(ctx->Yi.c+12);
1102 #endif
1103 #else /* BIG_ENDIAN */
1104 ctr = ctx->Yi.d[3];
1105 #endif
1107 n = ctx->mres;
1108 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1109 if (16%sizeof(size_t) == 0) do { /* always true actually */
1110 if (n) {
1111 while (n && len) {
1112 u8 c = *(in++);
1113 *(out++) = c^ctx->EKi.c[n];
1114 ctx->Xi.c[n] ^= c;
1115 --len;
1116 n = (n+1)%16;
1118 if (n==0) GCM_MUL (ctx,Xi);
1119 else {
1120 ctx->mres = n;
1121 return 0;
1124 #ifdef __STRICT_ALIGNMENT
1125 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1126 break;
1127 #endif
1128 #if defined(GHASH) && defined(GHASH_CHUNK)
1129 while (len>=GHASH_CHUNK) {
1130 size_t j=GHASH_CHUNK;
1132 GHASH(ctx,in,GHASH_CHUNK);
1133 while (j) {
1134 size_t *out_t=(size_t *)out;
1135 const size_t *in_t=(const size_t *)in;
1137 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1138 ++ctr;
1139 #if BYTE_ORDER == LITTLE_ENDIAN
1140 #ifdef BSWAP4
1141 ctx->Yi.d[3] = BSWAP4(ctr);
1142 #else
1143 PUTU32(ctx->Yi.c+12,ctr);
1144 #endif
1145 #else /* BIG_ENDIAN */
1146 ctx->Yi.d[3] = ctr;
1147 #endif
1148 for (i=0; i<16/sizeof(size_t); ++i)
1149 out_t[i] = in_t[i]^ctx->EKi.t[i];
1150 out += 16;
1151 in += 16;
1152 j -= 16;
1154 len -= GHASH_CHUNK;
1156 if ((i = (len&(size_t)-16))) {
1157 GHASH(ctx,in,i);
1158 while (len>=16) {
1159 size_t *out_t=(size_t *)out;
1160 const size_t *in_t=(const size_t *)in;
1162 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1163 ++ctr;
1164 #if BYTE_ORDER == LITTLE_ENDIAN
1165 #ifdef BSWAP4
1166 ctx->Yi.d[3] = BSWAP4(ctr);
1167 #else
1168 PUTU32(ctx->Yi.c+12,ctr);
1169 #endif
1170 #else /* BIG_ENDIAN */
1171 ctx->Yi.d[3] = ctr;
1172 #endif
1173 for (i=0; i<16/sizeof(size_t); ++i)
1174 out_t[i] = in_t[i]^ctx->EKi.t[i];
1175 out += 16;
1176 in += 16;
1177 len -= 16;
1180 #else
1181 while (len>=16) {
1182 size_t *out_t=(size_t *)out;
1183 const size_t *in_t=(const size_t *)in;
1185 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1186 ++ctr;
1187 #if BYTE_ORDER == LITTLE_ENDIAN
1188 #ifdef BSWAP4
1189 ctx->Yi.d[3] = BSWAP4(ctr);
1190 #else
1191 PUTU32(ctx->Yi.c+12,ctr);
1192 #endif
1193 #else /* BIG_ENDIAN */
1194 ctx->Yi.d[3] = ctr;
1195 #endif
1196 for (i=0; i<16/sizeof(size_t); ++i) {
1197 size_t c = in[i];
1198 out[i] = c^ctx->EKi.t[i];
1199 ctx->Xi.t[i] ^= c;
1201 GCM_MUL(ctx,Xi);
1202 out += 16;
1203 in += 16;
1204 len -= 16;
1206 #endif
1207 if (len) {
1208 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1209 ++ctr;
1210 #if BYTE_ORDER == LITTLE_ENDIAN
1211 #ifdef BSWAP4
1212 ctx->Yi.d[3] = BSWAP4(ctr);
1213 #else
1214 PUTU32(ctx->Yi.c+12,ctr);
1215 #endif
1216 #else /* BIG_ENDIAN */
1217 ctx->Yi.d[3] = ctr;
1218 #endif
1219 while (len--) {
1220 u8 c = in[n];
1221 ctx->Xi.c[n] ^= c;
1222 out[n] = c^ctx->EKi.c[n];
1223 ++n;
1227 ctx->mres = n;
1228 return 0;
1229 } while(0);
1230 #endif
1231 for (i=0;i<len;++i) {
1232 u8 c;
1233 if (n==0) {
1234 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1235 ++ctr;
1236 #if BYTE_ORDER == LITTLE_ENDIAN
1237 #ifdef BSWAP4
1238 ctx->Yi.d[3] = BSWAP4(ctr);
1239 #else
1240 PUTU32(ctx->Yi.c+12,ctr);
1241 #endif
1242 #else /* BIG_ENDIAN */
1243 ctx->Yi.d[3] = ctr;
1244 #endif
1246 c = in[i];
1247 out[i] = c^ctx->EKi.c[n];
1248 ctx->Xi.c[n] ^= c;
1249 n = (n+1)%16;
1250 if (n==0)
1251 GCM_MUL(ctx,Xi);
1254 ctx->mres = n;
1255 return 0;
1258 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1259 const unsigned char *in, unsigned char *out,
1260 size_t len, ctr128_f stream)
1262 unsigned int n, ctr;
1263 size_t i;
1264 u64 mlen = ctx->len.u[1];
1265 void *key = ctx->key;
1266 #ifdef GCM_FUNCREF_4BIT
1267 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1268 # ifdef GHASH
1269 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1270 const u8 *inp,size_t len) = ctx->ghash;
1271 # endif
1272 #endif
1274 mlen += len;
1275 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1276 return -1;
1277 ctx->len.u[1] = mlen;
1279 if (ctx->ares) {
1280 /* First call to encrypt finalizes GHASH(AAD) */
1281 GCM_MUL(ctx,Xi);
1282 ctx->ares = 0;
1285 #if BYTE_ORDER == LITTLE_ENDIAN
1286 #ifdef BSWAP4
1287 ctr = BSWAP4(ctx->Yi.d[3]);
1288 #else
1289 ctr = GETU32(ctx->Yi.c+12);
1290 #endif
1291 #else /* BIG_ENDIAN */
1292 ctr = ctx->Yi.d[3];
1293 #endif
1295 n = ctx->mres;
1296 if (n) {
1297 while (n && len) {
1298 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1299 --len;
1300 n = (n+1)%16;
1302 if (n==0) GCM_MUL(ctx,Xi);
1303 else {
1304 ctx->mres = n;
1305 return 0;
1308 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1309 while (len>=GHASH_CHUNK) {
1310 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1311 ctr += GHASH_CHUNK/16;
1312 #if BYTE_ORDER == LITTLE_ENDIAN
1313 #ifdef BSWAP4
1314 ctx->Yi.d[3] = BSWAP4(ctr);
1315 #else
1316 PUTU32(ctx->Yi.c+12,ctr);
1317 #endif
1318 #else /* BIG_ENDIAN */
1319 ctx->Yi.d[3] = ctr;
1320 #endif
1321 GHASH(ctx,out,GHASH_CHUNK);
1322 out += GHASH_CHUNK;
1323 in += GHASH_CHUNK;
1324 len -= GHASH_CHUNK;
1326 #endif
1327 if ((i = (len&(size_t)-16))) {
1328 size_t j=i/16;
1330 (*stream)(in,out,j,key,ctx->Yi.c);
1331 ctr += (unsigned int)j;
1332 #if BYTE_ORDER == LITTLE_ENDIAN
1333 #ifdef BSWAP4
1334 ctx->Yi.d[3] = BSWAP4(ctr);
1335 #else
1336 PUTU32(ctx->Yi.c+12,ctr);
1337 #endif
1338 #else /* BIG_ENDIAN */
1339 ctx->Yi.d[3] = ctr;
1340 #endif
1341 in += i;
1342 len -= i;
1343 #if defined(GHASH)
1344 GHASH(ctx,out,i);
1345 out += i;
1346 #else
1347 while (j--) {
1348 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1349 GCM_MUL(ctx,Xi);
1350 out += 16;
1352 #endif
1354 if (len) {
1355 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1356 ++ctr;
1357 #if BYTE_ORDER == LITTLE_ENDIAN
1358 #ifdef BSWAP4
1359 ctx->Yi.d[3] = BSWAP4(ctr);
1360 #else
1361 PUTU32(ctx->Yi.c+12,ctr);
1362 #endif
1363 #else /* BIG_ENDIAN */
1364 ctx->Yi.d[3] = ctr;
1365 #endif
1366 while (len--) {
1367 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1368 ++n;
1372 ctx->mres = n;
1373 return 0;
1376 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1377 const unsigned char *in, unsigned char *out,
1378 size_t len,ctr128_f stream)
1380 unsigned int n, ctr;
1381 size_t i;
1382 u64 mlen = ctx->len.u[1];
1383 void *key = ctx->key;
1384 #ifdef GCM_FUNCREF_4BIT
1385 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1386 # ifdef GHASH
1387 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1388 const u8 *inp,size_t len) = ctx->ghash;
1389 # endif
1390 #endif
1392 mlen += len;
1393 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1394 return -1;
1395 ctx->len.u[1] = mlen;
1397 if (ctx->ares) {
1398 /* First call to decrypt finalizes GHASH(AAD) */
1399 GCM_MUL(ctx,Xi);
1400 ctx->ares = 0;
1403 #if BYTE_ORDER == LITTLE_ENDIAN
1404 #ifdef BSWAP4
1405 ctr = BSWAP4(ctx->Yi.d[3]);
1406 #else
1407 ctr = GETU32(ctx->Yi.c+12);
1408 #endif
1409 #else /* BIG_ENDIAN */
1410 ctr = ctx->Yi.d[3];
1411 #endif
1413 n = ctx->mres;
1414 if (n) {
1415 while (n && len) {
1416 u8 c = *(in++);
1417 *(out++) = c^ctx->EKi.c[n];
1418 ctx->Xi.c[n] ^= c;
1419 --len;
1420 n = (n+1)%16;
1422 if (n==0) GCM_MUL (ctx,Xi);
1423 else {
1424 ctx->mres = n;
1425 return 0;
1428 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1429 while (len>=GHASH_CHUNK) {
1430 GHASH(ctx,in,GHASH_CHUNK);
1431 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1432 ctr += GHASH_CHUNK/16;
1433 #if BYTE_ORDER == LITTLE_ENDIAN
1434 #ifdef BSWAP4
1435 ctx->Yi.d[3] = BSWAP4(ctr);
1436 #else
1437 PUTU32(ctx->Yi.c+12,ctr);
1438 #endif
1439 #else /* BIG_ENDIAN */
1440 ctx->Yi.d[3] = ctr;
1441 #endif
1442 out += GHASH_CHUNK;
1443 in += GHASH_CHUNK;
1444 len -= GHASH_CHUNK;
1446 #endif
1447 if ((i = (len&(size_t)-16))) {
1448 size_t j=i/16;
1450 #if defined(GHASH)
1451 GHASH(ctx,in,i);
1452 #else
1453 while (j--) {
1454 size_t k;
1455 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1456 GCM_MUL(ctx,Xi);
1457 in += 16;
1459 j = i/16;
1460 in -= i;
1461 #endif
1462 (*stream)(in,out,j,key,ctx->Yi.c);
1463 ctr += (unsigned int)j;
1464 #if BYTE_ORDER == LITTLE_ENDIAN
1465 #ifdef BSWAP4
1466 ctx->Yi.d[3] = BSWAP4(ctr);
1467 #else
1468 PUTU32(ctx->Yi.c+12,ctr);
1469 #endif
1470 #else /* BIG_ENDIAN */
1471 ctx->Yi.d[3] = ctr;
1472 #endif
1473 out += i;
1474 in += i;
1475 len -= i;
1477 if (len) {
1478 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1479 ++ctr;
1480 #if BYTE_ORDER == LITTLE_ENDIAN
1481 #ifdef BSWAP4
1482 ctx->Yi.d[3] = BSWAP4(ctr);
1483 #else
1484 PUTU32(ctx->Yi.c+12,ctr);
1485 #endif
1486 #else /* BIG_ENDIAN */
1487 ctx->Yi.d[3] = ctr;
1488 #endif
1489 while (len--) {
1490 u8 c = in[n];
1491 ctx->Xi.c[n] ^= c;
1492 out[n] = c^ctx->EKi.c[n];
1493 ++n;
1497 ctx->mres = n;
1498 return 0;
1501 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1502 size_t len)
1504 u64 alen = ctx->len.u[0]<<3;
1505 u64 clen = ctx->len.u[1]<<3;
1506 #ifdef GCM_FUNCREF_4BIT
1507 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1508 #endif
1510 if (ctx->mres || ctx->ares)
1511 GCM_MUL(ctx,Xi);
1513 #if BYTE_ORDER == LITTLE_ENDIAN
1514 #ifdef BSWAP8
1515 alen = BSWAP8(alen);
1516 clen = BSWAP8(clen);
1517 #else
1518 u8 *p = ctx->len.c;
1520 ctx->len.u[0] = alen;
1521 ctx->len.u[1] = clen;
1523 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1524 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1525 #endif
1526 #endif
1528 ctx->Xi.u[0] ^= alen;
1529 ctx->Xi.u[1] ^= clen;
1530 GCM_MUL(ctx,Xi);
1532 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1533 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1535 if (tag && len<=sizeof(ctx->Xi))
1536 return memcmp(ctx->Xi.c,tag,len);
1537 else
1538 return -1;
1541 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1543 CRYPTO_gcm128_finish(ctx, NULL, 0);
1544 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1547 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1549 GCM128_CONTEXT *ret;
1551 if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1552 CRYPTO_gcm128_init(ret,key,block);
1554 return ret;
1557 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1559 freezero(ctx, sizeof(*ctx));
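/*
 * Typical call sequence for the API above: init (or new), setiv, aad,
 * encrypt/decrypt, then tag or finish, and finally release.  A minimal
 * sketch for AES-128-GCM encryption, assuming AES_set_encrypt_key() and
 * AES_encrypt() from <openssl/aes.h> as the underlying block cipher (kept
 * out of the build; error handling trimmed).  For decryption, use
 * CRYPTO_gcm128_decrypt() and check CRYPTO_gcm128_finish(&ctx, tag, 16) == 0
 * instead of calling CRYPTO_gcm128_tag().
 */
#if 0
#include <openssl/aes.h>

static int
gcm128_encrypt_sketch(const unsigned char key[16], const unsigned char iv[12],
    const unsigned char *aad, size_t aadlen,
    const unsigned char *in, unsigned char *out, size_t len,
    unsigned char tag[16])
{
	AES_KEY ks;
	GCM128_CONTEXT ctx;

	if (AES_set_encrypt_key(key, 128, &ks) != 0)
		return -1;
	CRYPTO_gcm128_init(&ctx, &ks, (block128_f)AES_encrypt);
	CRYPTO_gcm128_setiv(&ctx, iv, 12);
	if (aadlen && CRYPTO_gcm128_aad(&ctx, aad, aadlen) != 0)
		return -1;
	if (CRYPTO_gcm128_encrypt(&ctx, in, out, len) != 0)
		return -1;
	CRYPTO_gcm128_tag(&ctx, tag, 16);	/* 16-byte authentication tag */
	explicit_bzero(&ks, sizeof(ks));
	explicit_bzero(&ctx, sizeof(ctx));
	return 0;
}
#endif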