/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
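/*
 * <wmmintrin.h> supplies the AES-NI and PCLMULQDQ intrinsics;
 * <emmintrin.h> and <smmintrin.h> supply the SSE2/SSE4.1 helpers used below.
 *
 * m128icmp() compares two 128-bit values for equality: _mm_movemask_epi8()
 * returns 0xffff only when every byte of the lane-wise comparison matched.
 */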
static int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}
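/*
 * _mm_insert_epi64() is only available as a compiler intrinsic on 64-bit
 * targets, so a replacement built from two 32-bit inserts is provided here,
 * presumably for the i386 build; ndx selects the low (0) or high (1)
 * 64-bit half of the vector.
 */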
#ifdef __i386__
static __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif
/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
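/*
 * gfmul() multiplies two 128-bit field elements with four PCLMULQDQ
 * carry-less multiplies, then reduces the 256-bit product modulo the GCM
 * polynomial x^128 + x^7 + x^2 + x + 1.  The shift-left-by-one step
 * accounts for GCM's bit-reflected representation of field elements.
 */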
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	/* 128x128 -> 256 bit carry-less multiply (four PCLMULQDQs) */
	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	/* fold the two middle products into the low/high halves */
	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	/* shift the 256-bit result left by one (bit-reflected convention) */
	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	/* reduce modulo x^128 + x^7 + x^2 + x + 1 */
	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
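/*
 * reduce4() folds four blocks into the GHASH state at once: it computes
 * X1*H1 ^ X2*H2 ^ X3*H3 ^ X4*H4, where H1..H4 are increasing powers of the
 * hash key.  The middle Karatsuba terms come from the
 * _mm_shuffle_epi32(..., 78) half-swaps, and a single reduction modulo the
 * GCM polynomial is done for the combined 256-bit result.
 */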
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	/* low halves of the four products */
	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	/* high halves of the four products */
	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	/* Karatsuba middle terms: (Hi.lo ^ Hi.hi) * (Xi.lo ^ Xi.hi) */
	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	/* scatter the combined middle term across the low/high halves */
	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	/* single reduction of the aggregated 256-bit result (as in gfmul) */
	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Reduction Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), i.e. at most
 * 2^36-32 bytes (2^32-2 16-byte blocks).
 */
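/*
 * Parameter conventions, as used by the code below: in and out point to
 * nbytes of plaintext/ciphertext, addt is abytes of additional
 * authenticated data, ivec is an ibytes-long IV (ibytes == 12 takes the
 * fast path), tag is the 16-byte authentication tag (written by encrypt,
 * checked by decrypt), and key points to the expanded AES key schedule of
 * nr+1 round keys, with nr = 10/12/14 rounds.
 */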
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
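	/*
	 * Derive the hash key H = E(K, 0^128) and the pre-counter block Y0.
	 * A 96-bit IV uses the fast path Y0 = IV || 0^31 || 1; any other IV
	 * length is run through GHASH together with its length.
	 */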
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* GHASH the full 16-byte blocks of the IV */
		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		/* then any partial block, zero padded */
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		/* and finally the 64-bit IV length (in bits) */
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}
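	/*
	 * Precompute H^2, H^3 and H^4 so that reduce4() can fold four
	 * blocks per reduction.
	 */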
	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);

	/* hash the additional authenticated data, four blocks at a time */
	for (i=0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	/* any remaining whole blocks of AAD, one at a time */
	for (i=i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* and a zero-padded partial AAD block, if any */
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
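	/*
	 * Build eight consecutive counter blocks starting at Y0+1.  Each
	 * 64-bit half is byte swapped into host order so _mm_add_epi64()
	 * can do the increments; the blocks are swapped back with
	 * BSWAP_EPI64 right before they are encrypted.
	 */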
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);
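	/*
	 * Main loop: encrypt eight counter blocks per iteration, XOR them
	 * with the plaintext, store the ciphertext, and fold the eight
	 * ciphertext blocks into the GHASH state with two reduce4() calls.
	 */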
	for (i=0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		/* run the eight AES pipelines in lockstep */
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* GHASH the eight ciphertext blocks just produced */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	/* handle remaining whole blocks one at a time */
	for (k=i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* if an incomplete block remains, encrypt and hash it zero padded */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
		/* zero the keystream tail so only ciphertext bytes are hashed */
		for ((void)j; j < 16; j++)
			((unsigned char *)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
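	/*
	 * Finish GHASH with the lengths block (bit lengths of the
	 * ciphertext and the AAD), then tag = GHASH ^ E(K, Y0).
	 */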
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i *)tag, T);
}
/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char *)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H, H, &H2);
	gfmul(H, H2, &H3);
	gfmul(H, H3, &H4);
	/* hash the additional authenticated data, four blocks at a time */
	for (i=0; i < abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i < abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j < abytes%16; j++)
			((unsigned char *)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* This is where we validate the cipher text before decrypting it */
	for (i=0; i < nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i < nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j < nbytes%16; j++)
			((unsigned char *)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X, H, &X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	if (!m128icmp(T, _mm_loadu_si128((const __m128i *)tag)))
		return 0; /* in case the authentication failed */
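	/*
	 * The tag verified over the AAD and ciphertext, so it is now safe
	 * to generate the plaintext; nothing has been written to out yet.
	 */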
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);
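	/*
	 * The CTR pass below mirrors the one in AES_GCM_encrypt(); the
	 * GHASH work was already done over the ciphertext above, so the
	 * loop only has to generate keystream and XOR it with the input.
	 */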
	for (i=0; i < nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j < nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i *)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i *)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i *)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i *)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i *)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i *)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i *)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i *)out)[i*8+7], tmp8);

		/* results of these byte swaps are not used further here */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k < nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i *)out)[k], tmp1);
	}
	/* if an incomplete block remains, decrypt just its nbytes%16 bytes */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j < nbytes%16; j++)
			out[k*16+j] = ((unsigned char *)&last_block)[j];
	}

	return 1; /* when successful, returns 1 */
}