/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is an optimized AES128/192/256 CTR mode implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
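
/*
 * For orientation: CTR mode turns AES into a stream cipher, so encryption
 * and decryption are the same operation. A minimal C sketch of the
 * computation the by8 routines below perform (aes_encrypt_block() is a
 * hypothetical single-block helper, not part of this file):
 *
 *	#include <stdint.h>
 *
 *	void aes_encrypt_block(const void *keys, const uint8_t in[16],
 *			       uint8_t out[16]);	// hypothetical
 *
 *	static void ctr_ref(const uint8_t *in, uint8_t iv[16],
 *			    const void *keys, uint8_t *out,
 *			    unsigned int num_bytes)
 *	{
 *		uint8_t ks[16];
 *		unsigned int i, j;
 *
 *		for (i = 0; i + 16 <= num_bytes; i += 16) {
 *			aes_encrypt_block(keys, iv, ks);	// E_k(counter)
 *			for (j = 0; j < 16; j++)
 *				out[i + j] = in[i + j] ^ ks[j];
 *			// bump the IV as a 128-bit big-endian counter
 *			for (j = 16; j-- > 0 && ++iv[j] == 0;)
 *				;
 *		}
 *	}
 *
 * The assembly below computes eight such keystream blocks per main-loop
 * iteration and keeps the counter byteswapped (little-endian) so it can
 * be incremented with SIMD adds.
 */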
#include <linux/linkage.h>
#define VMOVDQ		vmovdqu

#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define tmp		%r10

#define DDQ_DATA	0
#define XDATA		1
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
.section .rodata
.align 16

byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text
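
/*
 * ddq_low_msk and ddq_high_add_1 exist because vpaddq adds independent
 * 64-bit lanes rather than one 128-bit integer, so a carry out of the low
 * quadword must be patched into the high quadword by hand. A rough C
 * model of the vpaddq/vptest sequence used in do_aes below (a sketch,
 * not a drop-in for the SIMD code):
 *
 *	#include <stdint.h>
 *
 *	struct ctr128 { uint64_t lo, hi; };	// the two vpaddq lanes
 *
 *	static struct ctr128 ctr_inc(struct ctr128 c)
 *	{
 *		c.lo += 1;		// vpaddq ddq_add_1
 *		if (c.lo == 0)		// vptest ddq_low_msk; jnz skips carry
 *			c.hi += 1;	// vpaddq ddq_high_add_1
 *		return c;
 *	}
 *
 * Testing for an all-zero low quadword is sufficient because the counter
 * only ever steps through consecutive values, so every wrap passes
 * through zero.
 */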
/* generate a unique variable for ddq_add_x */

.macro setddq n
	var_ddq_add = ddq_add_\n
.endm

/* generate a unique variable for xmm register */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/* club the numeric 'id' to the symbol 'name' */

.macro club name, id
.altmacro
	.if \name == DDQ_DATA
		setddq %\id
	.elseif \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm
/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in, but not p_out
 */
.macro do_aes b, k, key_len
	by		= \b
	load_keys	= \k
	klen		= \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif
	vpshufb	xbyteswap, xcounter, xdata0

	i = 1
	.rept (by - 1)
		club XDATA, i
		club DDQ_DATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		i = (i + 1)
	.endr
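
	/*
	 * The CTR block is big-endian on the wire, but vpaddq adds
	 * little-endian quadwords, so the counter is kept byteswapped in
	 * xcounter and each derived block is shuffled back through
	 * xbyteswap before it is encrypted.
	 */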
	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	i = 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		i = (i + 1)
	.endr
	vmovdqa	2*16(p_keys), xkeyB

	vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif
	vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif
	.if (klen == KEY_128)
		vaesenc	xkey4, var_xdata, var_xdata		/* key 3 */
	.else
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 3 */
	.endif
	vmovdqa	5*16(p_keys), xkeyA
	.if (klen == KEY_128)
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 4 */
	.else
		vaesenc	xkey4, var_xdata, var_xdata		/* key 4 */
	.endif
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif
	vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */

	vmovdqa	7*16(p_keys), xkeyA
	.if (klen == KEY_128)
		vaesenc	xkey8, var_xdata, var_xdata		/* key 6 */
	.else
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 6 */
	.endif
	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif
	vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif
	.if (klen == KEY_128)
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 8 */
	.else
		vaesenc	xkey8, var_xdata, var_xdata		/* key 8 */
	.endif
	vmovdqa	10*16(p_keys), xkeyB
	.if (klen == KEY_128)
		vaesenc	xkey12, var_xdata, var_xdata		/* key 9 */
	.else
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 9 */
	.endif
	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.if (klen == KEY_128)
		/* key 10 - last round for AES-128 */
		vaesenclast	xkeyB, var_xdata, var_xdata
	.else
		/* key 10 */
		vaesenc	xkeyB, var_xdata, var_xdata
	.endif
	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		vaesenc	xkeyA, var_xdata, var_xdata		/* key 11 */

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.if (klen == KEY_256)
			/* key 12 */
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			/* key 12 - last round for AES-192 */
			vaesenclast	xkey12, var_xdata, var_xdata
		.endif

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			/* key 13 */
			vaesenc	xkeyA, var_xdata, var_xdata

			/* key 14 - last round for AES-256 */
			vaesenclast	xkeyB, var_xdata, var_xdata
		.endif
	.endif
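
	/*
	 * At this point every block in flight holds its encrypted counter:
	 * 10 rounds were applied for AES-128, 12 for AES-192 and 14 for
	 * AES-256, matching the vaesenclast placement above.
	 */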
	add	$(16*by), p_in

	i = 0
	.rept (by / 2)
		j = (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		i = (i + 2)
	.endr

	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	i = 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		i = (i + 1)
	.endr
.endm
.macro do_aes_load val, key_len
	do_aes	\val, 1, \key_len
.endm

.macro do_aes_noload val, key_len
	do_aes	\val, 0, \key_len
.endm
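
/*
 * do_aes_load serves the first, odd-sized chunk and pulls the four
 * resident round keys (xkey0/xkey4/xkey8/xkey12) into registers as it
 * goes; do_aes_noload serves the 8-block main loop, which expects those
 * registers to be populated already and reloads only the scratch key
 * registers xkeyA/xkeyB on every pass.
 */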
/* main body of aes ctr load */

.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	jb	.Ldo_return2\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.Lmult_of_8_blks\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
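
	/*
	 * Note the offsets: for AES-192/256 the resident registers hold
	 * round keys 0/4/8/12 (hence their names), while for AES-128 they
	 * hold round keys 0/3/6/9, so the names are only positional there.
	 */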
.Lmain_loop2\key_len:
	/* num_bytes is a multiple of 8*16 bytes and > 0 */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len
.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
	RET
.endm
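
/*
 * The overall shape of do_aes_ctrmain, restated as C pseudocode for
 * readability (do_aes() and load_resident_keys() stand in for the macros
 * above; they are not real functions):
 *
 *	unsigned int head;
 *
 *	if (num_bytes < 16)
 *		goto out;
 *	head = (num_bytes % (8 * 16)) / 16;	// 0..7 leading blocks
 *	if (head)
 *		do_aes(head, 1);	// load_keys=1: fills xkey0/4/8/12
 *	else
 *		load_resident_keys();	// .Lmult_of_8_blks preload
 *	for (num_bytes &= ~(7 * 16); num_bytes; num_bytes -= 8 * 16)
 *		do_aes(8, 0);		// load_keys=0: keys already resident
 *  out:
 *	// write the byteswapped counter back through p_iv
 *
 * Because the updated IV is stored back, a large request can be split
 * across several calls and the counter keeps advancing correctly.
 */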
/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain	KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)
/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain	KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)
/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain	KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)
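
/*
 * Example of how a caller is expected to use these routines (a sketch
 * under kernel assumptions, not code from this file): the prototypes
 * follow the comments above, and since the routines clobber XMM state
 * they must run between kernel_fpu_begin() and kernel_fpu_end(). The
 * keys argument points at the expanded AES encryption key schedule.
 *
 *	#include <linux/types.h>
 *	#include <linux/linkage.h>
 *	#include <asm/fpu/api.h>
 *
 *	asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
 *			void *keys, u8 *out, unsigned int num_bytes);
 *
 *	static void ctr_crypt_by8(const u8 *src, u8 *dst, u8 *iv,
 *				  void *key_sched, unsigned int nbytes)
 *	{
 *		// nbytes must be a whole number of 16-byte blocks; any
 *		// partial tail block is handled by the caller separately
 *		kernel_fpu_begin();
 *		aes_ctr_enc_128_avx_by8(src, iv, key_sched, dst, nbytes);
 *		kernel_fpu_end();
 *	}
 */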