2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
7 * This work was inspired by the AES CTR mode optimization published
8 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
17 * Copyright(c) 2014 Intel Corporation.
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
35 * Copyright(c) 2014 Intel Corporation.
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 #include <linux/linkage.h>
/* Token-pasting helper: CONCAT(a,b) -> ab; used to synthesize symbol and
 * register names (see DDQ()/XMM() below). */
68 #define CONCAT(a,b) a##b
/* Unaligned 128-bit load/store for plaintext/ciphertext, which may not be
 * 16-byte aligned (vmovdqu tolerates any alignment). */
69 #define VMOVDQ vmovdqu
/* xmm8 holds the running big-endian CTR counter block. */
79 #define xcounter %xmm8
/* xmm9 holds the byte-swap shuffle mask (see byteswap constant below). */
80 #define xbyteswap %xmm9
/* DDQ(i) expands to the ddq_add_<i> counter-increment constant symbol. */
95 #define DDQ(i) CONCAT(ddq_add_,i)
/* XMM(i) expands to register %xmm<i>. */
96 #define XMM(i) CONCAT(%xmm, i)
/* NOTE(review): the labels for these 128-bit constants (presumably
 * byteswap_const, ddq_low_msk, ddq_high_add_1 and ddq_add_1..ddq_add_8,
 * which the code below references %rip-relative) were lost in this
 * fragment — confirm against the full file. */
/* Shuffle mask for vpshufb: reverses all 16 bytes (endianness swap of the
 * counter block). */
107 .octa 0x000102030405060708090A0B0C0D0E0F
/* Mask selecting the low 64 bits of the counter; used with vptest to
 * detect low-qword wrap-around. */
109 .octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds 1 to the HIGH 64-bit half — the carry applied when the low qword
 * of the counter overflows. */
111 .octa 0x00000000000000010000000000000000
/* Counter increments +1 .. +8 (low qword), one per parallel block. */
113 .octa 0x00000000000000000000000000000001
115 .octa 0x00000000000000000000000000000002
117 .octa 0x00000000000000000000000000000003
119 .octa 0x00000000000000000000000000000004
121 .octa 0x00000000000000000000000000000005
123 .octa 0x00000000000000000000000000000006
125 .octa 0x00000000000000000000000000000007
127 .octa 0x00000000000000000000000000000008
/* NOTE(review): fragment of the symbol-generation ("club") helper macros;
 * the enclosing .macro/.endm lines and the .endif are missing from this
 * extraction — confirm against the full file. These assemble-time
 * assignments bind generic names (var_ddq_add, var_xdata) to a numbered
 * constant/register so do_aes can be unrolled 1..8 times. */
/* generate a unique variable for ddq_add_x (counter increment constant) */
131 /* generate a unique variable for ddq_add_x */
134 var_ddq_add = DDQ(\n)
/* generate a unique variable for an xmm register (via XMM(i)) */
137 /* generate a unique variable for xmm register */
/* club the numeric 'id' to the symbol 'name' (DDQ_DATA or XDATA) */
142 /* club the numeric 'id' to the symbol 'name' */
146 .if \name == DDQ_DATA
148 .elseif \name == XDATA
/* NOTE(review): this do_aes macro body is heavily decimated in this
 * fragment — the .endm, many .else/.endif lines, and the per-block unroll
 * directives (presumably .set/.rept-style loops over 'by' blocks) are
 * missing. Code lines below are kept byte-identical; confirm structure
 * against the full file before editing. */
155 * do_aes num_in_par load_keys key_len
156 * This increments p_in, but not p_out
/* b = number of blocks processed in parallel (1..8),
 * k = load_keys flag (1 = load round keys from p_keys, 0 = reuse
 *     xkey0/4/8/12 preloaded by the caller),
 * key_len = KEY_128/KEY_192/KEY_256 selecting the round count. */
158 .macro do_aes b, k, key_len
/* Round-0 key (whitening key). */
164 vmovdqa 0*16(p_keys), xkey0
/* Convert the big-endian counter to the little-endian form the integer
 * adds below operate on. */
167 vpshufb xbyteswap, xcounter, xdata0
/* Per-block counter = xcounter + ddq_add_<n> (64-bit lane add). */
173 vpaddq var_ddq_add(%rip), xcounter, var_xdata
/* vptest sets CF when the low qword did NOT wrap; the (missing here)
 * conditional skips the high-qword carry below when no wrap occurred. */
174 vptest ddq_low_msk(%rip), var_xdata
/* Low-qword overflow: propagate carry into the high 64 bits. */
176 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
177 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
/* Back to big-endian byte order for encryption. */
179 vpshufb xbyteswap, var_xdata, var_xdata
/* Round key 1. */
183 vmovdqa 1*16(p_keys), xkeyA
/* Initial AddRoundKey with the round-0 key. */
185 vpxor xkey0, xdata0, xdata0
/* Advance the saved counter past all 'by' blocks, with the same
 * wrap/carry handling as above. */
187 vpaddq var_ddq_add(%rip), xcounter, xcounter
188 vptest ddq_low_msk(%rip), xcounter
190 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
196 vpxor xkey0, var_xdata, var_xdata
/* Round key 2. */
200 vmovdqa 2*16(p_keys), xkeyB
205 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
/* AES-128 keeps round key 3 resident in xkey4 for the noload path;
 * larger key sizes use scratch xkeyA instead. */
209 .if (klen == KEY_128)
211 vmovdqa 3*16(p_keys), xkey4
214 vmovdqa 3*16(p_keys), xkeyA
220 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
/* Key 4: resident register differs by key size (see preload in
 * do_aes_ctrmain's .Lmult_of_8_blks path). */
226 .if (klen == KEY_128)
227 vmovdqa 4*16(p_keys), xkeyB
230 vmovdqa 4*16(p_keys), xkey4
/* Round 3. */
238 .if (klen == KEY_128)
239 vaesenc xkey4, var_xdata, var_xdata
241 vaesenc xkeyA, var_xdata, var_xdata
/* Key 5. */
246 vmovdqa 5*16(p_keys), xkeyA
/* Round 4. */
252 .if (klen == KEY_128)
253 vaesenc xkeyB, var_xdata, var_xdata
255 vaesenc xkey4, var_xdata, var_xdata
/* Key 6: resident in xkey8 for AES-128. */
260 .if (klen == KEY_128)
262 vmovdqa 6*16(p_keys), xkey8
265 vmovdqa 6*16(p_keys), xkeyB
271 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
275 vmovdqa 7*16(p_keys), xkeyA
/* Round 6. */
281 .if (klen == KEY_128)
282 vaesenc xkey8, var_xdata, var_xdata
284 vaesenc xkeyB, var_xdata, var_xdata
/* Key 8. */
289 .if (klen == KEY_128)
290 vmovdqa 8*16(p_keys), xkeyB
293 vmovdqa 8*16(p_keys), xkey8
300 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
/* Key 9: resident in xkey12 for AES-128. */
304 .if (klen == KEY_128)
306 vmovdqa 9*16(p_keys), xkey12
309 vmovdqa 9*16(p_keys), xkeyA
/* Round 8. */
316 .if (klen == KEY_128)
317 vaesenc xkeyB, var_xdata, var_xdata
319 vaesenc xkey8, var_xdata, var_xdata
/* Key 10: the final round key for AES-128. */
324 vmovdqa 10*16(p_keys), xkeyB
/* Round 9. */
330 .if (klen == KEY_128)
331 vaesenc xkey12, var_xdata, var_xdata
333 vaesenc xkeyA, var_xdata, var_xdata
/* AES-192/256 continue past round 10; key 11. */
338 .if (klen != KEY_128)
339 vmovdqa 11*16(p_keys), xkeyA
/* Round 10: terminal (vaesenclast) for AES-128, ordinary round otherwise. */
346 .if (klen == KEY_128)
347 vaesenclast xkeyB, var_xdata, var_xdata
349 vaesenc xkeyB, var_xdata, var_xdata
354 .if (klen != KEY_128)
356 vmovdqa 12*16(p_keys), xkey12
362 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
/* AES-256 only: keys 13/14 and rounds 12/13 below. */
366 .if (klen == KEY_256)
367 vmovdqa 13*16(p_keys), xkeyA
/* Round 12: terminal for AES-192, ordinary round for AES-256. */
373 .if (klen == KEY_256)
375 vaesenc xkey12, var_xdata, var_xdata
377 vaesenclast xkey12, var_xdata, var_xdata
382 .if (klen == KEY_256)
383 vmovdqa 14*16(p_keys), xkeyB
389 vaesenc xkeyA, var_xdata, var_xdata
/* Round 14: terminal round for AES-256. */
397 vaesenclast xkeyB, var_xdata, var_xdata
/* XOR the keystream with plaintext: load input blocks pairwise (the
 * (i*16 - 16*by) addressing assumes p_in was pre-advanced by 16*by —
 * TODO confirm; the advancing instruction is not in this fragment). */
406 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
407 VMOVDQ (j*16 - 16*by)(p_in), xkeyB
409 vpxor xkeyA, var_xdata, var_xdata
411 vpxor xkeyB, var_xdata, var_xdata
/* Odd trailing block when 'by' is odd. */
416 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
418 vpxor xkeyA, var_xdata, var_xdata
/* Store ciphertext; p_out is NOT advanced here (see macro header). */
424 VMOVDQ var_xdata, i*16(p_out)
/* do_aes_load: encrypt \val counter blocks, loading the resident round
 * keys (xkey0/4/8/12) from p_keys (load_keys flag = 1). Used for the
 * partial (<8 block) head of the buffer.
 * NOTE(review): the closing .endm is missing from this fragment. */
429 .macro do_aes_load val, key_len
430 do_aes \val, 1, \key_len
/* do_aes_noload: encrypt \val counter blocks reusing the resident round
 * keys already preloaded into xkey0/4/8/12 (load_keys flag = 0). Used in
 * the 8-blocks-at-a-time main loop.
 * NOTE(review): the closing .endm is missing from this fragment. */
433 .macro do_aes_noload val, key_len
434 do_aes \val, 0, \key_len
/* main body of aes ctr load */
/* do_aes_ctrmain: full CTR drive loop for one key size.
 * Handles the 1..7-block remainder first, then an unrolled by-8 main
 * loop, and finally writes the updated IV back through p_iv.
 * NOTE(review): this fragment is missing the .endm, the initial
 * num_bytes compare feeding the 'jb' below, the remainder dispatch
 * (compare/jump table selecting the do_aes_load 1..7 cases), and a
 * mid-function re-test — confirm against the full file. */
437 /* main body of aes ctr load */
439 .macro do_aes_ctrmain key_len
/* Nothing to do for very small inputs (threshold set by a compare
 * missing from this fragment — presumably num_bytes < 16). */
441 jb .Ldo_return2\key_len
/* Load the byte-swap mask and the IV, converting the IV to the
 * little-endian counter form used for the vpaddq increments. */
443 vmovdqa byteswap_const(%rip), xbyteswap
444 vmovdqu (p_iv), xcounter
445 vpshufb xbyteswap, xcounter, xcounter
/* If num_bytes is already a multiple of 8 blocks, skip the remainder. */
449 jz .Lmult_of_8_blks\key_len
/* Remainder cases: encrypt 1..7 blocks with keys loaded each round,
 * then round num_bytes down to a multiple of 8 blocks (7*16 = 112;
 * ~7*16 clears the low 7 bits of the block count area). */
462 do_aes_load 1, \key_len
464 and $(~7*16), num_bytes
465 jz .Ldo_return2\key_len
466 jmp .Lmain_loop2\key_len
469 do_aes_load 2, \key_len
471 and $(~7*16), num_bytes
472 jz .Ldo_return2\key_len
473 jmp .Lmain_loop2\key_len
477 do_aes_load 3, \key_len
479 and $(~7*16), num_bytes
480 jz .Ldo_return2\key_len
481 jmp .Lmain_loop2\key_len
484 do_aes_load 4, \key_len
486 and $(~7*16), num_bytes
487 jz .Ldo_return2\key_len
488 jmp .Lmain_loop2\key_len
496 do_aes_load 5, \key_len
498 and $(~7*16), num_bytes
499 jz .Ldo_return2\key_len
500 jmp .Lmain_loop2\key_len
503 do_aes_load 6, \key_len
505 and $(~7*16), num_bytes
506 jz .Ldo_return2\key_len
507 jmp .Lmain_loop2\key_len
510 do_aes_load 7, \key_len
512 and $(~7*16), num_bytes
513 jz .Ldo_return2\key_len
514 jmp .Lmain_loop2\key_len
/* Preload the resident round keys so the main loop can use
 * do_aes_noload. AES-128 packs keys 0/3/6/9; AES-192/256 pack keys
 * 0/4/8/12 (matching the .if klen selections inside do_aes). */
516 .Lmult_of_8_blks\key_len:
517 .if (\key_len != KEY_128)
518 vmovdqa 0*16(p_keys), xkey0
519 vmovdqa 4*16(p_keys), xkey4
520 vmovdqa 8*16(p_keys), xkey8
521 vmovdqa 12*16(p_keys), xkey12
523 vmovdqa 0*16(p_keys), xkey0
524 vmovdqa 3*16(p_keys), xkey4
525 vmovdqa 6*16(p_keys), xkey8
526 vmovdqa 9*16(p_keys), xkey12
/* Main loop: 8 blocks (8*16 = 128 bytes) per iteration. */
529 .Lmain_loop2\key_len:
/* num_bytes is a multiple of 8 blocks (8*16 bytes) and > 0 here */
531 do_aes_noload 8, \key_len
533 sub $(8*16), num_bytes
534 jne .Lmain_loop2\key_len
536 .Ldo_return2\key_len:
537 /* return updated IV */
/* Swap the counter back to big-endian and store it through p_iv so the
 * caller can continue the stream. */
538 vpshufb xbyteswap, xcounter, xcounter
539 vmovdqu xcounter, (p_iv)
544 * routine to do AES128 CTR enc/decrypt "by8"
545 * XMM registers are clobbered.
546 * Saving/restoring must be done at a higher level
547 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
548 * unsigned int num_bytes)
/* SysV AMD64: in=%rdi, iv=%rsi, keys=%rdx, out=%rcx, num_bytes=%r8d —
 * presumably mapped to p_in/p_iv/p_keys/p_out/num_bytes by #defines not
 * visible in this fragment; TODO confirm. CTR encrypt == decrypt, so one
 * routine serves both directions. Entire body is the expanded
 * do_aes_ctrmain macro (including its ret). */
550 ENTRY(aes_ctr_enc_128_avx_by8)
551 /* call the aes main loop */
552 do_aes_ctrmain KEY_128
554 ENDPROC(aes_ctr_enc_128_avx_by8)
557 * routine to do AES192 CTR enc/decrypt "by8"
558 * XMM registers are clobbered.
559 * Saving/restoring must be done at a higher level
560 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
561 * unsigned int num_bytes)
/* Same contract as the 128-bit variant; only the round count differs
 * (selected at assembly time by the KEY_192 macro argument). */
563 ENTRY(aes_ctr_enc_192_avx_by8)
564 /* call the aes main loop */
565 do_aes_ctrmain KEY_192
567 ENDPROC(aes_ctr_enc_192_avx_by8)
570 * routine to do AES256 CTR enc/decrypt "by8"
571 * XMM registers are clobbered.
572 * Saving/restoring must be done at a higher level
573 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
574 * unsigned int num_bytes)
/* Same contract as the 128-bit variant; 14 AES rounds are selected at
 * assembly time by the KEY_256 macro argument. */
576 ENTRY(aes_ctr_enc_256_avx_by8)
577 /* call the aes main loop */
578 do_aes_ctrmain KEY_256
580 ENDPROC(aes_ctr_enc_256_avx_by8)