2 * Fast AES implementation for SPE instruction set (PPC)
4 * This code makes use of the SPE SIMD instruction set as defined in
5 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
6 * Implementation is based on optimization guide notes from
7 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
9 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
11 * This program is free software; you can redistribute it and/or modify it
12 * under the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
18 #include <asm/ppc_asm.h>
19 #include "aes-spe-regs.h"
/*
 * EAD: rotate "in" and insert one of its bytes into bits 20-27 of rT0
 * (PowerPC numbering, bit 0 = MSB), i.e. the byte lands multiplied by 16 in
 * the low 12 bits of rT0; every other bit of rT0 is left unchanged.
 * The rotate amount depends on bpos: bpos 0 takes source bits 24-31,
 * bpos 1 bits 16-23, bpos 2 bits 8-15 and bpos 3 bits 0-7.
 */
21 #define EAD(in, bpos) \
22 rlwimi rT0,in,28-((bpos+3)%4)*8,20,27;
/*
 * DAD: same bpos-driven byte selection as EAD, but the byte is inserted
 * unscaled into bits 24-31 (the least significant byte) of rT1; the upper
 * 24 bits of rT1 are left unchanged.
 */
24 #define DAD(in, bpos) \
25 rlwimi rT1,in,24-((bpos+3)%4)*8,24,31;
/*
 * LWH: load the 32-bit word at off(rT0) with evlwwsplat, which copies the
 * loaded word into both word halves of the 64-bit SPE register "out".
 */
27 #define LWH(out, off) \
28 evlwwsplat out,off(rT0); /* load word high */
/* LWL: ordinary 32-bit load (lwz) of the word at off(rT0) into "out". */
30 #define LWL(out, off) \
31 lwz out,off(rT0); /* load word low */
/* LBZ: load the single byte at off(tab) into "out", zero-extended (lbz). */
33 #define LBZ(out, tab, off) \
34 lbz out,off(tab); /* load byte */
/*
 * Combined "address + load" helpers. LAH, LAL and LAE begin with
 * EAD(in, bpos) and LAD begins with DAD(in, bpos), so one byte of "in" is
 * first merged into the lookup pointer (rT0 for EAD, rT1 for DAD) before the
 * value is loaded into "out". LAE ends with a byte load from offset 8
 * relative to rT0.
 */
36 #define LAH(out, in, bpos, off) \
37 EAD(in, bpos) /* calc addr + load word high */ \
40 #define LAL(out, in, bpos, off) \
41 EAD(in, bpos) /* calc addr + load word low */ \
44 #define LAE(out, in, bpos) \
45 EAD(in, bpos) /* calc addr + load enc byte */ \
49 LBZ(out, rT0, 8) /* load enc byte */
51 #define LAD(out, in, bpos) \
52 DAD(in, bpos) /* calc addr + load dec byte */ \
59 * ppc_encrypt_block: The central encryption function for a single 16-byte
60 * block. It does no stack handling or register saving to support fast calls
61 * via bl/blr. It expects that caller has pre-xored input data with first
62 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
63 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
64 * and rW0-rW3 and caller must execute a final xor on the output registers.
65 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
/* Entry point of the single-block encryption routine. */
68 _GLOBAL(ppc_encrypt_block)
/* Top of the round loop; the bdnz further down branches back here. */
72 ppc_encrypt_block_loop:
/*
 * evmergehi rD,rA,rB: the upper word of rD becomes the upper word of rA and
 * the lower word of rD becomes the upper word of rB. Here the high words of
 * rD2 and rD3 are packed together into rD2.
 */
104 evmergehi rD2,rD2,rD3
/* Pack the high words of rD0 and rD1 into rD0. */
127 evmergehi rD0,rD0,rD1
/* Pack the high words of rD2 and rD3 into rD2. */
137 evmergehi rD2,rD2,rD3
/* Decrement CTR and loop again while it is still non-zero. */
140 bdnz ppc_encrypt_block_loop
/* After the loop: same high-word packing for rD0/rD1 and rD2/rD3. */
162 evmergehi rD0,rD0,rD1
172 evmergehi rD2,rD2,rD3
/*
 * Byte-wise assembly of the output words rW0-rW3 (bit 0 = MSB): each rlwimi
 * rotates its source so that the least significant byte of rW4-rW7 is
 * inserted into one byte lane of rW0-rW3, leaving the other lanes untouched.
 * Rotate by 8 -> destination bits 16-23.
 */
180 rlwimi rW0,rW4,8,16,23
181 rlwimi rW1,rW5,8,16,23
184 rlwimi rW2,rW6,8,16,23
185 rlwimi rW3,rW7,8,16,23
/* Rotate by 16 -> low byte of the source goes to destination bits 8-15. */
188 rlwimi rW0,rW4,16,8,15
189 rlwimi rW1,rW5,16,8,15
192 rlwimi rW2,rW6,16,8,15
194 rlwimi rW3,rW7,16,8,15
/* Rotate by 24 -> low byte of the source goes to destination bits 0-7 (MSB). */
198 rlwimi rW0,rW4,24,0,7
200 rlwimi rW1,rW5,24,0,7
202 rlwimi rW2,rW6,24,0,7
203 rlwimi rW3,rW7,24,0,7
207 * ppc_decrypt_block: The central decryption function for a single 16-byte
208 * block. It does no stack handling or register saving to support fast calls
209 * via bl/blr. It expects that caller has pre-xored input data with first
210 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
211 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
212 * and rW0-rW3 and caller must execute a final xor on the output registers.
213 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
/* Entry point of the single-block decryption routine. */
216 _GLOBAL(ppc_decrypt_block)
/* Top of the round loop; the bdnz further down branches back here. */
220 ppc_decrypt_block_loop:
/*
 * evmergehi rD,rA,rB: upper word of rD = upper word of rA, lower word of
 * rD = upper word of rB. The pairs rD0/rD1 and rD2/rD3 are packed this way
 * into rD0 and rD2 respectively.
 */
242 evmergehi rD0,rD0,rD1
252 evmergehi rD2,rD2,rD3
/* Same high-word packing, repeated later in the loop body. */
275 evmergehi rD0,rD0,rD1
285 evmergehi rD2,rD2,rD3
/* Decrement CTR and loop again while it is still non-zero. */
288 bdnz ppc_decrypt_block_loop
/* After the loop: pack the high words of rD0/rD1 and rD2/rD3 once more. */
310 evmergehi rD0,rD0,rD1
320 evmergehi rD2,rD2,rD3
/*
 * Byte-wise assembly of the output words rW0-rW3 (bit 0 = MSB): each rlwimi
 * inserts the least significant byte of its source (rW4-rW7) into one byte
 * lane of the destination and preserves the remaining lanes.
 * Rotate by 8 -> destination bits 16-23.
 */
327 rlwimi rW0,rW4,8,16,23
328 rlwimi rW1,rW5,8,16,23
331 rlwimi rW2,rW6,8,16,23
332 rlwimi rW3,rW7,8,16,23
/* Rotate by 16 -> destination bits 8-15. */
335 rlwimi rW0,rW4,16,8,15
336 rlwimi rW1,rW5,16,8,15
339 rlwimi rW2,rW6,16,8,15
341 rlwimi rW3,rW7,16,8,15
/* Rotate by 24 -> destination bits 0-7 (most significant byte). */
345 rlwimi rW0,rW4,24,0,7
347 rlwimi rW1,rW5,24,0,7
349 rlwimi rW2,rW6,24,0,7
350 rlwimi rW3,rW7,24,0,7