2 * ---------------------------------------------------------------------------
3 * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
7 * The free distribution and use of this software is allowed (with or without
8 * changes) provided that:
10 * 1. source code distributions include the above copyright notice, this
11 * list of conditions and the following disclaimer;
13 * 2. binary distributions include the above copyright notice, this list
14 * of conditions and the following disclaimer in their documentation;
16 * 3. the name of the copyright holder is not used to endorse products
17 * built using this software without specific written permission.
21 * This software is provided 'as is' with no explicit or implied warranties
22 * in respect of its properties, including, but not limited to, correctness
23 * and/or fitness for purpose.
24 * ---------------------------------------------------------------------------
27 * I am grateful to Dag Arne Osvik for many discussions of the techniques that
28 * can be used to optimise AES assembler code on AMD64/EM64T architectures.
29 * Some of the techniques used in this implementation are the result of
30 * suggestions made by him for which I am most grateful.
32 * An AES implementation for AMD64 processors using the YASM assembler. This
33 * implementation provides only encryption and decryption and hence requires key
34 * scheduling support in C. It uses 8k bytes of tables but its encryption and
35 * decryption performance is very close to that obtained using large tables.
36 * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
37 * which are as follows:
38 * ms windows gnu/linux/opensolaris os
44 * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
45 * registers rdi - on both
47 * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
48 * registers - rdi on both
50 * The convention used here is that for gnu/linux/opensolaris os.
52 * This code provides the standard AES block size (128 bits, 16 bytes) and the
53 * three standard AES key sizes (128, 192 and 256 bits). It has the same call
54 * interface as my C implementation. It uses the Microsoft C AMD64 calling
55 * conventions in which the three parameters are placed in rcx, rdx and r8
56 * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
59 * Modified to use GNU/Linux/Solaris calling conventions.
60 * That is, parameters are placed in rdi, rsi, rdx, and rcx, respectively.
62 * AES_RETURN aes_encrypt(const unsigned char in_blk[],
63 * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
65 * AES_RETURN aes_decrypt(const unsigned char in_blk[],
66 * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
68 * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
69 * const aes_encrypt_ctx cx[1])/
71 * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
72 * const aes_decrypt_ctx cx[1])/
74 * AES_RETURN aes_encrypt_key(const unsigned char key[],
75 * unsigned int len, const aes_encrypt_ctx cx[1])/
77 * AES_RETURN aes_decrypt_key(const unsigned char key[],
78 * unsigned int len, const aes_decrypt_ctx cx[1])/
80 * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
81 * either bits or bytes.
83 * Comment in/out the following lines to obtain the desired subroutines. These
84 * selections MUST match those in the C header file aesopt.h
86 #define AES_REV_DKS /* define if the decryption key schedule is stored in reversed order */
88 #define LAST_ROUND_TABLES /* define for the faster version using extra tables */
91 * The encryption key schedule has the following in memory layout where N is the
92 * number of rounds (10, 12 or 14):
94 * lo: | input key (round 0) | / each round is four 32-bit words
95 * | encryption round 1 |
96 * | encryption round 2 |
98 * | encryption round N-1 |
99 * hi: | encryption round N |
101 * The decryption key schedule is normally set up so that it has the same
102 * layout as above by actually reversing the order of the encryption key
103 * schedule in memory (this happens when AES_REV_DKS is set):
105 * lo: | decryption round 0 | = | encryption round N |
106 * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
107 * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
109 * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
110 * hi: | decryption round N | = | input key (round 0) |
112 * with rounds except the first and last modified using inv_mix_column()
113 * But if AES_REV_DKS is NOT set the order of keys is left as it is for
114 * encryption so that it has to be accessed in reverse when used for
115 * decryption (although the inverse mix column modifications are done)
117 * lo: | decryption round 0 | = | input key (round 0) |
118 * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
119 * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
121 * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
122 * hi: | decryption round N | = | encryption round N |
124 * This layout is faster when the assembler key scheduling provided here is used.
127 * End of user defines
131 * ---------------------------------------------------------------------------
132 * OpenSolaris OS modifications
134 * This source originates from Brian Gladman file aes_amd64.asm
135 * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
136 * with these changes:
138 * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
139 * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION,
140 * AES_128, AES_192, AES_256, AES_VAR ifdefs.
142 * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
144 * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
146 * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
147 * (operands reversed, literals prefixed with "$", registers prefixed with "%",
148 * and "[register+offset]", addressing changed to "offset(register)",
149 * parenthesis in constant expressions "()" changed to square brackets "[]",
150 * "." removed from local (numeric) labels, and other changes.
152 * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax
153 * mov rax,(4*20h) mov $[4*0x20],%rax
154 * mov rax,[ebx+20h] mov 0x20(%ebx),%rax
155 * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax
156 * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax
158 * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
159 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
160 * definitions for lint.
162 * 6. Renamed functions and reordered parameters to match OpenSolaris:
163 * Original Gladman interface:
164 * int aes_encrypt(const unsigned char *in,
165 * unsigned char *out, const aes_encrypt_ctx cx[1])/
166 * int aes_decrypt(const unsigned char *in,
167 * unsigned char *out, const aes_decrypt_ctx cx[1])/
168 * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
169 * and a union type, inf., containing inf.l, a uint32_t and
170 * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is
171 * used and contains the key schedule length * 16 where key schedule length is
172 * 10, 12, or 14 (the number of rounds, not a byte count).
174 * OpenSolaris OS interface:
175 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
176 * const uint32_t pt[4], uint32_t ct[4])/
177 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
178 * const uint32_t pt[4], uint32_t ct[4])/
179 * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
180 * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
181 * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
182 * ct is crypto text, and MAX_AES_NR is 14.
183 * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
187 #include <sys/asm_linkage.h>
205 / finite field multiplies by {02}, {04} and {08}
/*
 * f2(x), f4(x), f8(x): multiply the byte value x by {02}, {04} and {08} in
 * the AES finite field GF(2^8).  The product is formed with a left shift of
 * 1, 2 or 3 bits; every input bit that the shift pushes above bit 7 is then
 * cancelled by XORing in 0x11b (the field polynomial) scaled to that bit
 * position: [[x>>k]&1]*0x11b handles the bit landing on bit 8,
 * [[x>>k]&2]*0x11b the one landing on bit 9, and [[x>>k]&4]*0x11b the one
 * landing on bit 10.  For x in 0..255 the result fits in one byte.
 * Square brackets are this assembler's grouping characters in constant
 * expressions.  The argument x is substituted textually without extra
 * grouping, so callers pass a literal or an already-bracketed expression.
 */
207 #define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
208 #define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
209 #define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
211 / finite field multiplies required in table generation
/*
 * Composite multipliers, each formed by XORing (field addition) the f2/f4/f8
 * products and/or x itself:
 *   f3 = {02}x ^ x            ({03})
 *   f9 = {08}x ^ x            ({09})
 *   fb = {08}x ^ {02}x ^ x    ({0b})
 *   fd = {08}x ^ {04}x ^ x    ({0d})
 *   fe = {08}x ^ {04}x ^ {02}x ({0e})
 */
213 #define f3(x) [[f2(x)] ^ [x]]
214 #define f9(x) [[f8(x)] ^ [x]]
215 #define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
216 #define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
217 #define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
219 / macros for expanding S-box data
/*
 * Each macro expands one S-box value x into the 8 byte values of a single
 * table entry (the tab_N() operands index entries with a scale of 8):
 *   u8: {02}x, x, x, {03}x, then the same four bytes again
 *   v8: {0e}x, {09}x, {0d}x, {0b}x, then {0e}x, {09}x, {0d}x, x
 *       (byte 7 holds the plain value x instead of {0b}x)
 *   w8: x, 0, 0, 0, then the same four bytes again
 * Because the 4-byte pattern is repeated, a 32-bit access starting at byte
 * offset 0, 1, 2 or 3 of an entry stays inside that entry and sees the
 * pattern rotated by that many bytes.
 * NOTE(review): which table each macro is used to build is not visible in
 * this part of the file - confirm against the enc_tab/dec_tab definitions.
 */
221 #define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
222 #define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
223 #define w8(x) [x], 0, 0, 0, [x], 0, 0, 0
/*
 * enc_vals(x): emit table data as .byte directives, applying the expansion
 * macro x (one of the entry-expansion macros such as u8 or w8) to each of
 * 256 values in turn.  The values are the AES forward S-box listed in index
 * order: the first value (0x63) is the entry for input 0x00 and the last
 * (0x16) is the entry for input 0xff.  Eight values are emitted per .byte
 * line, so the whole macro expands to 32 directives.
 */
225 #define enc_vals(x) \
226 .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
227 .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
228 .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
229 .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
230 .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
231 .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
232 .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
233 .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
234 .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
235 .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
236 .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
237 .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
238 .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
239 .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
240 .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
241 .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
242 .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
243 .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
244 .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
245 .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
246 .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
247 .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
248 .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
249 .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
250 .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
251 .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
252 .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
253 .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
254 .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
255 .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
256 .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
257 .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
/*
 * dec_vals(x): emit table data as .byte directives, applying the expansion
 * macro x (one of the entry-expansion macros such as v8 or w8) to each of
 * 256 values in turn.  The values are the AES inverse S-box listed in index
 * order: the first value (0x52) is the entry for input 0x00 and the last
 * (0x7d) is the entry for input 0xff.  Eight values are emitted per .byte
 * line, so the whole macro expands to 32 directives.
 */
259 #define dec_vals(x) \
260 .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
261 .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
262 .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
263 .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
264 .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
265 .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
266 .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
267 .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
268 .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
269 .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
270 .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
271 .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
272 .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
273 .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
274 .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
275 .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
276 .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
277 .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
278 .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
279 .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
280 .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
281 .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
282 .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
283 .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
284 .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
285 .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
286 .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
287 .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
288 .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
289 .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
290 .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
291 .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
293 #define tptr %rbp /* table pointer */
294 #define kptr %r8 /* key schedule pointer */
295 #define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */
/*
 * fk_ref(x, y): memory operand addressing 32-bit word y (0..3) of forward
 * round-key slot x, at displacement fofs - 16*x + 4*y from kptr.  A larger x
 * therefore selects a lower address.  The arguments are substituted textually
 * without grouping, so callers pass plain literals.
 */
296 #define fk_ref(x, y) -16*x+fofs+4*y(kptr)
300 #define ik_ref(x, y) -16*x+rofs+4*y(kptr)
304 #define ik_ref(x, y) 16*x+rofs+4*y(kptr)
305 #endif /* AES_REV_DKS */
/*
 * Indexed memory operands into the table whose base is in tptr.  x is the
 * register holding the entry index; entries are 8 bytes apart (scale 8).
 * The displacement selects the starting byte inside the entry:
 *   tab_0 -> byte 0, tab_1 -> byte 3, tab_2 -> byte 2, tab_3 -> byte 1
 *   tab_f -> byte 1, tab_i -> byte 7
 * tab_0..tab_3 are used as xor source operands in the round macros; tab_f
 * and tab_i are used with movzx in the alternate fl_rnd / il_rnd definitions.
 */
307 #define tab_0(x) (tptr,x,8)
308 #define tab_1(x) 3(tptr,x,8)
309 #define tab_2(x) 2(tptr,x,8)
310 #define tab_3(x) 1(tptr,x,8)
311 #define tab_f(x) 1(tptr,x,8)
312 #define tab_i(x) 7(tptr,x,8)
314 #define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \
315 mov fk_ref
(round
,0), p1; \
316 mov fk_ref
(round
,1), p2; \
317 mov fk_ref
(round
,2), p3; \
318 mov fk_ref
(round
,3), p4; \
323 xor tab_0
(%rsi
), p1; \
324 xor tab_1
(%rdi
), p4; \
327 xor tab_2
(%rsi
), p3; \
328 xor tab_3
(%rdi
), p2; \
333 xor tab_0
(%rsi
), p2; \
334 xor tab_1
(%rdi
), p1; \
337 xor tab_2
(%rsi
), p4; \
338 xor tab_3
(%rdi
), p3; \
343 xor tab_0
(%rsi
), p3; \
344 xor tab_1
(%rdi
), p2; \
347 xor tab_2
(%rsi
), p1; \
348 xor tab_3
(%rdi
), p4; \
353 xor tab_0
(%rsi
), p4; \
354 xor tab_1
(%rdi
), p3; \
357 xor tab_2
(%rsi
), p2; \
358 xor tab_3
(%rdi
), p1; \
365 #ifdef LAST_ROUND_TABLES
367 #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
369 mov fk_ref
(round
,0), p1; \
370 mov fk_ref
(round
,1), p2; \
371 mov fk_ref
(round
,2), p3; \
372 mov fk_ref
(round
,3), p4; \
377 xor tab_0
(%rsi
), p1; \
378 xor tab_1
(%rdi
), p4; \
381 xor tab_2
(%rsi
), p3; \
382 xor tab_3
(%rdi
), p2; \
387 xor tab_0
(%rsi
), p2; \
388 xor tab_1
(%rdi
), p1; \
391 xor tab_2
(%rsi
), p4; \
392 xor tab_3
(%rdi
), p3; \
397 xor tab_0
(%rsi
), p3; \
398 xor tab_1
(%rdi
), p2; \
401 xor tab_2
(%rsi
), p1; \
402 xor tab_3
(%rdi
), p4; \
407 xor tab_0
(%rsi
), p4; \
408 xor tab_1
(%rdi
), p3; \
411 xor tab_2
(%rsi
), p2; \
416 #define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
417 mov fk_ref
(round
,0), p1; \
418 mov fk_ref
(round
,1), p2; \
419 mov fk_ref
(round
,2), p3; \
420 mov fk_ref
(round
,3), p4; \
425 movzx tab_f
(%rsi
), %esi; \
426 movzx tab_f
(%rdi
), %edi; \
432 movzx tab_f
(%rsi
), %esi; \
433 movzx tab_f
(%rdi
), %edi; \
442 movzx tab_f
(%rsi
), %esi; \
443 movzx tab_f
(%rdi
), %edi; \
449 movzx tab_f
(%rsi
), %esi; \
450 movzx tab_f
(%rdi
), %edi; \
458 movzx tab_f
(%rsi
), %esi; \
459 movzx tab_f
(%rdi
), %edi; \
466 movzx tab_f
(%rsi
), %esi; \
467 movzx tab_f
(%rdi
), %edi; \
475 movzx tab_f
(%rsi
), %esi; \
476 movzx tab_f
(%rdi
), %edi; \
483 movzx tab_f
(%rsi
), %esi; \
484 movzx tab_f
(%rdi
), %edi; \
490 #endif /* LAST_ROUND_TABLES */
492 #define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \
493 mov ik_ref
(round
,0), p1; \
494 mov ik_ref
(round
,1), p2; \
495 mov ik_ref
(round
,2), p3; \
496 mov ik_ref
(round
,3), p4; \
501 xor tab_0
(%rsi
), p1; \
502 xor tab_1
(%rdi
), p2; \
505 xor tab_2
(%rsi
), p3; \
506 xor tab_3
(%rdi
), p4; \
511 xor tab_0
(%rsi
), p2; \
512 xor tab_1
(%rdi
), p3; \
515 xor tab_2
(%rsi
), p4; \
516 xor tab_3
(%rdi
), p1; \
521 xor tab_0
(%rsi
), p3; \
522 xor tab_1
(%rdi
), p4; \
525 xor tab_2
(%rsi
), p1; \
526 xor tab_3
(%rdi
), p2; \
531 xor tab_0
(%rsi
), p4; \
532 xor tab_1
(%rdi
), p1; \
535 xor tab_2
(%rsi
), p2; \
536 xor tab_3
(%rdi
), p3; \
543 #ifdef LAST_ROUND_TABLES
545 #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
547 mov ik_ref
(round
,0), p1; \
548 mov ik_ref
(round
,1), p2; \
549 mov ik_ref
(round
,2), p3; \
550 mov ik_ref
(round
,3), p4; \
555 xor tab_0
(%rsi
), p1; \
556 xor tab_1
(%rdi
), p2; \
559 xor tab_2
(%rsi
), p3; \
560 xor tab_3
(%rdi
), p4; \
565 xor tab_0
(%rsi
), p2; \
566 xor tab_1
(%rdi
), p3; \
569 xor tab_2
(%rsi
), p4; \
570 xor tab_3
(%rdi
), p1; \
575 xor tab_0
(%rsi
), p3; \
576 xor tab_1
(%rdi
), p4; \
579 xor tab_2
(%rsi
), p1; \
580 xor tab_3
(%rdi
), p2; \
585 xor tab_0
(%rsi
), p4; \
586 xor tab_1
(%rdi
), p1; \
589 xor tab_2
(%rsi
), p2; \
594 #define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
595 mov ik_ref
(round
,0), p1; \
596 mov ik_ref
(round
,1), p2; \
597 mov ik_ref
(round
,2), p3; \
598 mov ik_ref
(round
,3), p4; \
602 movzx tab_i
(%rsi
), %esi; \
603 movzx tab_i
(%rdi
), %edi; \
610 movzx tab_i
(%rsi
), %esi; \
611 movzx tab_i
(%rdi
), %edi; \
619 movzx tab_i
(%rsi
), %esi; \
620 movzx tab_i
(%rdi
), %edi; \
627 movzx tab_i
(%rsi
), %esi; \
628 movzx tab_i
(%rdi
), %edi; \
636 movzx tab_i
(%rsi
), %esi; \
637 movzx tab_i
(%rdi
), %edi; \
644 movzx tab_i
(%rsi
), %esi; \
645 movzx tab_i
(%rdi
), %edi; \
653 movzx tab_i
(%rsi
), %esi; \
654 movzx tab_i
(%rdi
), %edi; \
661 movzx tab_i
(%rsi
), %esi; \
662 movzx tab_i
(%rdi
), %edi; \
668 #endif /* LAST_ROUND_TABLES */
672 * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
673 * const uint32_t pt[4], uint32_t ct[4])/
675 * Original interface:
676 * int aes_encrypt(const unsigned char *in,
677 * unsigned char *out, const aes_encrypt_ctx cx[1])/
682 #ifdef LAST_ROUND_TABLES
688 ENTRY_NP
(aes_encrypt_amd64
)
689 #ifdef GLADMAN_INTERFACE
691 sub $
[4*8], %rsp
/ gnu
/linux
/opensolaris binary interface
692 mov
%rsi
, (%rsp
) / output pointer
(P2
)
693 mov
%rdx
, %r8 / context
(P3
)
695 mov
%rbx
, 1*8(%rsp
) / P1
: input pointer in rdi
696 mov
%rbp
, 2*8(%rsp
) / P2
: output pointer in
(rsp
)
697 mov
%r12, 3*8(%rsp
) / P3
: context in
r8
698 movzx
4*KS_LENGTH
(kptr
), %esi
/ Get byte key length
* 16
701 / OpenSolaris OS interface
702 sub $
[4*8], %rsp
/ Make room on stack to save registers
703 mov
%rcx
, (%rsp
) / Save output pointer
(P4
) on stack
704 mov
%rdi
, %r8 / context
(P1
)
705 mov
%rdx
, %rdi
/ P3
: save input pointer
706 shl $
4, %esi
/ P2
: esi byte key length
* 16
708 mov
%rbx
, 1*8(%rsp
) / Save registers
712 / P2
: byte key length
* 16 in esi
713 / P3
: input pointer in rdi
714 / P4
: output pointer in
(rsp
)
715 #endif /* GLADMAN_INTERFACE */
717 lea enc_tab
(%rip
), tptr
720 / Load input block into registers
727 xor fofs+
4(kptr
), %ebx
728 xor fofs+
8(kptr
), %ecx
729 xor fofs+
12(kptr
), %edx
731 lea
(kptr
,%rsi
), kptr
732 / Jump based on byte key length
* 16:
739 mov $
-1, %rax
/ error
742 / Perform normal forward rounds
743 1: ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 13)
744 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 12)
745 2: ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 11)
746 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 10)
747 3: ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 9)
748 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 8)
749 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 7)
750 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 6)
751 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 5)
752 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 4)
753 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 3)
754 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 2)
755 ff_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 1)
756 fl_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 0)
765 4: / Restore registers
772 SET_SIZE
(aes_encrypt_amd64
)
776 * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
777 * const uint32_t pt[4], uint32_t ct[4])/
779 * Original interface:
780 * int aes_decrypt(const unsigned char *in,
781 * unsigned char *out, const aes_decrypt_ctx cx[1])/
786 #ifdef LAST_ROUND_TABLES
792 ENTRY_NP
(aes_decrypt_amd64
)
793 #ifdef GLADMAN_INTERFACE
795 sub $
[4*8], %rsp
/ gnu
/linux
/opensolaris binary interface
796 mov
%rsi
, (%rsp
) / output pointer
(P2
)
797 mov
%rdx
, %r8 / context
(P3
)
799 mov
%rbx
, 1*8(%rsp
) / P1
: input pointer in rdi
800 mov
%rbp
, 2*8(%rsp
) / P2
: output pointer in
(rsp
)
801 mov
%r12, 3*8(%rsp
) / P3
: context in
r8
802 movzx
4*KS_LENGTH
(kptr
), %esi
/ Get byte key length
* 16
805 / OpenSolaris OS interface
806 sub $
[4*8], %rsp
/ Make room on stack to save registers
807 mov
%rcx
, (%rsp
) / Save output pointer
(P4
) on stack
808 mov
%rdi
, %r8 / context
(P1
)
809 mov
%rdx
, %rdi
/ P3
: save input pointer
810 shl $
4, %esi
/ P2
: esi byte key length
* 16
812 mov
%rbx
, 1*8(%rsp
) / Save registers
816 / P2
: byte key length
* 16 in esi
817 / P3
: input pointer in rdi
818 / P4
: output pointer in
(rsp
)
819 #endif /* GLADMAN_INTERFACE */
821 lea dec_tab
(%rip
), tptr
824 / Load input block into registers
832 lea
(kptr
,%rsi
), kptr
834 lea
(kptr
,%rsi
), %rdi
838 xor rofs+
4(%rdi
), %ebx
839 xor rofs+
8(%rdi
), %ecx
840 xor rofs+
12(%rdi
), %edx
842 / Jump based on byte key length
* 16:
849 mov $
-1, %rax
/ error
852 / Perform normal inverse rounds
853 1: ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 13)
854 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 12)
855 2: ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 11)
856 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 10)
857 3: ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 9)
858 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 8)
859 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 7)
860 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 6)
861 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 5)
862 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 4)
863 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 3)
864 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 2)
865 ii_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 1)
866 il_rnd
(%r9d
, %r10d
, %r11d
, %r12d
, 0)
875 4: / Restore registers
882 SET_SIZE
(aes_decrypt_amd64
)