Add phnxdeco with debian patch set (version 0.33-3).
[delutions.git] / tc / crypto / Aes_x64.asm
blob1ccf73c956d0bf3c16cb07e45c655928e8a21e08
2 ; ---------------------------------------------------------------------------
3 ; Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
4 ;
5 ; LICENSE TERMS
6 ;
7 ; The free distribution and use of this software is allowed (with or without
8 ; changes) provided that:
9 ;
10 ; 1. source code distributions include the above copyright notice, this
11 ; list of conditions and the following disclaimer;
13 ; 2. binary distributions include the above copyright notice, this list
14 ; of conditions and the following disclaimer in their documentation;
16 ; 3. the name of the copyright holder is not used to endorse products
17 ; built using this software without specific written permission.
19 ; DISCLAIMER
21 ; This software is provided 'as is' with no explicit or implied warranties
22 ; in respect of its properties, including, but not limited to, correctness
23 ; and/or fitness for purpose.
24 ; ---------------------------------------------------------------------------
25 ; Issue 20/12/2007
27 ; I am grateful to Dag Arne Osvik for many discussions of the techniques that
28 ; can be used to optimise AES assembler code on AMD64/EM64T architectures.
29 ; Some of the techniques used in this implementation are the result of
30 ; suggestions made by him for which I am most grateful.
32 ; An AES implementation for AMD64 processors using the YASM assembler. This
33 ; implementation provides only encryption, decryption and hence requires key
34 ; scheduling support in C. It uses 8k bytes of tables but its encryption and
35 ; decryption performance is very close to that obtained using large tables.
36 ; It can use either Windows or Gnu/Linux calling conventions, which are as
37 ; follows:
38 ; windows gnu/linux
40 ; in_blk rcx rdi
41 ; out_blk rdx rsi
42 ; context (cx) r8 rdx
44 ; preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
45 ; registers rdi - on both
47 ; destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
48 ; registers - rdi on both
50 ; The default convention is that for windows, the gnu/linux convention being
51 ; used if __GNUC__ is defined.
53 ; Define _SEH_ to include support for Win64 structured exception handling
54 ; (this requires YASM version 0.6 or later).
56 ; This code provides the standard AES block size (128 bits, 16 bytes) and the
57 ; three standard AES key sizes (128, 192 and 256 bits). It has the same call
58 ; interface as my C implementation. It uses the Microsoft C AMD64 calling
59 ; conventions in which the three parameters are placed in rcx, rdx and r8
60 ; respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
62 ; AES_RETURN aes_encrypt(const unsigned char in_blk[],
63 ; unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
65 ; AES_RETURN aes_decrypt(const unsigned char in_blk[],
66 ; unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
68 ; AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
69 ; const aes_encrypt_ctx cx[1]);
71 ; AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
72 ; const aes_decrypt_ctx cx[1]);
74 ; AES_RETURN aes_encrypt_key(const unsigned char key[],
75 ; unsigned int len, const aes_decrypt_ctx cx[1]);
77 ; AES_RETURN aes_decrypt_key(const unsigned char key[],
78 ; unsigned int len, const aes_decrypt_ctx cx[1]);
80 ; where <NNN> is 128, 192 or 256. In the last two calls the length can be in
81 ; either bits or bytes.
83 ; Comment in/out the following lines to obtain the desired subroutines. These
84 ; selections MUST match those in the C header file aes.h
; User-configurable build options. These selections MUST match aes.h.
; %define AES_128           ; define if AES with 128 bit keys is needed
; %define AES_192           ; define if AES with 192 bit keys is needed
%define AES_256             ; define if AES with 256 bit keys is needed
; %define AES_VAR           ; define if a variable key size is needed
%define ENCRYPTION          ; define if encryption is needed
%define DECRYPTION          ; define if decryption is needed
%define AES_REV_DKS         ; define if key decryption schedule is reversed
%define LAST_ROUND_TABLES   ; define for the faster version using extra tables
95 ; The encryption key schedule has the following in memory layout where N is the
96 ; number of rounds (10, 12 or 14):
98 ; lo: | input key (round 0) | ; each round is four 32-bit words
99 ; | encryption round 1 |
100 ; | encryption round 2 |
101 ; ....
102 ; | encryption round N-1 |
103 ; hi: | encryption round N |
105 ; The decryption key schedule is normally set up so that it has the same
106 ; layout as above by actually reversing the order of the encryption key
107 ; schedule in memory (this happens when AES_REV_DKS is set):
109 ; lo: | decryption round 0 | = | encryption round N |
110 ; | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
111 ; | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
112 ; .... ....
113 ; | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
114 ; hi: | decryption round N | = | input key (round 0) |
116 ; with rounds except the first and last modified using inv_mix_column()
117 ; But if AES_REV_DKS is NOT set the order of keys is left as it is for
118 ; encryption so that it has to be accessed in reverse when used for
119 ; decryption (although the inverse mix column modifications are done)
121 ; lo: | decryption round 0 | = | input key (round 0) |
122 ; | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
123 ; | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
124 ; .... ....
125 ; | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
126 ; hi: | decryption round N | = | encryption round N |
128 ; This layout is faster when the assembler key scheduling provided here
129 ; is used.
131 ; The DLL interface must use the _stdcall convention in which the number
132 ; of bytes of parameter space is added after an @ to the subroutine's name.
133 ; We must also remove our parameters from the stack before return (see
134 ; the do_exit macro). Define DLL_EXPORT for the Dynamic Link Library version.
136 ;%define DLL_EXPORT
138 ; End of user defines
; A runtime-variable key length (AES_VAR) needs all three fixed key sizes
; compiled in, so force-enable any that are not already defined.
%ifdef AES_VAR
%ifndef AES_128
%define AES_128
%endif
%ifndef AES_192
%define AES_192
%endif
%ifndef AES_256
%define AES_256
%endif
%endif

; KS_LENGTH is the key schedule length in 32-bit words: 4*(rounds+1),
; sized for the largest key length that is enabled.
%ifdef AES_VAR
%define KS_LENGTH   60
%elifdef AES_256
%define KS_LENGTH   60      ; 14 rounds
%elifdef AES_192
%define KS_LENGTH   52      ; 12 rounds
%else
%define KS_LENGTH   44      ; 10 rounds
%endif
; Numbered aliases for the eight legacy registers plus their dword (d),
; low-byte (b) and high-byte (h) views, so round macros can name them
; uniformly.
%define r0  rax
%define r1  rdx
%define r2  rcx
%define r3  rbx
%define r4  rsi
%define r5  rdi
%define r6  rbp
%define r7  rsp

%define raxd    eax
%define rdxd    edx
%define rcxd    ecx
%define rbxd    ebx
%define rsid    esi
%define rdid    edi
%define rbpd    ebp
%define rspd    esp

%define raxb    al
%define rdxb    dl
%define rcxb    cl
%define rbxb    bl
%define rsib    sil
%define rdib    dil
%define rbpb    bpl
%define rspb    spl

%define r0h     ah
%define r1h     dh
%define r2h     ch
%define r3h     bh

%define r0d     eax
%define r1d     edx
%define r2d     ecx
%define r3d     ebx
; finite field multiplies by {02}, {04} and {08} in GF(2^8) mod 0x11b

%define f2(x)   ((x<<1)^(((x>>7)&1)*0x11b))
%define f4(x)   ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
%define f8(x)   ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

; finite field multiplies required in table generation

%define f3(x)   (f2(x) ^ x)
%define f9(x)   (f8(x) ^ x)
%define fb(x)   (f8(x) ^ f2(x) ^ x)
%define fd(x)   (f8(x) ^ f4(x) ^ x)
%define fe(x)   (f8(x) ^ f4(x) ^ f2(x))
; macro for expanding the forward S-box data through a per-byte transform %1
; (each of the 256 S-box entries becomes eight table bytes)

%macro enc_vals 1
    db  %1(0x63),%1(0x7c),%1(0x77),%1(0x7b),%1(0xf2),%1(0x6b),%1(0x6f),%1(0xc5)
    db  %1(0x30),%1(0x01),%1(0x67),%1(0x2b),%1(0xfe),%1(0xd7),%1(0xab),%1(0x76)
    db  %1(0xca),%1(0x82),%1(0xc9),%1(0x7d),%1(0xfa),%1(0x59),%1(0x47),%1(0xf0)
    db  %1(0xad),%1(0xd4),%1(0xa2),%1(0xaf),%1(0x9c),%1(0xa4),%1(0x72),%1(0xc0)
    db  %1(0xb7),%1(0xfd),%1(0x93),%1(0x26),%1(0x36),%1(0x3f),%1(0xf7),%1(0xcc)
    db  %1(0x34),%1(0xa5),%1(0xe5),%1(0xf1),%1(0x71),%1(0xd8),%1(0x31),%1(0x15)
    db  %1(0x04),%1(0xc7),%1(0x23),%1(0xc3),%1(0x18),%1(0x96),%1(0x05),%1(0x9a)
    db  %1(0x07),%1(0x12),%1(0x80),%1(0xe2),%1(0xeb),%1(0x27),%1(0xb2),%1(0x75)
    db  %1(0x09),%1(0x83),%1(0x2c),%1(0x1a),%1(0x1b),%1(0x6e),%1(0x5a),%1(0xa0)
    db  %1(0x52),%1(0x3b),%1(0xd6),%1(0xb3),%1(0x29),%1(0xe3),%1(0x2f),%1(0x84)
    db  %1(0x53),%1(0xd1),%1(0x00),%1(0xed),%1(0x20),%1(0xfc),%1(0xb1),%1(0x5b)
    db  %1(0x6a),%1(0xcb),%1(0xbe),%1(0x39),%1(0x4a),%1(0x4c),%1(0x58),%1(0xcf)
    db  %1(0xd0),%1(0xef),%1(0xaa),%1(0xfb),%1(0x43),%1(0x4d),%1(0x33),%1(0x85)
    db  %1(0x45),%1(0xf9),%1(0x02),%1(0x7f),%1(0x50),%1(0x3c),%1(0x9f),%1(0xa8)
    db  %1(0x51),%1(0xa3),%1(0x40),%1(0x8f),%1(0x92),%1(0x9d),%1(0x38),%1(0xf5)
    db  %1(0xbc),%1(0xb6),%1(0xda),%1(0x21),%1(0x10),%1(0xff),%1(0xf3),%1(0xd2)
    db  %1(0xcd),%1(0x0c),%1(0x13),%1(0xec),%1(0x5f),%1(0x97),%1(0x44),%1(0x17)
    db  %1(0xc4),%1(0xa7),%1(0x7e),%1(0x3d),%1(0x64),%1(0x5d),%1(0x19),%1(0x73)
    db  %1(0x60),%1(0x81),%1(0x4f),%1(0xdc),%1(0x22),%1(0x2a),%1(0x90),%1(0x88)
    db  %1(0x46),%1(0xee),%1(0xb8),%1(0x14),%1(0xde),%1(0x5e),%1(0x0b),%1(0xdb)
    db  %1(0xe0),%1(0x32),%1(0x3a),%1(0x0a),%1(0x49),%1(0x06),%1(0x24),%1(0x5c)
    db  %1(0xc2),%1(0xd3),%1(0xac),%1(0x62),%1(0x91),%1(0x95),%1(0xe4),%1(0x79)
    db  %1(0xe7),%1(0xc8),%1(0x37),%1(0x6d),%1(0x8d),%1(0xd5),%1(0x4e),%1(0xa9)
    db  %1(0x6c),%1(0x56),%1(0xf4),%1(0xea),%1(0x65),%1(0x7a),%1(0xae),%1(0x08)
    db  %1(0xba),%1(0x78),%1(0x25),%1(0x2e),%1(0x1c),%1(0xa6),%1(0xb4),%1(0xc6)
    db  %1(0xe8),%1(0xdd),%1(0x74),%1(0x1f),%1(0x4b),%1(0xbd),%1(0x8b),%1(0x8a)
    db  %1(0x70),%1(0x3e),%1(0xb5),%1(0x66),%1(0x48),%1(0x03),%1(0xf6),%1(0x0e)
    db  %1(0x61),%1(0x35),%1(0x57),%1(0xb9),%1(0x86),%1(0xc1),%1(0x1d),%1(0x9e)
    db  %1(0xe1),%1(0xf8),%1(0x98),%1(0x11),%1(0x69),%1(0xd9),%1(0x8e),%1(0x94)
    db  %1(0x9b),%1(0x1e),%1(0x87),%1(0xe9),%1(0xce),%1(0x55),%1(0x28),%1(0xdf)
    db  %1(0x8c),%1(0xa1),%1(0x89),%1(0x0d),%1(0xbf),%1(0xe6),%1(0x42),%1(0x68)
    db  %1(0x41),%1(0x99),%1(0x2d),%1(0x0f),%1(0xb0),%1(0x54),%1(0xbb),%1(0x16)
%endmacro
; macro for expanding the inverse S-box data through a per-byte transform %1

%macro dec_vals 1
    db  %1(0x52),%1(0x09),%1(0x6a),%1(0xd5),%1(0x30),%1(0x36),%1(0xa5),%1(0x38)
    db  %1(0xbf),%1(0x40),%1(0xa3),%1(0x9e),%1(0x81),%1(0xf3),%1(0xd7),%1(0xfb)
    db  %1(0x7c),%1(0xe3),%1(0x39),%1(0x82),%1(0x9b),%1(0x2f),%1(0xff),%1(0x87)
    db  %1(0x34),%1(0x8e),%1(0x43),%1(0x44),%1(0xc4),%1(0xde),%1(0xe9),%1(0xcb)
    db  %1(0x54),%1(0x7b),%1(0x94),%1(0x32),%1(0xa6),%1(0xc2),%1(0x23),%1(0x3d)
    db  %1(0xee),%1(0x4c),%1(0x95),%1(0x0b),%1(0x42),%1(0xfa),%1(0xc3),%1(0x4e)
    db  %1(0x08),%1(0x2e),%1(0xa1),%1(0x66),%1(0x28),%1(0xd9),%1(0x24),%1(0xb2)
    db  %1(0x76),%1(0x5b),%1(0xa2),%1(0x49),%1(0x6d),%1(0x8b),%1(0xd1),%1(0x25)
    db  %1(0x72),%1(0xf8),%1(0xf6),%1(0x64),%1(0x86),%1(0x68),%1(0x98),%1(0x16)
    db  %1(0xd4),%1(0xa4),%1(0x5c),%1(0xcc),%1(0x5d),%1(0x65),%1(0xb6),%1(0x92)
    db  %1(0x6c),%1(0x70),%1(0x48),%1(0x50),%1(0xfd),%1(0xed),%1(0xb9),%1(0xda)
    db  %1(0x5e),%1(0x15),%1(0x46),%1(0x57),%1(0xa7),%1(0x8d),%1(0x9d),%1(0x84)
    db  %1(0x90),%1(0xd8),%1(0xab),%1(0x00),%1(0x8c),%1(0xbc),%1(0xd3),%1(0x0a)
    db  %1(0xf7),%1(0xe4),%1(0x58),%1(0x05),%1(0xb8),%1(0xb3),%1(0x45),%1(0x06)
    db  %1(0xd0),%1(0x2c),%1(0x1e),%1(0x8f),%1(0xca),%1(0x3f),%1(0x0f),%1(0x02)
    db  %1(0xc1),%1(0xaf),%1(0xbd),%1(0x03),%1(0x01),%1(0x13),%1(0x8a),%1(0x6b)
    db  %1(0x3a),%1(0x91),%1(0x11),%1(0x41),%1(0x4f),%1(0x67),%1(0xdc),%1(0xea)
    db  %1(0x97),%1(0xf2),%1(0xcf),%1(0xce),%1(0xf0),%1(0xb4),%1(0xe6),%1(0x73)
    db  %1(0x96),%1(0xac),%1(0x74),%1(0x22),%1(0xe7),%1(0xad),%1(0x35),%1(0x85)
    db  %1(0xe2),%1(0xf9),%1(0x37),%1(0xe8),%1(0x1c),%1(0x75),%1(0xdf),%1(0x6e)
    db  %1(0x47),%1(0xf1),%1(0x1a),%1(0x71),%1(0x1d),%1(0x29),%1(0xc5),%1(0x89)
    db  %1(0x6f),%1(0xb7),%1(0x62),%1(0x0e),%1(0xaa),%1(0x18),%1(0xbe),%1(0x1b)
    db  %1(0xfc),%1(0x56),%1(0x3e),%1(0x4b),%1(0xc6),%1(0xd2),%1(0x79),%1(0x20)
    db  %1(0x9a),%1(0xdb),%1(0xc0),%1(0xfe),%1(0x78),%1(0xcd),%1(0x5a),%1(0xf4)
    db  %1(0x1f),%1(0xdd),%1(0xa8),%1(0x33),%1(0x88),%1(0x07),%1(0xc7),%1(0x31)
    db  %1(0xb1),%1(0x12),%1(0x10),%1(0x59),%1(0x27),%1(0x80),%1(0xec),%1(0x5f)
    db  %1(0x60),%1(0x51),%1(0x7f),%1(0xa9),%1(0x19),%1(0xb5),%1(0x4a),%1(0x0d)
    db  %1(0x2d),%1(0xe5),%1(0x7a),%1(0x9f),%1(0x93),%1(0xc9),%1(0x9c),%1(0xef)
    db  %1(0xa0),%1(0xe0),%1(0x3b),%1(0x4d),%1(0xae),%1(0x2a),%1(0xf5),%1(0xb0)
    db  %1(0xc8),%1(0xeb),%1(0xbb),%1(0x3c),%1(0x83),%1(0x53),%1(0x99),%1(0x61)
    db  %1(0x17),%1(0x2b),%1(0x04),%1(0x7e),%1(0xba),%1(0x77),%1(0xd6),%1(0x26)
    db  %1(0xe1),%1(0x69),%1(0x14),%1(0x63),%1(0x55),%1(0x21),%1(0x0c),%1(0x7d)
%endmacro
; Per-entry table expansions: u8 builds the forward round table, v8 the
; inverse round table, w8 the (optional) last-round tables.
%define u8(x)   f2(x), x, x, f3(x), f2(x), x, x, f3(x)
%define v8(x)   fe(x), f9(x), fd(x), fb(x), fe(x), f9(x), fd(x), x
%define w8(x)   x, 0, 0, 0, x, 0, 0, 0

%define tptr    rbp     ; table pointer
%define kptr    r8      ; key schedule pointer
%define fofs    128     ; adjust offset in key schedule to keep |disp| < 128
%define fk_ref(x,y) [kptr-16*x+fofs+4*y]
%ifdef AES_REV_DKS
%define rofs    128
%define ik_ref(x,y) [kptr-16*x+rofs+4*y]
%else
%define rofs    -128
%define ik_ref(x,y) [kptr+16*x+rofs+4*y]
%endif

; Accessors for the four rotated byte positions within an 8-byte table
; entry, plus the byte-only last-round (f) and inverse (i) variants.
%define tab_0(x)    [tptr+8*x]
%define tab_1(x)    [tptr+8*x+3]
%define tab_2(x)    [tptr+8*x+2]
%define tab_3(x)    [tptr+8*x+1]
%define tab_f(x)    byte [tptr+8*x+1]
%define tab_i(x)    byte [tptr+8*x+7]
%define t_ref(x,r)  tab_ %+ x(r)
; Normal forward round: %1-%4 receive the four output columns, %5 is the
; round number. State enters in eax..edx and is written back at the end.
; Clobbers esi/edi as byte-extraction scratch.
%macro ff_rnd 5                 ; normal forward round
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    mov     eax, %1d            ; move the new state back into eax..edx
    mov     ebx, %2d
    mov     ecx, %3d
    mov     edx, %4d
%endmacro
%ifdef LAST_ROUND_TABLES

; Last forward round using the extra 2 KB last-round table (placed 2048
; bytes after the main table). Output is left in %1-%4; eax..edx are
; consumed; esi/edi are scratch.
%macro fl_rnd 5                 ; last forward round
    add     tptr, 2048          ; switch to the last-round table
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)
%endmacro

%else

; Last forward round without the extra tables: fetch single S-box bytes
; (tab_f) and rotate each into its column position.
%macro fl_rnd 5                 ; last forward round
    mov     %1d, fk_ref(%5,0)
    mov     %2d, fk_ref(%5,1)
    mov     %3d, fk_ref(%5,2)
    mov     %4d, fk_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %1d, esi
    rol     edi, 8
    xor     %4d, edi
    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %3d, esi
    xor     %2d, edi

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    xor     %2d, esi
    rol     edi, 8
    xor     %1d, edi
    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %4d, esi
    xor     %3d, edi

    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     ecx, 16
    xor     %3d, esi
    rol     edi, 8
    xor     %2d, edi
    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %1d, esi
    xor     %4d, edi

    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    shr     edx, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %3d, edi
    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(f,rsi)
    movzx   edi, t_ref(f,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %2d, esi
    xor     %1d, edi
%endmacro

%endif
; Normal inverse round: like ff_rnd but with the inverse key schedule
; (ik_ref) and the inverse column rotation pattern.
%macro ii_rnd 5                 ; normal inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)

    mov     eax, %1d            ; move the new state back into eax..edx
    mov     ebx, %2d
    mov     ecx, %3d
    mov     edx, %4d
%endmacro
%ifdef LAST_ROUND_TABLES

; Last inverse round using the extra 2 KB last-round table. Output is
; left in %1-%4; eax..edx are consumed; esi/edi are scratch.
%macro il_rnd 5                 ; last inverse round
    add     tptr, 2048          ; switch to the last-round table
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    shr     eax, 16
    xor     %1d, t_ref(0,rsi)
    xor     %2d, t_ref(1,rdi)
    movzx   esi, al
    movzx   edi, ah
    xor     %3d, t_ref(2,rsi)
    xor     %4d, t_ref(3,rdi)

    movzx   esi, bl
    movzx   edi, bh
    shr     ebx, 16
    xor     %2d, t_ref(0,rsi)
    xor     %3d, t_ref(1,rdi)
    movzx   esi, bl
    movzx   edi, bh
    xor     %4d, t_ref(2,rsi)
    xor     %1d, t_ref(3,rdi)

    movzx   esi, cl
    movzx   edi, ch
    shr     ecx, 16
    xor     %3d, t_ref(0,rsi)
    xor     %4d, t_ref(1,rdi)
    movzx   esi, cl
    movzx   edi, ch
    xor     %1d, t_ref(2,rsi)
    xor     %2d, t_ref(3,rdi)

    movzx   esi, dl
    movzx   edi, dh
    shr     edx, 16
    xor     %4d, t_ref(0,rsi)
    xor     %1d, t_ref(1,rdi)
    movzx   esi, dl
    movzx   edi, dh
    xor     %2d, t_ref(2,rsi)
    xor     %3d, t_ref(3,rdi)
%endmacro

%else

; Last inverse round without the extra tables: fetch single inverse
; S-box bytes (tab_i) and rotate each into its column position.
%macro il_rnd 5                 ; last inverse round
    mov     %1d, ik_ref(%5,0)
    mov     %2d, ik_ref(%5,1)
    mov     %3d, ik_ref(%5,2)
    mov     %4d, ik_ref(%5,3)

    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     eax, 16
    xor     %1d, esi
    rol     edi, 8
    xor     %2d, edi
    movzx   esi, al
    movzx   edi, ah
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %3d, esi
    xor     %4d, edi

    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     ebx, 16
    xor     %2d, esi
    rol     edi, 8
    xor     %3d, edi
    movzx   esi, bl
    movzx   edi, bh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %4d, esi
    xor     %1d, edi

    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     ecx, 16
    xor     %3d, esi
    rol     edi, 8
    xor     %4d, edi
    movzx   esi, cl
    movzx   edi, ch
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %1d, esi
    xor     %2d, edi

    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    shr     edx, 16
    xor     %4d, esi
    rol     edi, 8
    xor     %1d, edi
    movzx   esi, dl
    movzx   edi, dh
    movzx   esi, t_ref(i,rsi)
    movzx   edi, t_ref(i,rdi)
    rol     esi, 16
    rol     edi, 24
    xor     %2d, esi
    xor     %3d, edi
%endmacro

%endif
%ifdef ENCRYPTION

    global  aes_encrypt
%ifdef DLL_EXPORT
    export  aes_encrypt
%endif

    section .data align=64
    align   64
enc_tab:
    enc_vals u8
%ifdef LAST_ROUND_TABLES
    enc_vals w8
%endif

    section .text align=16
    align   16

;-----------------------------------------------------------------------
; AES_RETURN aes_encrypt(const unsigned char in_blk[],
;                        unsigned char out_blk[],
;                        const aes_encrypt_ctx cx[1]);
; Windows ABI: in_blk=rcx, out_blk=rdx, cx=r8.
; Gnu/Linux ABI (__GNUC__): in_blk=rdi, out_blk=rsi, cx=rdx.
; Returns 0 in rax on success, -1 if the context round count is invalid.
; Callee-saved rbx, rbp, r12 (and rsi/rdi on Windows) are preserved
; on the stack; [rsp+0*8] holds the output pointer during the rounds.
;-----------------------------------------------------------------------
%ifdef _SEH_
proc_frame aes_encrypt
    alloc_stack 7*8             ; 7 to align stack to 16 bytes
    save_reg rsi,4*8
    save_reg rdi,5*8
    save_reg rbx,1*8
    save_reg rbp,2*8
    save_reg r12,3*8
end_prologue
    mov     rdi, rcx            ; input pointer
    mov     [rsp+0*8], rdx      ; output pointer
%else
aes_encrypt:
 %ifdef __GNUC__
    sub     rsp, 4*8            ; gnu/linux binary interface
    mov     [rsp+0*8], rsi      ; output pointer
    mov     r8, rdx             ; context
 %else
    sub     rsp, 6*8            ; windows binary interface
    mov     [rsp+4*8], rsi
    mov     [rsp+5*8], rdi
    mov     rdi, rcx            ; input pointer
    mov     [rsp+0*8], rdx      ; output pointer
 %endif
    mov     [rsp+1*8], rbx      ; input pointer in rdi
    mov     [rsp+2*8], rbp      ; output pointer in [rsp]
    mov     [rsp+3*8], r12      ; context in r8
%endif

    movzx   esi, byte [kptr+4*KS_LENGTH]    ; esi = 16 * number of rounds
    lea     tptr, [enc_tab wrt rip]
    sub     kptr, fofs

    ; input four columns and xor in first round key
    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

    xor     eax, [kptr+fofs]
    xor     ebx, [kptr+fofs+4]
    xor     ecx, [kptr+fofs+8]
    xor     edx, [kptr+fofs+12]

    lea     kptr, [kptr+rsi]
    cmp     esi, 10*16          ; 10 rounds: 128-bit key
    je      .3
    cmp     esi, 12*16          ; 12 rounds: 192-bit key
    je      .2
    cmp     esi, 14*16          ; 14 rounds: 256-bit key
    je      .1
    mov     rax, -1             ; invalid round count in context
    jmp     .4

.1: ff_rnd  r9, r10, r11, r12, 13   ; 14 rounds for 256-bit key
    ff_rnd  r9, r10, r11, r12, 12
.2: ff_rnd  r9, r10, r11, r12, 11   ; 12 rounds for 192-bit key
    ff_rnd  r9, r10, r11, r12, 10
.3: ff_rnd  r9, r10, r11, r12, 9    ; 10 rounds for 128-bit key
    ff_rnd  r9, r10, r11, r12, 8
    ff_rnd  r9, r10, r11, r12, 7
    ff_rnd  r9, r10, r11, r12, 6
    ff_rnd  r9, r10, r11, r12, 5
    ff_rnd  r9, r10, r11, r12, 4
    ff_rnd  r9, r10, r11, r12, 3
    ff_rnd  r9, r10, r11, r12, 2
    ff_rnd  r9, r10, r11, r12, 1
    fl_rnd  r9, r10, r11, r12, 0

    mov     rbx, [rsp]          ; output pointer
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax            ; success
.4:                             ; common exit (also the error path target)
    mov     rbx, [rsp+1*8]
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef __GNUC__
    add     rsp, 4*8
    ret
%else
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
 %ifdef _SEH_
    add     rsp, 7*8
    ret
endproc_frame
 %else
    add     rsp, 6*8
    ret
 %endif
%endif

%endif
%ifdef DECRYPTION

    global  aes_decrypt
%ifdef DLL_EXPORT
    export  aes_decrypt
%endif

    section .data
    align   64
dec_tab:
    dec_vals v8
%ifdef LAST_ROUND_TABLES
    dec_vals w8
%endif

    section .text
    align   16

;-----------------------------------------------------------------------
; AES_RETURN aes_decrypt(const unsigned char in_blk[],
;                        unsigned char out_blk[],
;                        const aes_decrypt_ctx cx[1]);
; Windows ABI: in_blk=rcx, out_blk=rdx, cx=r8.
; Gnu/Linux ABI (__GNUC__): in_blk=rdi, out_blk=rsi, cx=rdx.
; Returns 0 in rax on success, -1 if the context round count is invalid.
; The first round key is taken from the low (AES_REV_DKS) or high end of
; the schedule, matching the key layout described in the file header.
;-----------------------------------------------------------------------
%ifdef _SEH_
proc_frame aes_decrypt
    alloc_stack 7*8             ; 7 to align stack to 16 bytes
    save_reg rsi,4*8
    save_reg rdi,5*8
    save_reg rbx,1*8
    save_reg rbp,2*8
    save_reg r12,3*8
end_prologue
    mov     rdi, rcx            ; input pointer
    mov     [rsp+0*8], rdx      ; output pointer
%else
aes_decrypt:
 %ifdef __GNUC__
    sub     rsp, 4*8            ; gnu/linux binary interface
    mov     [rsp+0*8], rsi      ; output pointer
    mov     r8, rdx             ; context
 %else
    sub     rsp, 6*8            ; windows binary interface
    mov     [rsp+4*8], rsi
    mov     [rsp+5*8], rdi
    mov     rdi, rcx            ; input pointer
    mov     [rsp+0*8], rdx      ; output pointer
 %endif
    mov     [rsp+1*8], rbx      ; input pointer in rdi
    mov     [rsp+2*8], rbp      ; output pointer in [rsp]
    mov     [rsp+3*8], r12      ; context in r8
%endif

    movzx   esi, byte [kptr+4*KS_LENGTH]    ; esi = 16 * number of rounds
    lea     tptr, [dec_tab wrt rip]
    sub     kptr, rofs

    ; input four columns
    mov     eax, [rdi+0*4]
    mov     ebx, [rdi+1*4]
    mov     ecx, [rdi+2*4]
    mov     edx, [rdi+3*4]

%ifdef AES_REV_DKS
    mov     rdi, kptr           ; first round key at the low end
    lea     kptr, [kptr+rsi]
%else
    lea     rdi, [kptr+rsi]     ; first round key at the high end
%endif

    ; xor in the first round key
    xor     eax, [rdi+rofs]
    xor     ebx, [rdi+rofs+4]
    xor     ecx, [rdi+rofs+8]
    xor     edx, [rdi+rofs+12]

    cmp     esi, 10*16          ; 10 rounds: 128-bit key
    je      .3
    cmp     esi, 12*16          ; 12 rounds: 192-bit key
    je      .2
    cmp     esi, 14*16          ; 14 rounds: 256-bit key
    je      .1
    mov     rax, -1             ; invalid round count in context
    jmp     .4

.1: ii_rnd  r9, r10, r11, r12, 13   ; 14 rounds for 256-bit key
    ii_rnd  r9, r10, r11, r12, 12
.2: ii_rnd  r9, r10, r11, r12, 11   ; 12 rounds for 192-bit key
    ii_rnd  r9, r10, r11, r12, 10
.3: ii_rnd  r9, r10, r11, r12, 9    ; 10 rounds for 128-bit key
    ii_rnd  r9, r10, r11, r12, 8
    ii_rnd  r9, r10, r11, r12, 7
    ii_rnd  r9, r10, r11, r12, 6
    ii_rnd  r9, r10, r11, r12, 5
    ii_rnd  r9, r10, r11, r12, 4
    ii_rnd  r9, r10, r11, r12, 3
    ii_rnd  r9, r10, r11, r12, 2
    ii_rnd  r9, r10, r11, r12, 1
    il_rnd  r9, r10, r11, r12, 0

    mov     rbx, [rsp]          ; output pointer
    mov     [rbx], r9d
    mov     [rbx+4], r10d
    mov     [rbx+8], r11d
    mov     [rbx+12], r12d
    xor     rax, rax            ; success
.4: mov     rbx, [rsp+1*8]      ; common exit (also the error path target)
    mov     rbp, [rsp+2*8]
    mov     r12, [rsp+3*8]
%ifdef __GNUC__
    add     rsp, 4*8
    ret
%else
    mov     rsi, [rsp+4*8]
    mov     rdi, [rsp+5*8]
 %ifdef _SEH_
    add     rsp, 7*8
    ret
endproc_frame
 %else
    add     rsp, 6*8
    ret
 %endif
%endif

%endif