######################################################################
## Constant-time SSSE3 AES core implementation.
## By Mike Hamburg (Stanford University), 2009
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# Interface to OpenSSL as "almost" drop-in replacement for
# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (it doesn't have to if called from
# EVP only). "Drop-in" implies that this module neither shares key
# schedule structure with the original nor makes assumptions about
# its alignment...
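#
# The entry points emitted below (see the .globl directives further
# down) are meant to mirror their aes-x86_64.pl counterparts. As a
# hedged sketch only -- assuming $PREFIX is set to "vpaes" elsewhere
# in this file -- the expected C-level prototypes would be:
#
#	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
#	                           AES_KEY *key);
#	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#	                   const AES_KEY *key);
#	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                       size_t length, const AES_KEY *key,
#	                       unsigned char *ivp, const int enc);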
# Performance summary. The aes-x86_64.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with a 128-bit key, and the vpaes-x86_64.pl column
# lists [also large-block CBC] encrypt/decrypt results.
#
#		aes-x86_64.pl		vpaes-x86_64.pl
#
# Core 2(**)	30.5/43.7/14.3		21.8/25.7(***)
# Nehalem	30.5/42.2/14.6		 9.8/11.8
# Atom		63.9/79.0/32.1		64.0/84.8(***)
# (*)	"Hyper-threading" in this context refers to cache shared among
#	multiple cores rather than to Intel HTT specifically. As the
#	vast majority of contemporary cores share cache, the slower
#	code path is commonplace. In other words, the
#	"with-hyper-threading-off" results are presented mostly for
#	reference purposes.
#
# (**)	"Core 2" refers to the initial 65nm design, a.k.a. Conroe.
#
# (***)	The less impressive improvement on Core 2 and Atom is due to
#	slow pshufb, yet it's a respectable +40%/+78% improvement on
#	Core 2 (as implied, over the "hyper-threading-safe" code path).
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";
##    %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6-%xmm8 so you get some local vectors
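##
##  For orientation (this just summarizes what _vpaes_preheat below
##  loads; it is not an extra requirement): %xmm9 holds the 0x0F nibble
##  mask .Lk_s0F, %xmm10/%xmm11 hold the .Lk_inv halves, %xmm12/%xmm13
##  the .Lk_sb1 halves and %xmm14/%xmm15 the .Lk_sb2 halves, so the
##  core never touches memory for its S-box lookups.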
.type	_vpaes_encrypt_core,\@abi-omnipotent
	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	movdqu	(%r9), %xmm5		# round0 key
	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
	lea	.Lk_mc_backward(%rip),%r10
	# middle of middle round
	movdqa	%xmm13,	%xmm4		# 4 : sb1u
	pshufb	%xmm2,	%xmm4		# 4 = sb1u
	pxor	%xmm5,	%xmm4		# 4 = sb1u + k
	movdqa	%xmm12,	%xmm0		# 0 : sb1t
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4,	%xmm0		# 0 = A
	movdqa	%xmm15,	%xmm5		# 4 : sb2u
	pshufb	%xmm2,	%xmm5		# 4 = sb2u
	movdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	movdqa	%xmm14,	%xmm2		# 2 : sb2t
	pshufb	%xmm3,	%xmm2		# 2 = sb2t
	pxor	%xmm5,	%xmm2		# 2 = 2A
	movdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	movdqa	%xmm0,	%xmm3		# 3 = A
	pshufb	%xmm1,	%xmm0		# 0 = B
	add	\$16,	%r9		# next key
	pxor	%xmm2,	%xmm0		# 0 = 2A+B
	pshufb	%xmm4,	%xmm3		# 3 = D
	add	\$16,	%r11		# next mc
	pxor	%xmm0,	%xmm3		# 3 = 2A+B+D
	pshufb	%xmm1,	%xmm0		# 0 = 2B+C
	and	\$0x30,	%r11		# ... mod 4
	pxor	%xmm3,	%xmm0		# 0 = 2A+3B+C+D
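	# (Informal sketch of the arithmetic above: A is the SubBytes output
	# and 2A its xtime, taken from the sb2 tables; the .Lk_mc_forward /
	# .Lk_mc_backward shuffles rotate bytes within each column, so the
	# final 2A+3B+C+D in every byte lane is the MixColumns combination
	# 2*a0 ^ 3*a1 ^ a2 ^ a3 of that lane and its rotated neighbours,
	# all arithmetic in GF(2^8) with '+' meaning xor.)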
	movdqa	%xmm9,	%xmm1		# 1 : i
	pandn	%xmm0,	%xmm1		# 1 = i<<4
	psrld	\$4,	%xmm1		# 1 = i
	pand	%xmm9,	%xmm0		# 0 = k
	movdqa	%xmm11,	%xmm5		# 2 : a/k
	pshufb	%xmm0,	%xmm5		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb	%xmm1,	%xmm3		# 3 = 1/i
	pxor	%xmm5,	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0,	%xmm4		# 4 = 1/j
	pxor	%xmm5,	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0,	%xmm2		# 2 = io
	movdqa	%xmm10,	%xmm3		# 3 : 1/jak
	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
	pxor	%xmm1,	%xmm3		# 3 = jo
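	# (The block above is the nibble-decomposed S-box: each byte is split
	# into high nibble i and low nibble k, j = i^k, and the 16-entry
	# .Lk_inv tables are combined via pshufb so that the GF(2^8)
	# inversion needs only 4-bit table lookups; see the vpaes paper
	# cited at the top for the exact decomposition.)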
	# middle of last round
	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	pshufb	%xmm2,	%xmm4		# 4 = sbou
	pxor	%xmm5,	%xmm4		# 4 = sb1u + k
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	movdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	pxor	%xmm4,	%xmm0		# 0 = A
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
##  Same API as encryption core.
.type	_vpaes_decrypt_core,\@abi-omnipotent
	mov	%rdx,	%r9		# load key
	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	movdqu	(%r9), %xmm5		# round0 key
	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
	lea	.Lk_dsbd(%rip),%r10
	movdqa	.Lk_mc_forward+48(%rip), %xmm5
##  Inverse mix columns
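##  (Sketch: ch accumulates the inverse-MixColumns result one
##  multiplier at a time -- the *9 contribution from dsb9 first, then
##  after each .Lk_mc_forward rotation the *D, *B and *E contributions
##  from dsbd, dsbb and dsbe -- matching the E,B,D,9 circulant named in
##  the key-schedule comments further down.)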
	movdqa	-0x20(%r10),%xmm4	# 4 : sb9u
	pshufb	%xmm2,	%xmm4		# 4 = sb9u
	movdqa	-0x10(%r10),%xmm0	# 0 : sb9t
	pshufb	%xmm3,	%xmm0		# 0 = sb9t
	pxor	%xmm4,	%xmm0		# 0 = ch
	add	\$16, %r9		# next round key

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa	0x00(%r10),%xmm4	# 4 : sbdu
	pshufb	%xmm2,	%xmm4		# 4 = sbdu
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa	0x10(%r10),%xmm0	# 0 : sbdt
	pshufb	%xmm3,	%xmm0		# 0 = sbdt
	pxor	%xmm4,	%xmm0		# 0 = ch

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa	0x20(%r10),%xmm4	# 4 : sbbu
	pshufb	%xmm2,	%xmm4		# 4 = sbbu
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa	0x30(%r10),%xmm0	# 0 : sbbt
	pshufb	%xmm3,	%xmm0		# 0 = sbbt
	pxor	%xmm4,	%xmm0		# 0 = ch

	pshufb	%xmm5,	%xmm0		# MC ch
	movdqa	0x40(%r10),%xmm4	# 4 : sbeu
	pshufb	%xmm2,	%xmm4		# 4 = sbeu
	pxor	%xmm0,	%xmm4		# 4 = ch
	movdqa	0x50(%r10),%xmm0	# 0 : sbet
	pshufb	%xmm3,	%xmm0		# 0 = sbet
	pxor	%xmm4,	%xmm0		# 0 = ch

	palignr	\$12,	%xmm5,	%xmm5
	movdqa	%xmm9,	%xmm1		# 1 : i
	pandn	%xmm0,	%xmm1		# 1 = i<<4
	psrld	\$4,	%xmm1		# 1 = i
	pand	%xmm9,	%xmm0		# 0 = k
	movdqa	%xmm11,	%xmm2		# 2 : a/k
	pshufb	%xmm0,	%xmm2		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb	%xmm1,	%xmm3		# 3 = 1/i
	pxor	%xmm2,	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0,	%xmm4		# 4 = 1/j
	pxor	%xmm2,	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0,	%xmm2		# 2 = io
	movdqa	%xmm10,	%xmm3		# 3 : 1/jak
	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
	pxor	%xmm1,	%xmm3		# 3 = jo
	# middle of last round
	movdqa	0x60(%r10), %xmm4	# 3 : sbou
	pshufb	%xmm2,	%xmm4		# 4 = sbou
	pxor	%xmm0,	%xmm4		# 4 = sb1u + k
	movdqa	0x70(%r10), %xmm0	# 0 : sbot
	movdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4,	%xmm0		# 0 = A
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
########################################################
##                  AES key schedule                  ##
########################################################
.type	_vpaes_schedule_core,\@abi-omnipotent
_vpaes_schedule_core:
	# rcx = direction.  0=encrypt, 1=decrypt

	call	_vpaes_preheat		# load the tables
	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
	movdqu	(%rdi),	%xmm0		# load key (unaligned)

	lea	.Lk_ipt(%rip), %r11
	call	_vpaes_schedule_transform
	lea	.Lk_sr(%rip),%r10
	jnz	.Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform

.Lschedule_am_decrypting:
	# decrypting, output zeroth round key after shiftrows
	movdqa	(%r8,%r10),%xmm1
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
	call	_vpaes_schedule_round
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# write output
	jmp	.Loop_schedule_128
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
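##
##  (Accounting sketch: AES-192 runs 12 rounds, i.e. needs 13 round
##  keys.  The zeroth key is written right after the input transform
##  above; each full pass of .Loop_schedule_192 then writes three round
##  keys, and the last pass writes two before branching to
##  .Lschedule_mangle_last for the final one: 1 + 3*3 + 2 + 1 = 13.)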
	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	call	_vpaes_schedule_transform	# input transform
	movdqa	%xmm0,	%xmm6		# save short part
	pxor	%xmm4,	%xmm4		# clear 4
	movhlps	%xmm4,	%xmm6		# clobber low side with zeros

	call	_vpaes_schedule_round
	palignr	\$8,%xmm6,%xmm0
	call	_vpaes_schedule_mangle	# save key n
	call	_vpaes_schedule_192_smear
	call	_vpaes_schedule_mangle	# save key n+1
	call	_vpaes_schedule_round
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle	# save key n+2
	call	_vpaes_schedule_192_smear
	jmp	.Loop_schedule_192
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
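##
##  (Accounting sketch: AES-256 runs 14 rounds, i.e. needs 15 round
##  keys.  The zeroth key is written after the input transform; each
##  full pass of .Loop_schedule_256 then writes a low-side and a
##  high-side key, and the last pass writes the low-side key before
##  branching to .Lschedule_mangle_last: 1 + 6*2 + 1 + 1 = 15.)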
	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	call	_vpaes_schedule_transform	# input transform

	call	_vpaes_schedule_mangle	# output low result
	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6

	call	_vpaes_schedule_round
	jz	.Lschedule_mangle_last
	call	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	pshufd	\$0xFF,	%xmm0,	%xmm0
	call	_vpaes_schedule_low_round

	jmp	.Loop_schedule_256
##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.Lschedule_mangle_last:
	# schedule last round key from xmm0
	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	jnz	.Lschedule_mangle_last_dec

	movdqa	(%r8,%r10),%xmm1
	pshufb	%xmm1,	%xmm0		# output permute
	lea	.Lk_opt(%rip),	%r11	# prepare to output transform

.Lschedule_mangle_last_dec:
	pxor	.Lk_s63(%rip),	%xmm0
	call	_vpaes_schedule_transform # output transform
	movdqu	%xmm0,	(%rdx)		# save last key
.size	_vpaes_schedule_core,.-_vpaes_schedule_core
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b  a  x  y
##    %xmm6: low side,  d  c  0  0
##
##  Outputs:
##    %xmm6: b+c+d  b+c  0  0
##    %xmm0: b+c+d  b+c  b  a
.type	_vpaes_schedule_192_smear,\@abi-omnipotent
_vpaes_schedule_192_smear:
	pshufd	\$0x80,	%xmm6,	%xmm0	# d c 0 0 -> c 0 0 0
	pxor	%xmm0,	%xmm6		# -> c+d c 0 0
	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  the next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
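##
##  (In other words, this is the standard AES key-expansion step --
##  RotWord/SubWord on one word, xor with rcon, then the running xor
##  chain across the remaining words -- only carried out on the
##  basis-transformed state so the S-box can be done with pshufb.)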
.type	_vpaes_schedule_round,\@abi-omnipotent
_vpaes_schedule_round:
	# extract rcon from xmm8
	palignr	\$15,	%xmm8,	%xmm1
	palignr	\$15,	%xmm8,	%xmm8

	pshufd	\$0xFF,	%xmm0,	%xmm0
	palignr	\$1,	%xmm0,	%xmm0
	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	pxor	.Lk_s63(%rip),	%xmm7
	psrld	\$4,	%xmm1		# 1 = i
	pand	%xmm9,	%xmm0		# 0 = k
	movdqa	%xmm11,	%xmm2		# 2 : a/k
	pshufb	%xmm0,	%xmm2		# 2 = a/k
	pxor	%xmm1,	%xmm0		# 0 = j
	movdqa	%xmm10,	%xmm3		# 3 : 1/i
	pshufb	%xmm1,	%xmm3		# 3 = 1/i
	pxor	%xmm2,	%xmm3		# 3 = iak = 1/i + a/k
	movdqa	%xmm10,	%xmm4		# 4 : 1/j
	pshufb	%xmm0,	%xmm4		# 4 = 1/j
	pxor	%xmm2,	%xmm4		# 4 = jak = 1/j + a/k
	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
	pxor	%xmm0,	%xmm2		# 2 = io
	movdqa	%xmm10,	%xmm3		# 3 : 1/jak
	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
	pxor	%xmm1,	%xmm3		# 3 = jo
	movdqa	%xmm13,	%xmm4		# 4 : sbou
	pshufb	%xmm2,	%xmm4		# 4 = sbou
	movdqa	%xmm12,	%xmm0		# 0 : sbot
	pshufb	%xmm3,	%xmm0		# 0 = sb1t
	pxor	%xmm4,	%xmm0		# 0 = sbox output
	# add in smeared stuff
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Clobbers %xmm1, %xmm2
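##
##  (A minimal sketch of the technique, not an extra requirement: the
##  two 16-byte tables at (%r11) and 16(%r11) are indexed by the low
##  and high nibble of each byte via pshufb, and the two lookups are
##  xored together, i.e. out = hi_tab[x>>4] ^ lo_tab[x&0x0F] per byte.)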
.type	_vpaes_schedule_transform,\@abi-omnipotent
_vpaes_schedule_transform:
	movdqa	(%r11),	%xmm2		# lo
	movdqa	16(%r11), %xmm0		# hi
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    apply shiftrows transform
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Clobbers xmm1-xmm5
.type	_vpaes_schedule_mangle,\@abi-omnipotent
_vpaes_schedule_mangle:
	movdqa	%xmm0,	%xmm4	# save xmm0 for later
	movdqa	.Lk_mc_forward(%rip),%xmm5
	jnz	.Lschedule_mangle_dec

	pxor	.Lk_s63(%rip),%xmm4
	jmp	.Lschedule_mangle_both

.Lschedule_mangle_dec:
	# inverse mix columns
	lea	.Lk_dksd(%rip),%r11
	psrld	\$4,	%xmm1	# 1 = hi
	pand	%xmm9,	%xmm4	# 4 = lo

	movdqa	0x00(%r11), %xmm2
	movdqa	0x10(%r11), %xmm3

	movdqa	0x20(%r11), %xmm2
	movdqa	0x30(%r11), %xmm3

	movdqa	0x40(%r11), %xmm2
	movdqa	0x50(%r11), %xmm3

	movdqa	0x60(%r11), %xmm2
	movdqa	0x70(%r11), %xmm3

.Lschedule_mangle_both:
	movdqa	(%r8,%r10),%xmm1
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
# Interface to OpenSSL
.globl	${PREFIX}_set_encrypt_key
.type	${PREFIX}_set_encrypt_key,\@function,3
${PREFIX}_set_encrypt_key:
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	call	_vpaes_schedule_core

$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15

.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.globl	${PREFIX}_set_decrypt_key
.type	${PREFIX}_set_decrypt_key,\@function,3
${PREFIX}_set_decrypt_key:
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lea	16(%rdx,%rax),%rdx
	xor	\$32,%r8d	# nbits==192?0:32
	call	_vpaes_schedule_core

$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15

.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@function,3
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	call	_vpaes_encrypt_core

$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15

.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@function,3
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	call	_vpaes_decrypt_core

$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15

.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                       size_t length, const AES_KEY *key,
#                       unsigned char *ivp, const int enc);
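# As noted in the header comment, partial vectors are not handled:
# length is assumed to be a multiple of 16 bytes, which is fine for
# the EVP layer this is meant to be called from.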
.globl	${PREFIX}_cbc_encrypt
.type	${PREFIX}_cbc_encrypt,\@function,6
${PREFIX}_cbc_encrypt:
($len,$key)=($key,$len);
$code.=<<___ if ($win64);
	movaps	%xmm6,0x10(%rsp)
	movaps	%xmm7,0x20(%rsp)
	movaps	%xmm8,0x30(%rsp)
	movaps	%xmm9,0x40(%rsp)
	movaps	%xmm10,0x50(%rsp)
	movaps	%xmm11,0x60(%rsp)
	movaps	%xmm12,0x70(%rsp)
	movaps	%xmm13,0x80(%rsp)
	movaps	%xmm14,0x90(%rsp)
	movaps	%xmm15,0xa0(%rsp)

	movdqu	($ivp),%xmm6		# load IV

	call	_vpaes_encrypt_core
	movdqu	%xmm0,($out,$inp)

	call	_vpaes_decrypt_core
	movdqu	%xmm0,($out,$inp)

	movdqu	%xmm6,($ivp)		# save IV

$code.=<<___ if ($win64);
	movaps	0x10(%rsp),%xmm6
	movaps	0x20(%rsp),%xmm7
	movaps	0x30(%rsp),%xmm8
	movaps	0x40(%rsp),%xmm9
	movaps	0x50(%rsp),%xmm10
	movaps	0x60(%rsp),%xmm11
	movaps	0x70(%rsp),%xmm12
	movaps	0x80(%rsp),%xmm13
	movaps	0x90(%rsp),%xmm14
	movaps	0xa0(%rsp),%xmm15

.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
.type	_vpaes_preheat,\@abi-omnipotent
	lea	.Lk_s0F(%rip), %r10
	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
.size	_vpaes_preheat,.-_vpaes_preheat
########################################################
##                     Constants                      ##
########################################################
.type	_vpaes_consts,\@object
_vpaes_consts:
.Lk_inv:	# inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809

.Lk_s0F:	# s0F
	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
.Lk_ipt:	# input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81

.Lk_sb1:	# sb1u, sb1t
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.Lk_sb2:	# sb2u, sb2t
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.Lk_sbo:	# sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_mc_forward:	# mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605

.Lk_mc_backward:# mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F

.Lk_sr:		# sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

.Lk_rcon:	# rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_s63:	# s63: all equal to 0x63 transformed
	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B

.Lk_opt:	# output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
##  Key schedule constants
##
.Lk_dksd:	# decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	# decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	# decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
##  Round function constants
##
.Lk_dipt:	# decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194

.Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
.Lk_dsbo:	# decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
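# (What the handler below does, in short: if the faulting RIP lies
# between the prologue and epilogue labels supplied in HandlerData, the
# %xmm6-%xmm15 save area on the stack is copied back into the CONTEXT
# record and the stack pointer is adjusted before the unwind continues.)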
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label

	lea	16(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xb8(%rax),%rax		# adjust stack pointer

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
	.rva	.LSEH_info_${PREFIX}_set_encrypt_key

	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
	.rva	.LSEH_info_${PREFIX}_set_decrypt_key

	.rva	.LSEH_begin_${PREFIX}_encrypt
	.rva	.LSEH_end_${PREFIX}_encrypt
	.rva	.LSEH_info_${PREFIX}_encrypt

	.rva	.LSEH_begin_${PREFIX}_decrypt
	.rva	.LSEH_end_${PREFIX}_decrypt
	.rva	.LSEH_info_${PREFIX}_decrypt

	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
	.rva	.LSEH_info_${PREFIX}_cbc_encrypt
.LSEH_info_${PREFIX}_set_encrypt_key:
	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_set_decrypt_key:
	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
.LSEH_info_${PREFIX}_encrypt:
	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_decrypt:
	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
.LSEH_info_${PREFIX}_cbc_encrypt:
	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
$code =~ s/\`([^\`]*)\`/eval($1)/gem;