3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
24 # 16-byte 64-byte 256-byte 1-KB 8-KB
25 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
35 # interleaves at most 6 aes[enc|dec] instructions, because there are
36 # not enough registers for 8x interleave [which should be optimal for
37 # Sandy Bridge]. Actually, performance results for 6x interleave
38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
46 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49 $inline=1; # inline _aesni_[en|de]crypt
51 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
52 push(@INC,"${dir}","${dir}../../perlasm");
55 &asm_init
($ARGV[0],$0);
# Pick the instruction alias used for every round-key load (&$movekey ...).
# Both arms currently select movups, so the choice does not depend on
# $PREFIX; the conditional is kept as the place to diverge if needed.
57 if ($PREFIX eq "aesni") { $movekey=*movups
; }
58 else { $movekey=*movups
; }
65 $rounds_="ebx"; # backup copy for $rounds
66 $key_="ebp"; # backup copy for $key
73 $inout3="xmm5"; $in1="xmm5";
74 $inout4="xmm6"; $in0="xmm6";
75 $inout5="xmm7"; $ivec="xmm7";
79 { my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte
(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
84 { my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte
(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
# Thin per-instruction wrappers.  Each one prepends the opcode byte that
# selects the AES-NI instruction and forwards the caller's operand list
# (@_, i.e. destination and source registers) unchanged to aescommon.

# aesimc: opcode byte 0xdb.
88 sub aesimc
{ aescommon
(0xdb,@_); }
# aesenc: opcode byte 0xdc.
89 sub aesenc
{ aescommon
(0xdc,@_); }
# aesenclast: opcode byte 0xdd.
90 sub aesenclast
{ aescommon
(0xdd,@_); }
# aesdec: opcode byte 0xde.
91 sub aesdec
{ aescommon
(0xde,@_); }
# aesdeclast: opcode byte 0xdf.
92 sub aesdeclast
{ aescommon
(0xdf,@_); }
94 # Inline version of internal aesni_[en|de]crypt1
# Emit an inline single-block AES sequence.
#   $p     - "enc" or "dec"; spliced into the aes${p} / aes${p}last calls
#            and into the loop label name.
#   $inout - register holding the block; defaults to $inout0 when omitted.
#   $ivec  - optional register; when defined it is xored with round key 0
#            and the result is xored into $inout, otherwise $inout is
#            xored with round key 0 directly.
# NOTE(review): $sn (label suffix) and the instruction that sets the flags
# tested by jnz are not visible in this view - confirm against the full file.
96 sub aesni_inline_generate1
97 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
# Load round keys 0 and 1 from offsets 0 and 16 of $key.
100 &$movekey ($rndkey0,&QWP
(0,$key));
101 &$movekey ($rndkey1,&QWP
(16,$key));
102 &xorps
($ivec,$rndkey0) if (defined($ivec));
# Advance $key past the two round keys just loaded.
103 &lea
($key,&DWP
(32,$key));
104 &xorps
($inout,$ivec) if (defined($ivec));
105 &xorps
($inout,$rndkey0) if (!defined($ivec));
# Round loop: apply one round with $rndkey1, fetch the next round key,
# step $key by 16 and branch back while the zero flag is clear.
106 &set_label
("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
109 &$movekey ($rndkey1,&QWP
(0,$key));
110 &lea
($key,&DWP
(16,$key));
111 &jnz
(&label
("${p}1_loop_$sn"));
# Final round uses the "last" form of the instruction.
112 eval"&aes${p}last ($inout,$rndkey1)";
115 sub aesni_generate1
# fully unrolled loop
116 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
118 &function_begin_B
("_aesni_${p}rypt1");
119 &movups
($rndkey0,&QWP
(0,$key));
120 &$movekey ($rndkey1,&QWP
(0x10,$key));
121 &xorps
($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP
(0x20,$key));
123 &lea
($key,&DWP
(0x30,$key));
125 &jb
(&label
("${p}128"));
126 &lea
($key,&DWP
(0x20,$key));
127 &je
(&label
("${p}192"));
128 &lea
($key,&DWP
(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP
(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP
(-0x30,$key));
133 &set_label
("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP
(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP
(-0x10,$key));
138 &set_label
("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP
(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP
(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP
(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP
(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP
(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP
(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP
(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP
(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
158 &function_end_B
("_aesni_${p}rypt1");
161 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Generate ${PREFIX}_encrypt: single-block encryption entry point.
# The out-of-line helper _aesni_encrypt1 is only generated when $inline
# is false.
162 &aesni_generate1
("enc") if (!$inline);
163 &function_begin_B
("${PREFIX}_encrypt");
# Argument 0 (inp, per the prototype comment) -> eax.
164 &mov
("eax",&wparam
(0));
# Argument 2 (key) -> $key.
165 &mov
($key,&wparam
(2));
# Load the 16-byte input block (unaligned load) into $inout0.
166 &movups
($inout0,&QWP
(0,"eax"));
# Round count is read from offset 240 of the key structure.
167 &mov
($rounds,&DWP
(240,$key));
# Argument 1 (out) -> eax, reusing the register now that inp is consumed.
168 &mov
("eax",&wparam
(1));
# NOTE(review): the next two brace blocks look like the arms of a
# conditional on $inline (inline body vs. call to the helper); the
# if/else lines themselves are not present in this view - confirm.
170 { &aesni_inline_generate1
("enc"); }
172 { &call
("_aesni_encrypt1"); }
# Store the processed block to the output pointer.
173 &movups
(&QWP
(0,"eax"),$inout0);
175 &function_end_B
("${PREFIX}_encrypt");
177 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Generate ${PREFIX}_decrypt: single-block decryption entry point.
# The out-of-line helper _aesni_decrypt1 is only generated when $inline
# is false.
178 &aesni_generate1
("dec") if(!$inline);
179 &function_begin_B
("${PREFIX}_decrypt");
# Argument 0 (inp, per the prototype comment) -> eax.
180 &mov
("eax",&wparam
(0));
# Argument 2 (key) -> $key.
181 &mov
($key,&wparam
(2));
# Load the 16-byte input block (unaligned load) into $inout0.
182 &movups
($inout0,&QWP
(0,"eax"));
# Round count is read from offset 240 of the key structure.
183 &mov
($rounds,&DWP
(240,$key));
# Argument 1 (out) -> eax, reusing the register now that inp is consumed.
184 &mov
("eax",&wparam
(1));
# NOTE(review): the next two brace blocks look like the arms of a
# conditional on $inline (inline body vs. call to the helper); the
# if/else lines themselves are not present in this view - confirm.
186 { &aesni_inline_generate1
("dec"); }
188 { &call
("_aesni_decrypt1"); }
# Store the processed block to the output pointer.
189 &movups
(&QWP
(0,"eax"),$inout0);
191 &function_end_B
("${PREFIX}_decrypt");
193 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194 # factor. Why were 3x subroutines originally used in loops? Even though
195 # aes[enc|dec] latency was originally 6, it could be scheduled only
196 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
197 # utilization, i.e. when subroutine's throughput is virtually same as
198 # of non-interleaved subroutine [for number of input blocks up to 3].
199 # This is why it makes no sense to implement 2x subroutine.
200 # aes[enc|dec] latency in next processor generation is 8, but the
201 # instructions can be scheduled every cycle. Optimal interleave for
202 # new processor is therefore 8x, but it's unfeasible to accommodate it
203 # in XMM registers addressable in 32-bit mode and therefore 6x is
209 &function_begin_B
("_aesni_${p}rypt3");
210 &$movekey ($rndkey0,&QWP
(0,$key));
212 &$movekey ($rndkey1,&QWP
(16,$key));
213 &lea
($key,&DWP
(32,$key));
214 &xorps
($inout0,$rndkey0);
215 &pxor
($inout1,$rndkey0);
216 &pxor
($inout2,$rndkey0);
217 &$movekey ($rndkey0,&QWP
(0,$key));
219 &set_label
("${p}3_loop");
220 eval"&aes${p} ($inout0,$rndkey1)";
221 eval"&aes${p} ($inout1,$rndkey1)";
223 eval"&aes${p} ($inout2,$rndkey1)";
224 &$movekey ($rndkey1,&QWP
(16,$key));
225 eval"&aes${p} ($inout0,$rndkey0)";
226 eval"&aes${p} ($inout1,$rndkey0)";
227 &lea
($key,&DWP
(32,$key));
228 eval"&aes${p} ($inout2,$rndkey0)";
229 &$movekey ($rndkey0,&QWP
(0,$key));
230 &jnz
(&label
("${p}3_loop"));
231 eval"&aes${p} ($inout0,$rndkey1)";
232 eval"&aes${p} ($inout1,$rndkey1)";
233 eval"&aes${p} ($inout2,$rndkey1)";
234 eval"&aes${p}last ($inout0,$rndkey0)";
235 eval"&aes${p}last ($inout1,$rndkey0)";
236 eval"&aes${p}last ($inout2,$rndkey0)";
238 &function_end_B
("_aesni_${p}rypt3");
241 # 4x interleave is implemented to improve small block performance,
242 # most notably [and naturally] 4 block by ~30%. One can argue that one
243 # should have implemented 5x as well, but improvement would be <20%,
244 # so it's not worth it...
248 &function_begin_B
("_aesni_${p}rypt4");
249 &$movekey ($rndkey0,&QWP
(0,$key));
250 &$movekey ($rndkey1,&QWP
(16,$key));
252 &lea
($key,&DWP
(32,$key));
253 &xorps
($inout0,$rndkey0);
254 &pxor
($inout1,$rndkey0);
255 &pxor
($inout2,$rndkey0);
256 &pxor
($inout3,$rndkey0);
257 &$movekey ($rndkey0,&QWP
(0,$key));
259 &set_label
("${p}4_loop");
260 eval"&aes${p} ($inout0,$rndkey1)";
261 eval"&aes${p} ($inout1,$rndkey1)";
263 eval"&aes${p} ($inout2,$rndkey1)";
264 eval"&aes${p} ($inout3,$rndkey1)";
265 &$movekey ($rndkey1,&QWP
(16,$key));
266 eval"&aes${p} ($inout0,$rndkey0)";
267 eval"&aes${p} ($inout1,$rndkey0)";
268 &lea
($key,&DWP
(32,$key));
269 eval"&aes${p} ($inout2,$rndkey0)";
270 eval"&aes${p} ($inout3,$rndkey0)";
271 &$movekey ($rndkey0,&QWP
(0,$key));
272 &jnz
(&label
("${p}4_loop"));
274 eval"&aes${p} ($inout0,$rndkey1)";
275 eval"&aes${p} ($inout1,$rndkey1)";
276 eval"&aes${p} ($inout2,$rndkey1)";
277 eval"&aes${p} ($inout3,$rndkey1)";
278 eval"&aes${p}last ($inout0,$rndkey0)";
279 eval"&aes${p}last ($inout1,$rndkey0)";
280 eval"&aes${p}last ($inout2,$rndkey0)";
281 eval"&aes${p}last ($inout3,$rndkey0)";
283 &function_end_B
("_aesni_${p}rypt4");
289 &function_begin_B
("_aesni_${p}rypt6");
290 &static_label
("_aesni_${p}rypt6_enter");
291 &$movekey ($rndkey0,&QWP
(0,$key));
293 &$movekey ($rndkey1,&QWP
(16,$key));
294 &lea
($key,&DWP
(32,$key));
295 &xorps
($inout0,$rndkey0);
296 &pxor
($inout1,$rndkey0); # pxor does better here
297 eval"&aes${p} ($inout0,$rndkey1)";
298 &pxor
($inout2,$rndkey0);
299 eval"&aes${p} ($inout1,$rndkey1)";
300 &pxor
($inout3,$rndkey0);
302 eval"&aes${p} ($inout2,$rndkey1)";
303 &pxor
($inout4,$rndkey0);
304 eval"&aes${p} ($inout3,$rndkey1)";
305 &pxor
($inout5,$rndkey0);
306 eval"&aes${p} ($inout4,$rndkey1)";
307 &$movekey ($rndkey0,&QWP
(0,$key));
308 eval"&aes${p} ($inout5,$rndkey1)";
309 &jmp
(&label
("_aesni_${p}rypt6_enter"));
311 &set_label
("${p}6_loop",16);
312 eval"&aes${p} ($inout0,$rndkey1)";
313 eval"&aes${p} ($inout1,$rndkey1)";
315 eval"&aes${p} ($inout2,$rndkey1)";
316 eval"&aes${p} ($inout3,$rndkey1)";
317 eval"&aes${p} ($inout4,$rndkey1)";
318 eval"&aes${p} ($inout5,$rndkey1)";
319 &set_label
("_aesni_${p}rypt6_enter",16);
320 &$movekey ($rndkey1,&QWP
(16,$key));
321 eval"&aes${p} ($inout0,$rndkey0)";
322 eval"&aes${p} ($inout1,$rndkey0)";
323 &lea
($key,&DWP
(32,$key));
324 eval"&aes${p} ($inout2,$rndkey0)";
325 eval"&aes${p} ($inout3,$rndkey0)";
326 eval"&aes${p} ($inout4,$rndkey0)";
327 eval"&aes${p} ($inout5,$rndkey0)";
328 &$movekey ($rndkey0,&QWP
(0,$key));
329 &jnz
(&label
("${p}6_loop"));
331 eval"&aes${p} ($inout0,$rndkey1)";
332 eval"&aes${p} ($inout1,$rndkey1)";
333 eval"&aes${p} ($inout2,$rndkey1)";
334 eval"&aes${p} ($inout3,$rndkey1)";
335 eval"&aes${p} ($inout4,$rndkey1)";
336 eval"&aes${p} ($inout5,$rndkey1)";
337 eval"&aes${p}last ($inout0,$rndkey0)";
338 eval"&aes${p}last ($inout1,$rndkey0)";
339 eval"&aes${p}last ($inout2,$rndkey0)";
340 eval"&aes${p}last ($inout3,$rndkey0)";
341 eval"&aes${p}last ($inout4,$rndkey0)";
342 eval"&aes${p}last ($inout5,$rndkey0)";
344 &function_end_B
("_aesni_${p}rypt6");
# Instantiate the interleaved private helpers (3x, 4x and 6x).
# The "dec" variants are always generated; the "enc" variants are
# generated only when $PREFIX is "aesni".
346 &aesni_generate3
("enc") if ($PREFIX eq "aesni");
347 &aesni_generate3
("dec");
348 &aesni_generate4
("enc") if ($PREFIX eq "aesni");
349 &aesni_generate4
("dec");
350 &aesni_generate6
("enc") if ($PREFIX eq "aesni");
351 &aesni_generate6
("dec");
353 if ($PREFIX eq "aesni") {
354 ######################################################################
355 # void aesni_ecb_encrypt (const void *in, void *out,
356 # size_t length, const AES_KEY *key,
358 &function_begin
("aesni_ecb_encrypt");
359 &mov
($inp,&wparam
(0));
360 &mov
($out,&wparam
(1));
361 &mov
($len,&wparam
(2));
362 &mov
($key,&wparam
(3));
363 &mov
($rounds_,&wparam
(4));
365 &jz
(&label
("ecb_ret"));
366 &mov
($rounds,&DWP
(240,$key));
367 &test
($rounds_,$rounds_);
368 &jz
(&label
("ecb_decrypt"));
370 &mov
($key_,$key); # backup $key
371 &mov
($rounds_,$rounds); # backup $rounds
373 &jb
(&label
("ecb_enc_tail"));
375 &movdqu
($inout0,&QWP
(0,$inp));
376 &movdqu
($inout1,&QWP
(0x10,$inp));
377 &movdqu
($inout2,&QWP
(0x20,$inp));
378 &movdqu
($inout3,&QWP
(0x30,$inp));
379 &movdqu
($inout4,&QWP
(0x40,$inp));
380 &movdqu
($inout5,&QWP
(0x50,$inp));
381 &lea
($inp,&DWP
(0x60,$inp));
383 &jmp
(&label
("ecb_enc_loop6_enter"));
385 &set_label
("ecb_enc_loop6",16);
386 &movups
(&QWP
(0,$out),$inout0);
387 &movdqu
($inout0,&QWP
(0,$inp));
388 &movups
(&QWP
(0x10,$out),$inout1);
389 &movdqu
($inout1,&QWP
(0x10,$inp));
390 &movups
(&QWP
(0x20,$out),$inout2);
391 &movdqu
($inout2,&QWP
(0x20,$inp));
392 &movups
(&QWP
(0x30,$out),$inout3);
393 &movdqu
($inout3,&QWP
(0x30,$inp));
394 &movups
(&QWP
(0x40,$out),$inout4);
395 &movdqu
($inout4,&QWP
(0x40,$inp));
396 &movups
(&QWP
(0x50,$out),$inout5);
397 &lea
($out,&DWP
(0x60,$out));
398 &movdqu
($inout5,&QWP
(0x50,$inp));
399 &lea
($inp,&DWP
(0x60,$inp));
400 &set_label
("ecb_enc_loop6_enter");
402 &call
("_aesni_encrypt6");
404 &mov
($key,$key_); # restore $key
405 &mov
($rounds,$rounds_); # restore $rounds
407 &jnc
(&label
("ecb_enc_loop6"));
409 &movups
(&QWP
(0,$out),$inout0);
410 &movups
(&QWP
(0x10,$out),$inout1);
411 &movups
(&QWP
(0x20,$out),$inout2);
412 &movups
(&QWP
(0x30,$out),$inout3);
413 &movups
(&QWP
(0x40,$out),$inout4);
414 &movups
(&QWP
(0x50,$out),$inout5);
415 &lea
($out,&DWP
(0x60,$out));
417 &jz
(&label
("ecb_ret"));
419 &set_label
("ecb_enc_tail");
420 &movups
($inout0,&QWP
(0,$inp));
422 &jb
(&label
("ecb_enc_one"));
423 &movups
($inout1,&QWP
(0x10,$inp));
424 &je
(&label
("ecb_enc_two"));
425 &movups
($inout2,&QWP
(0x20,$inp));
427 &jb
(&label
("ecb_enc_three"));
428 &movups
($inout3,&QWP
(0x30,$inp));
429 &je
(&label
("ecb_enc_four"));
430 &movups
($inout4,&QWP
(0x40,$inp));
# ECB encrypt tail, five-block path: the sixth lane is cleared so the
# 6x routine can be reused, and only the first five results are stored.
431 &xorps
($inout5,$inout5);
432 &call
("_aesni_encrypt6");
# Write back lanes 0..4; the $inout5 lane is not stored.
433 &movups
(&QWP
(0,$out),$inout0);
434 &movups
(&QWP
(0x10,$out),$inout1);
435 &movups
(&QWP
(0x20,$out),$inout2);
436 &movups
(&QWP
(0x30,$out),$inout3);
437 &movups
(&QWP
(0x40,$out),$inout4);
# Use the same &opcode call form as every other instruction in this file
# (the matching decrypt path already uses &jmp for its jump to ecb_ret).
438 &jmp
(&label
("ecb_ret"));
440 &set_label
("ecb_enc_one",16);
442 { &aesni_inline_generate1
("enc"); }
444 { &call
("_aesni_encrypt1"); }
445 &movups
(&QWP
(0,$out),$inout0);
446 &jmp
(&label
("ecb_ret"));
448 &set_label
("ecb_enc_two",16);
449 &xorps
($inout2,$inout2);
450 &call
("_aesni_encrypt3");
451 &movups
(&QWP
(0,$out),$inout0);
452 &movups
(&QWP
(0x10,$out),$inout1);
453 &jmp
(&label
("ecb_ret"));
455 &set_label
("ecb_enc_three",16);
456 &call
("_aesni_encrypt3");
457 &movups
(&QWP
(0,$out),$inout0);
458 &movups
(&QWP
(0x10,$out),$inout1);
459 &movups
(&QWP
(0x20,$out),$inout2);
460 &jmp
(&label
("ecb_ret"));
462 &set_label
("ecb_enc_four",16);
463 &call
("_aesni_encrypt4");
464 &movups
(&QWP
(0,$out),$inout0);
465 &movups
(&QWP
(0x10,$out),$inout1);
466 &movups
(&QWP
(0x20,$out),$inout2);
467 &movups
(&QWP
(0x30,$out),$inout3);
468 &jmp
(&label
("ecb_ret"));
469 ######################################################################
470 &set_label
("ecb_decrypt",16);
471 &mov
($key_,$key); # backup $key
472 &mov
($rounds_,$rounds); # backup $rounds
474 &jb
(&label
("ecb_dec_tail"));
476 &movdqu
($inout0,&QWP
(0,$inp));
477 &movdqu
($inout1,&QWP
(0x10,$inp));
478 &movdqu
($inout2,&QWP
(0x20,$inp));
479 &movdqu
($inout3,&QWP
(0x30,$inp));
480 &movdqu
($inout4,&QWP
(0x40,$inp));
481 &movdqu
($inout5,&QWP
(0x50,$inp));
482 &lea
($inp,&DWP
(0x60,$inp));
484 &jmp
(&label
("ecb_dec_loop6_enter"));
486 &set_label
("ecb_dec_loop6",16);
487 &movups
(&QWP
(0,$out),$inout0);
488 &movdqu
($inout0,&QWP
(0,$inp));
489 &movups
(&QWP
(0x10,$out),$inout1);
490 &movdqu
($inout1,&QWP
(0x10,$inp));
491 &movups
(&QWP
(0x20,$out),$inout2);
492 &movdqu
($inout2,&QWP
(0x20,$inp));
493 &movups
(&QWP
(0x30,$out),$inout3);
494 &movdqu
($inout3,&QWP
(0x30,$inp));
495 &movups
(&QWP
(0x40,$out),$inout4);
496 &movdqu
($inout4,&QWP
(0x40,$inp));
497 &movups
(&QWP
(0x50,$out),$inout5);
498 &lea
($out,&DWP
(0x60,$out));
499 &movdqu
($inout5,&QWP
(0x50,$inp));
500 &lea
($inp,&DWP
(0x60,$inp));
501 &set_label
("ecb_dec_loop6_enter");
503 &call
("_aesni_decrypt6");
505 &mov
($key,$key_); # restore $key
506 &mov
($rounds,$rounds_); # restore $rounds
508 &jnc
(&label
("ecb_dec_loop6"));
510 &movups
(&QWP
(0,$out),$inout0);
511 &movups
(&QWP
(0x10,$out),$inout1);
512 &movups
(&QWP
(0x20,$out),$inout2);
513 &movups
(&QWP
(0x30,$out),$inout3);
514 &movups
(&QWP
(0x40,$out),$inout4);
515 &movups
(&QWP
(0x50,$out),$inout5);
516 &lea
($out,&DWP
(0x60,$out));
518 &jz
(&label
("ecb_ret"));
520 &set_label
("ecb_dec_tail");
521 &movups
($inout0,&QWP
(0,$inp));
523 &jb
(&label
("ecb_dec_one"));
524 &movups
($inout1,&QWP
(0x10,$inp));
525 &je
(&label
("ecb_dec_two"));
526 &movups
($inout2,&QWP
(0x20,$inp));
528 &jb
(&label
("ecb_dec_three"));
529 &movups
($inout3,&QWP
(0x30,$inp));
530 &je
(&label
("ecb_dec_four"));
531 &movups
($inout4,&QWP
(0x40,$inp));
532 &xorps
($inout5,$inout5);
533 &call
("_aesni_decrypt6");
534 &movups
(&QWP
(0,$out),$inout0);
535 &movups
(&QWP
(0x10,$out),$inout1);
536 &movups
(&QWP
(0x20,$out),$inout2);
537 &movups
(&QWP
(0x30,$out),$inout3);
538 &movups
(&QWP
(0x40,$out),$inout4);
539 &jmp
(&label
("ecb_ret"));
541 &set_label
("ecb_dec_one",16);
543 { &aesni_inline_generate1
("dec"); }
545 { &call
("_aesni_decrypt1"); }
546 &movups
(&QWP
(0,$out),$inout0);
547 &jmp
(&label
("ecb_ret"));
549 &set_label
("ecb_dec_two",16);
550 &xorps
($inout2,$inout2);
551 &call
("_aesni_decrypt3");
552 &movups
(&QWP
(0,$out),$inout0);
553 &movups
(&QWP
(0x10,$out),$inout1);
554 &jmp
(&label
("ecb_ret"));
556 &set_label
("ecb_dec_three",16);
557 &call
("_aesni_decrypt3");
558 &movups
(&QWP
(0,$out),$inout0);
559 &movups
(&QWP
(0x10,$out),$inout1);
560 &movups
(&QWP
(0x20,$out),$inout2);
561 &jmp
(&label
("ecb_ret"));
563 &set_label
("ecb_dec_four",16);
564 &call
("_aesni_decrypt4");
565 &movups
(&QWP
(0,$out),$inout0);
566 &movups
(&QWP
(0x10,$out),$inout1);
567 &movups
(&QWP
(0x20,$out),$inout2);
568 &movups
(&QWP
(0x30,$out),$inout3);
570 &set_label
("ecb_ret");
571 &function_end
("aesni_ecb_encrypt");
573 ######################################################################
574 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575 # size_t blocks, const AES_KEY *key,
576 # const char *ivec,char *cmac);
578 # Handles only complete blocks, operates on 64-bit counter and
579 # does not update *ivec! Nor does it finalize CMAC value
580 # (see engine/eng_aesni.c for details)
583 &function_begin
("aesni_ccm64_encrypt_blocks");
584 &mov
($inp,&wparam
(0));
585 &mov
($out,&wparam
(1));
586 &mov
($len,&wparam
(2));
587 &mov
($key,&wparam
(3));
588 &mov
($rounds_,&wparam
(4));
589 &mov
($rounds,&wparam
(5));
592 &and ("esp",-16); # align stack
593 &mov
(&DWP
(48,"esp"),$key_);
595 &movdqu
($ivec,&QWP
(0,$rounds_)); # load ivec
596 &movdqu
($cmac,&QWP
(0,$rounds)); # load cmac
597 &mov
($rounds,&DWP
(240,$key));
599 # compose byte-swap control mask for pshufb on stack
600 &mov
(&DWP
(0,"esp"),0x0c0d0e0f);
601 &mov
(&DWP
(4,"esp"),0x08090a0b);
602 &mov
(&DWP
(8,"esp"),0x04050607);
603 &mov
(&DWP
(12,"esp"),0x00010203);
605 # compose counter increment vector on stack
608 &mov
(&DWP
(16,"esp"),$rounds_);
609 &mov
(&DWP
(20,"esp"),$key_);
610 &mov
(&DWP
(24,"esp"),$key_);
611 &mov
(&DWP
(28,"esp"),$key_);
614 &lea
($key_,&DWP
(0,$key));
615 &movdqa
($inout3,&QWP
(0,"esp"));
616 &movdqa
($inout0,$ivec);
617 &mov
($rounds_,$rounds);
618 &pshufb
($ivec,$inout3);
620 &set_label
("ccm64_enc_outer");
621 &$movekey ($rndkey0,&QWP
(0,$key_));
622 &mov
($rounds,$rounds_);
623 &movups
($in0,&QWP
(0,$inp));
625 &xorps
($inout0,$rndkey0);
626 &$movekey ($rndkey1,&QWP
(16,$key_));
627 &xorps
($rndkey0,$in0);
628 &lea
($key,&DWP
(32,$key_));
629 &xorps
($cmac,$rndkey0); # cmac^=inp
630 &$movekey ($rndkey0,&QWP
(0,$key));
632 &set_label
("ccm64_enc2_loop");
633 &aesenc
($inout0,$rndkey1);
635 &aesenc
($cmac,$rndkey1);
636 &$movekey ($rndkey1,&QWP
(16,$key));
637 &aesenc
($inout0,$rndkey0);
638 &lea
($key,&DWP
(32,$key));
639 &aesenc
($cmac,$rndkey0);
640 &$movekey ($rndkey0,&QWP
(0,$key));
641 &jnz
(&label
("ccm64_enc2_loop"));
642 &aesenc
($inout0,$rndkey1);
643 &aesenc
($cmac,$rndkey1);
644 &paddq
($ivec,&QWP
(16,"esp"));
645 &aesenclast
($inout0,$rndkey0);
646 &aesenclast
($cmac,$rndkey0);
649 &lea
($inp,&DWP
(16,$inp));
650 &xorps
($in0,$inout0); # inp^=E(ivec)
651 &movdqa
($inout0,$ivec);
652 &movups
(&QWP
(0,$out),$in0); # save output
653 &lea
($out,&DWP
(16,$out));
654 &pshufb
($inout0,$inout3);
655 &jnz
(&label
("ccm64_enc_outer"));
657 &mov
("esp",&DWP
(48,"esp"));
658 &mov
($out,&wparam
(5));
659 &movups
(&QWP
(0,$out),$cmac);
660 &function_end
("aesni_ccm64_encrypt_blocks");
662 &function_begin
("aesni_ccm64_decrypt_blocks");
663 &mov
($inp,&wparam
(0));
664 &mov
($out,&wparam
(1));
665 &mov
($len,&wparam
(2));
666 &mov
($key,&wparam
(3));
667 &mov
($rounds_,&wparam
(4));
668 &mov
($rounds,&wparam
(5));
671 &and ("esp",-16); # align stack
672 &mov
(&DWP
(48,"esp"),$key_);
674 &movdqu
($ivec,&QWP
(0,$rounds_)); # load ivec
675 &movdqu
($cmac,&QWP
(0,$rounds)); # load cmac
676 &mov
($rounds,&DWP
(240,$key));
678 # compose byte-swap control mask for pshufb on stack
679 &mov
(&DWP
(0,"esp"),0x0c0d0e0f);
680 &mov
(&DWP
(4,"esp"),0x08090a0b);
681 &mov
(&DWP
(8,"esp"),0x04050607);
682 &mov
(&DWP
(12,"esp"),0x00010203);
684 # compose counter increment vector on stack
687 &mov
(&DWP
(16,"esp"),$rounds_);
688 &mov
(&DWP
(20,"esp"),$key_);
689 &mov
(&DWP
(24,"esp"),$key_);
690 &mov
(&DWP
(28,"esp"),$key_);
692 &movdqa
($inout3,&QWP
(0,"esp")); # bswap mask
693 &movdqa
($inout0,$ivec);
696 &mov
($rounds_,$rounds);
698 &pshufb
($ivec,$inout3);
700 { &aesni_inline_generate1
("enc"); }
702 { &call
("_aesni_encrypt1"); }
703 &movups
($in0,&QWP
(0,$inp)); # load inp
704 &paddq
($ivec,&QWP
(16,"esp"));
705 &lea
($inp,&QWP
(16,$inp));
706 &jmp
(&label
("ccm64_dec_outer"));
708 &set_label
("ccm64_dec_outer",16);
709 &xorps
($in0,$inout0); # inp ^= E(ivec)
710 &movdqa
($inout0,$ivec);
711 &mov
($rounds,$rounds_);
712 &movups
(&QWP
(0,$out),$in0); # save output
713 &lea
($out,&DWP
(16,$out));
714 &pshufb
($inout0,$inout3);
717 &jz
(&label
("ccm64_dec_break"));
719 &$movekey ($rndkey0,&QWP
(0,$key_));
721 &$movekey ($rndkey1,&QWP
(16,$key_));
722 &xorps
($in0,$rndkey0);
723 &lea
($key,&DWP
(32,$key_));
724 &xorps
($inout0,$rndkey0);
725 &xorps
($cmac,$in0); # cmac^=out
726 &$movekey ($rndkey0,&QWP
(0,$key));
728 &set_label
("ccm64_dec2_loop");
729 &aesenc
($inout0,$rndkey1);
731 &aesenc
($cmac,$rndkey1);
732 &$movekey ($rndkey1,&QWP
(16,$key));
733 &aesenc
($inout0,$rndkey0);
734 &lea
($key,&DWP
(32,$key));
735 &aesenc
($cmac,$rndkey0);
736 &$movekey ($rndkey0,&QWP
(0,$key));
737 &jnz
(&label
("ccm64_dec2_loop"));
738 &movups
($in0,&QWP
(0,$inp)); # load inp
739 &paddq
($ivec,&QWP
(16,"esp"));
740 &aesenc
($inout0,$rndkey1);
741 &aesenc
($cmac,$rndkey1);
742 &lea
($inp,&QWP
(16,$inp));
743 &aesenclast
($inout0,$rndkey0);
744 &aesenclast
($cmac,$rndkey0);
745 &jmp
(&label
("ccm64_dec_outer"));
747 &set_label
("ccm64_dec_break",16);
750 { &aesni_inline_generate1
("enc",$cmac,$in0); }
752 { &call
("_aesni_encrypt1",$cmac); }
754 &mov
("esp",&DWP
(48,"esp"));
755 &mov
($out,&wparam
(5));
756 &movups
(&QWP
(0,$out),$cmac);
757 &function_end
("aesni_ccm64_decrypt_blocks");
760 ######################################################################
761 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
762 # size_t blocks, const AES_KEY *key,
765 # Handles only complete blocks, operates on 32-bit counter and
766 # does not update *ivec! (see engine/eng_aesni.c for details)
770 # 16 vector addend: 0,6,6,6
771 # 32 counter-less ivec
772 # 48 1st triplet of counter vector
773 # 64 2nd triplet of counter vector
776 &function_begin
("aesni_ctr32_encrypt_blocks");
777 &mov
($inp,&wparam
(0));
778 &mov
($out,&wparam
(1));
779 &mov
($len,&wparam
(2));
780 &mov
($key,&wparam
(3));
781 &mov
($rounds_,&wparam
(4));
784 &and ("esp",-16); # align stack
785 &mov
(&DWP
(80,"esp"),$key_);
788 &je
(&label
("ctr32_one_shortcut"));
790 &movdqu
($inout5,&QWP
(0,$rounds_)); # load ivec
792 # compose byte-swap control mask for pshufb on stack
793 &mov
(&DWP
(0,"esp"),0x0c0d0e0f);
794 &mov
(&DWP
(4,"esp"),0x08090a0b);
795 &mov
(&DWP
(8,"esp"),0x04050607);
796 &mov
(&DWP
(12,"esp"),0x00010203);
798 # compose counter increment vector on stack
801 &mov
(&DWP
(16,"esp"),$rounds);
802 &mov
(&DWP
(20,"esp"),$rounds);
803 &mov
(&DWP
(24,"esp"),$rounds);
804 &mov
(&DWP
(28,"esp"),$key_);
806 &pextrd
($rounds_,$inout5,3); # pull 32-bit counter
807 &pinsrd
($inout5,$key_,3); # wipe 32-bit counter
809 &mov
($rounds,&DWP
(240,$key)); # key->rounds
811 # compose 2 vectors of 3x32-bit counters
813 &pxor
($rndkey1,$rndkey1);
814 &pxor
($rndkey0,$rndkey0);
815 &movdqa
($inout0,&QWP
(0,"esp")); # load byte-swap mask
816 &pinsrd
($rndkey1,$rounds_,0);
817 &lea
($key_,&DWP
(3,$rounds_));
818 &pinsrd
($rndkey0,$key_,0);
820 &pinsrd
($rndkey1,$rounds_,1);
822 &pinsrd
($rndkey0,$key_,1);
824 &pinsrd
($rndkey1,$rounds_,2);
826 &pinsrd
($rndkey0,$key_,2);
827 &movdqa
(&QWP
(48,"esp"),$rndkey1); # save 1st triplet
828 &pshufb
($rndkey1,$inout0); # byte swap
829 &movdqa
(&QWP
(64,"esp"),$rndkey0); # save 2nd triplet
830 &pshufb
($rndkey0,$inout0); # byte swap
832 &pshufd
($inout0,$rndkey1,3<<6); # place counter to upper dword
833 &pshufd
($inout1,$rndkey1,2<<6);
835 &jb
(&label
("ctr32_tail"));
836 &movdqa
(&QWP
(32,"esp"),$inout5); # save counter-less ivec
838 &mov
($key_,$key); # backup $key
839 &mov
($rounds_,$rounds); # backup $rounds
841 &jmp
(&label
("ctr32_loop6"));
843 &set_label
("ctr32_loop6",16);
844 &pshufd
($inout2,$rndkey1,1<<6);
845 &movdqa
($rndkey1,&QWP
(32,"esp")); # pull counter-less ivec
846 &pshufd
($inout3,$rndkey0,3<<6);
847 &por
($inout0,$rndkey1); # merge counter-less ivec
848 &pshufd
($inout4,$rndkey0,2<<6);
849 &por
($inout1,$rndkey1);
850 &pshufd
($inout5,$rndkey0,1<<6);
851 &por
($inout2,$rndkey1);
852 &por
($inout3,$rndkey1);
853 &por
($inout4,$rndkey1);
854 &por
($inout5,$rndkey1);
856 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
857 &$movekey ($rndkey0,&QWP
(0,$key_));
858 &$movekey ($rndkey1,&QWP
(16,$key_));
859 &lea
($key,&DWP
(32,$key_));
861 &pxor
($inout0,$rndkey0);
862 &pxor
($inout1,$rndkey0);
863 &aesenc
($inout0,$rndkey1);
864 &pxor
($inout2,$rndkey0);
865 &aesenc
($inout1,$rndkey1);
866 &pxor
($inout3,$rndkey0);
867 &aesenc
($inout2,$rndkey1);
868 &pxor
($inout4,$rndkey0);
869 &aesenc
($inout3,$rndkey1);
870 &pxor
($inout5,$rndkey0);
871 &aesenc
($inout4,$rndkey1);
872 &$movekey ($rndkey0,&QWP
(0,$key));
873 &aesenc
($inout5,$rndkey1);
875 &call
(&label
("_aesni_encrypt6_enter"));
877 &movups
($rndkey1,&QWP
(0,$inp));
878 &movups
($rndkey0,&QWP
(0x10,$inp));
879 &xorps
($inout0,$rndkey1);
880 &movups
($rndkey1,&QWP
(0x20,$inp));
881 &xorps
($inout1,$rndkey0);
882 &movups
(&QWP
(0,$out),$inout0);
883 &movdqa
($rndkey0,&QWP
(16,"esp")); # load increment
884 &xorps
($inout2,$rndkey1);
885 &movdqa
($rndkey1,&QWP
(48,"esp")); # load 1st triplet
886 &movups
(&QWP
(0x10,$out),$inout1);
887 &movups
(&QWP
(0x20,$out),$inout2);
889 &paddd
($rndkey1,$rndkey0); # 1st triplet increment
890 &paddd
($rndkey0,&QWP
(64,"esp")); # 2nd triplet increment
891 &movdqa
($inout0,&QWP
(0,"esp")); # load byte swap mask
893 &movups
($inout1,&QWP
(0x30,$inp));
894 &movups
($inout2,&QWP
(0x40,$inp));
895 &xorps
($inout3,$inout1);
896 &movups
($inout1,&QWP
(0x50,$inp));
897 &lea
($inp,&DWP
(0x60,$inp));
898 &movdqa
(&QWP
(48,"esp"),$rndkey1); # save 1st triplet
899 &pshufb
($rndkey1,$inout0); # byte swap
900 &xorps
($inout4,$inout2);
901 &movups
(&QWP
(0x30,$out),$inout3);
902 &xorps
($inout5,$inout1);
903 &movdqa
(&QWP
(64,"esp"),$rndkey0); # save 2nd triplet
904 &pshufb
($rndkey0,$inout0); # byte swap
905 &movups
(&QWP
(0x40,$out),$inout4);
906 &pshufd
($inout0,$rndkey1,3<<6);
907 &movups
(&QWP
(0x50,$out),$inout5);
908 &lea
($out,&DWP
(0x60,$out));
910 &mov
($rounds,$rounds_);
911 &pshufd
($inout1,$rndkey1,2<<6);
913 &jnc
(&label
("ctr32_loop6"));
916 &jz
(&label
("ctr32_ret"));
918 &lea
($rounds,&DWP
(1,"",$rounds,2)); # restore $rounds
919 &movdqa
($inout5,&QWP
(32,"esp")); # pull count-less ivec
921 &set_label
("ctr32_tail");
922 &por
($inout0,$inout5);
924 &jb
(&label
("ctr32_one"));
926 &pshufd
($inout2,$rndkey1,1<<6);
927 &por
($inout1,$inout5);
928 &je
(&label
("ctr32_two"));
930 &pshufd
($inout3,$rndkey0,3<<6);
931 &por
($inout2,$inout5);
933 &jb
(&label
("ctr32_three"));
935 &pshufd
($inout4,$rndkey0,2<<6);
936 &por
($inout3,$inout5);
937 &je
(&label
("ctr32_four"));
939 &por
($inout4,$inout5);
940 &call
("_aesni_encrypt6");
941 &movups
($rndkey1,&QWP
(0,$inp));
942 &movups
($rndkey0,&QWP
(0x10,$inp));
943 &xorps
($inout0,$rndkey1);
944 &movups
($rndkey1,&QWP
(0x20,$inp));
945 &xorps
($inout1,$rndkey0);
946 &movups
($rndkey0,&QWP
(0x30,$inp));
947 &xorps
($inout2,$rndkey1);
948 &movups
($rndkey1,&QWP
(0x40,$inp));
949 &xorps
($inout3,$rndkey0);
950 &movups
(&QWP
(0,$out),$inout0);
951 &xorps
($inout4,$rndkey1);
952 &movups
(&QWP
(0x10,$out),$inout1);
953 &movups
(&QWP
(0x20,$out),$inout2);
954 &movups
(&QWP
(0x30,$out),$inout3);
955 &movups
(&QWP
(0x40,$out),$inout4);
956 &jmp
(&label
("ctr32_ret"));
958 &set_label
("ctr32_one_shortcut",16);
959 &movups
($inout0,&QWP
(0,$rounds_)); # load ivec
960 &mov
($rounds,&DWP
(240,$key));
962 &set_label
("ctr32_one");
964 { &aesni_inline_generate1
("enc"); }
966 { &call
("_aesni_encrypt1"); }
967 &movups
($in0,&QWP
(0,$inp));
968 &xorps
($in0,$inout0);
969 &movups
(&QWP
(0,$out),$in0);
970 &jmp
(&label
("ctr32_ret"));
972 &set_label
("ctr32_two",16);
973 &call
("_aesni_encrypt3");
974 &movups
($inout3,&QWP
(0,$inp));
975 &movups
($inout4,&QWP
(0x10,$inp));
976 &xorps
($inout0,$inout3);
977 &xorps
($inout1,$inout4);
978 &movups
(&QWP
(0,$out),$inout0);
979 &movups
(&QWP
(0x10,$out),$inout1);
980 &jmp
(&label
("ctr32_ret"));
982 &set_label
("ctr32_three",16);
983 &call
("_aesni_encrypt3");
984 &movups
($inout3,&QWP
(0,$inp));
985 &movups
($inout4,&QWP
(0x10,$inp));
986 &xorps
($inout0,$inout3);
987 &movups
($inout5,&QWP
(0x20,$inp));
988 &xorps
($inout1,$inout4);
989 &movups
(&QWP
(0,$out),$inout0);
990 &xorps
($inout2,$inout5);
991 &movups
(&QWP
(0x10,$out),$inout1);
992 &movups
(&QWP
(0x20,$out),$inout2);
993 &jmp
(&label
("ctr32_ret"));
995 &set_label
("ctr32_four",16);
996 &call
("_aesni_encrypt4");
997 &movups
($inout4,&QWP
(0,$inp));
998 &movups
($inout5,&QWP
(0x10,$inp));
999 &movups
($rndkey1,&QWP
(0x20,$inp));
1000 &xorps
($inout0,$inout4);
1001 &movups
($rndkey0,&QWP
(0x30,$inp));
1002 &xorps
($inout1,$inout5);
1003 &movups
(&QWP
(0,$out),$inout0);
1004 &xorps
($inout2,$rndkey1);
1005 &movups
(&QWP
(0x10,$out),$inout1);
1006 &xorps
($inout3,$rndkey0);
1007 &movups
(&QWP
(0x20,$out),$inout2);
1008 &movups
(&QWP
(0x30,$out),$inout3);
1010 &set_label
("ctr32_ret");
1011 &mov
("esp",&DWP
(80,"esp"));
1012 &function_end
("aesni_ctr32_encrypt_blocks");
1014 ######################################################################
1015 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1016 # const AES_KEY *key1, const AES_KEY *key2,
1017 # const unsigned char iv[16]);
1019 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
# The aliases above reuse AES working registers for tweak bookkeeping:
# $tweak lives in $rndkey1, $twtmp in $rndkey0, $twres in $inout0 and
# $twmask in $inout1, so each of them is only valid while the aliased
# register is not needed for its primary role.
#
# aesni_xts_encrypt emitter.
# Arguments are fetched with wparam(): 0=inp, 1=out, 2=len, 3=key1,
# 4=key2, 5=clear-text tweak.
# Outline of the code emitted below:
#   1. the clear-text tweak is encrypted with key2;
#   2. an aligned stack frame is set up: slots 16*0..16*5 hold per-block
#      tweaks, 16*6 holds the 0x87/1 constant, 16*7+0 holds $len and
#      16*7+4 holds the value labelled "original %esp";
#   3. blocks are processed six at a time (xts_enc_loop6), then one to
#      five remaining blocks (xts_enc_short and the one/two/three/four
#      labels);
#   4. a byte-swapping loop (xts_enc_steal) followed by one more
#      single-block encryption handles the trailing partial block.
# NOTE(review): the conditional jumps rely on flags produced by
# arithmetic on $len; confirm those instructions against the full file.
1021 &function_begin
("aesni_xts_encrypt");
1022 &mov
($key,&wparam
(4)); # key2
1023 &mov
($inp,&wparam
(5)); # clear-text tweak
1025 &mov
($rounds,&DWP
(240,$key)); # key2->rounds
1026 &movups
($inout0,&QWP
(0,$inp));
# Encrypt the tweak in $inout0 with key2; the first form inlines the
# single-block code, the second calls the shared subroutine.
1028 { &aesni_inline_generate1
("enc"); }
1030 { &call
("_aesni_encrypt1"); }
# Load the data-processing arguments; from here on $key refers to key1.
1032 &mov
($inp,&wparam
(0));
1033 &mov
($out,&wparam
(1));
1034 &mov
($len,&wparam
(2));
1035 &mov
($key,&wparam
(3)); # key1
# Reserve 16*7+8 bytes and align the frame to 16 bytes so that movdqa
# can be used on the tweak slots.
1038 &sub ("esp",16*7+8);
1039 &mov
($rounds,&DWP
(240,$key)); # key1->rounds
1040 &and ("esp",-16); # align stack
# Build the 128-bit constant {0x87,0,1,0} (as four dwords) at 16*6(%esp).
1042 &mov
(&DWP
(16*6+0,"esp"),0x87); # compose the magic constant
1043 &mov
(&DWP
(16*6+4,"esp"),0);
1044 &mov
(&DWP
(16*6+8,"esp"),1);
1045 &mov
(&DWP
(16*6+12,"esp"),0);
1046 &mov
(&DWP
(16*7+0,"esp"),$len); # save original $len
1047 &mov
(&DWP
(16*7+4,"esp"),$key_); # save original %esp
# Move the encrypted tweak out of $inout0 and prime $twtmp with the
# sign-broadcast of its dwords (pcmpgtd of zero against the tweak).
1049 &movdqa
($tweak,$inout0);
1050 &pxor
($twtmp,$twtmp);
1051 &movdqa
($twmask,&QWP
(6*16,"esp")); # 0x0...010...87
1052 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1055 &mov
($key_,$key); # backup $key
1056 &mov
($rounds_,$rounds); # backup $rounds
1058 &jc
(&label
("xts_enc_short"));
1061 &mov
($rounds_,$rounds);
1062 &jmp
(&label
("xts_enc_loop6"));
# ---- main loop: six blocks per iteration ----
1064 &set_label
("xts_enc_loop6",16);
# Generator-time loop: each pass stores the current tweak in slot $i and
# then doubles it (paddq) and folds the carried-out bits back in via
# $twmask.
1065 for ($i=0;$i<4;$i++) {
1066 &pshufd
($twres,$twtmp,0x13);
1067 &pxor
($twtmp,$twtmp);
1068 &movdqa
(&QWP
(16*$i,"esp"),$tweak);
1069 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1070 &pand
($twres,$twmask); # isolate carry and residue
1071 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1072 &pxor
($tweak,$twres);
# Fifth tweak goes to the slot selected by 16*$i++; the sixth is built
# in $inout5 and stored further below as the "last tweak".
1074 &pshufd
($inout5,$twtmp,0x13);
1075 &movdqa
(&QWP
(16*$i++,"esp"),$tweak);
1076 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1077 &$movekey ($rndkey0,&QWP
(0,$key_));
1078 &pand
($inout5,$twmask); # isolate carry and residue
1079 &movups
($inout0,&QWP
(0,$inp)); # load input
1080 &pxor
($inout5,$tweak);
1082 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1083 &movdqu
($inout1,&QWP
(16*1,$inp));
1084 &xorps
($inout0,$rndkey0); # input^=rndkey[0]
1085 &movdqu
($inout2,&QWP
(16*2,$inp));
1086 &pxor
($inout1,$rndkey0);
1087 &movdqu
($inout3,&QWP
(16*3,$inp));
1088 &pxor
($inout2,$rndkey0);
1089 &movdqu
($inout4,&QWP
(16*4,$inp));
1090 &pxor
($inout3,$rndkey0);
# $rndkey1 temporarily carries the sixth input block, because $inout5
# still holds the sixth tweak at this point.
1091 &movdqu
($rndkey1,&QWP
(16*5,$inp));
1092 &pxor
($inout4,$rndkey0);
1093 &lea
($inp,&DWP
(16*6,$inp));
1094 &pxor
($inout0,&QWP
(16*0,"esp")); # input^=tweak
1095 &movdqa
(&QWP
(16*$i,"esp"),$inout5); # save last tweak
1096 &pxor
($inout5,$rndkey1);
# First AES round is interleaved with the remaining input^=tweak xors;
# the rest of the rounds run inside _aesni_encrypt6_enter.
1098 &$movekey ($rndkey1,&QWP
(16,$key_));
1099 &lea
($key,&DWP
(32,$key_));
1100 &pxor
($inout1,&QWP
(16*1,"esp"));
1101 &aesenc
($inout0,$rndkey1);
1102 &pxor
($inout2,&QWP
(16*2,"esp"));
1103 &aesenc
($inout1,$rndkey1);
1104 &pxor
($inout3,&QWP
(16*3,"esp"));
1106 &aesenc
($inout2,$rndkey1);
1107 &pxor
($inout4,&QWP
(16*4,"esp"));
1108 &aesenc
($inout3,$rndkey1);
1109 &pxor
($inout5,$rndkey0);
1110 &aesenc
($inout4,$rndkey1);
1111 &$movekey ($rndkey0,&QWP
(0,$key));
1112 &aesenc
($inout5,$rndkey1);
1113 &call
(&label
("_aesni_encrypt6_enter"));
# Post-process: xor each result with its saved tweak, store it, and
# start computing the tweak for the next iteration in parallel.
1115 &movdqa
($tweak,&QWP
(16*5,"esp")); # last tweak
1116 &pxor
($twtmp,$twtmp);
1117 &xorps
($inout0,&QWP
(16*0,"esp")); # output^=tweak
1118 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1119 &xorps
($inout1,&QWP
(16*1,"esp"));
1120 &movups
(&QWP
(16*0,$out),$inout0); # write output
1121 &xorps
($inout2,&QWP
(16*2,"esp"));
1122 &movups
(&QWP
(16*1,$out),$inout1);
1123 &xorps
($inout3,&QWP
(16*3,"esp"));
1124 &movups
(&QWP
(16*2,$out),$inout2);
1125 &xorps
($inout4,&QWP
(16*4,"esp"));
1126 &movups
(&QWP
(16*3,$out),$inout3);
1127 &xorps
($inout5,$tweak);
1128 &movups
(&QWP
(16*4,$out),$inout4);
1129 &pshufd
($twres,$twtmp,0x13);
1130 &movups
(&QWP
(16*5,$out),$inout5);
1131 &lea
($out,&DWP
(16*6,$out));
# $twmask aliases $inout1, which was just used for data, so the constant
# is reloaded from the stack.
1132 &movdqa
($twmask,&QWP
(16*6,"esp")); # 0x0...010...87
1134 &pxor
($twtmp,$twtmp);
1135 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1136 &pand
($twres,$twmask); # isolate carry and residue
1137 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1138 &mov
($rounds,$rounds_); # restore $rounds
1139 &pxor
($tweak,$twres);
1142 &jnc
(&label
("xts_enc_loop6"));
1144 &lea
($rounds,&DWP
(1,"",$rounds,2)); # restore $rounds
1145 &mov
($key,$key_); # restore $key
1146 &mov
($rounds_,$rounds);
# ---- fewer than six blocks remain ----
1148 &set_label
("xts_enc_short");
1150 &jz
(&label
("xts_enc_done6x"));
# Tweaks for the short paths are parked in $inout3, $inout4, $inout5 (and
# on the stack once four or more are needed); each branch below is taken
# as soon as enough tweaks have been produced.
1152 &movdqa
($inout3,$tweak); # put aside previous tweak
1154 &jb
(&label
("xts_enc_one"));
1156 &pshufd
($twres,$twtmp,0x13);
1157 &pxor
($twtmp,$twtmp);
1158 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1159 &pand
($twres,$twmask); # isolate carry and residue
1160 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1161 &pxor
($tweak,$twres);
1162 &je
(&label
("xts_enc_two"));
1164 &pshufd
($twres,$twtmp,0x13);
1165 &pxor
($twtmp,$twtmp);
1166 &movdqa
($inout4,$tweak); # put aside previous tweak
1167 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1168 &pand
($twres,$twmask); # isolate carry and residue
1169 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1170 &pxor
($tweak,$twres);
1172 &jb
(&label
("xts_enc_three"));
1174 &pshufd
($twres,$twtmp,0x13);
1175 &pxor
($twtmp,$twtmp);
1176 &movdqa
($inout5,$tweak); # put aside previous tweak
1177 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1178 &pand
($twres,$twmask); # isolate carry and residue
1179 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1180 &pxor
($tweak,$twres);
1181 &movdqa
(&QWP
(16*0,"esp"),$inout3);
1182 &movdqa
(&QWP
(16*1,"esp"),$inout4);
1183 &je
(&label
("xts_enc_four"));
# Five-block case: all five tweaks go to stack slots 0..4 and the blocks
# are run through _aesni_encrypt6.
1185 &movdqa
(&QWP
(16*2,"esp"),$inout5);
1186 &pshufd
($inout5,$twtmp,0x13);
1187 &movdqa
(&QWP
(16*3,"esp"),$tweak);
1188 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1189 &pand
($inout5,$twmask); # isolate carry and residue
1190 &pxor
($inout5,$tweak);
1192 &movdqu
($inout0,&QWP
(16*0,$inp)); # load input
1193 &movdqu
($inout1,&QWP
(16*1,$inp));
1194 &movdqu
($inout2,&QWP
(16*2,$inp));
1195 &pxor
($inout0,&QWP
(16*0,"esp")); # input^=tweak
1196 &movdqu
($inout3,&QWP
(16*3,$inp));
1197 &pxor
($inout1,&QWP
(16*1,"esp"));
1198 &movdqu
($inout4,&QWP
(16*4,$inp));
1199 &pxor
($inout2,&QWP
(16*2,"esp"));
1200 &lea
($inp,&DWP
(16*5,$inp));
1201 &pxor
($inout3,&QWP
(16*3,"esp"));
1202 &movdqa
(&QWP
(16*4,"esp"),$inout5); # save last tweak
1203 &pxor
($inout4,$inout5);
1205 &call
("_aesni_encrypt6");
1207 &movaps
($tweak,&QWP
(16*4,"esp")); # last tweak
1208 &xorps
($inout0,&QWP
(16*0,"esp")); # output^=tweak
1209 &xorps
($inout1,&QWP
(16*1,"esp"));
1210 &xorps
($inout2,&QWP
(16*2,"esp"));
1211 &movups
(&QWP
(16*0,$out),$inout0); # write output
1212 &xorps
($inout3,&QWP
(16*3,"esp"));
1213 &movups
(&QWP
(16*1,$out),$inout1);
1214 &xorps
($inout4,$tweak);
1215 &movups
(&QWP
(16*2,$out),$inout2);
1216 &movups
(&QWP
(16*3,$out),$inout3);
1217 &movups
(&QWP
(16*4,$out),$inout4);
1218 &lea
($out,&DWP
(16*5,$out));
1219 &jmp
(&label
("xts_enc_done"));
# ---- one block: tweak is in $inout3 ----
1221 &set_label
("xts_enc_one",16);
1222 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1223 &lea
($inp,&DWP
(16*1,$inp));
1224 &xorps
($inout0,$inout3); # input^=tweak
1226 { &aesni_inline_generate1
("enc"); }
1228 { &call
("_aesni_encrypt1"); }
1229 &xorps
($inout0,$inout3); # output^=tweak
1230 &movups
(&QWP
(16*0,$out),$inout0); # write output
1231 &lea
($out,&DWP
(16*1,$out));
1233 &movdqa
($tweak,$inout3); # last tweak
1234 &jmp
(&label
("xts_enc_done"));
# ---- two blocks: tweaks in $inout3/$inout4; the third lane handed to
# _aesni_encrypt3 is zeroed and its result is ignored ----
1236 &set_label
("xts_enc_two",16);
1237 &movaps
($inout4,$tweak); # put aside last tweak
1239 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1240 &movups
($inout1,&QWP
(16*1,$inp));
1241 &lea
($inp,&DWP
(16*2,$inp));
1242 &xorps
($inout0,$inout3); # input^=tweak
1243 &xorps
($inout1,$inout4);
1244 &xorps
($inout2,$inout2);
1246 &call
("_aesni_encrypt3");
1248 &xorps
($inout0,$inout3); # output^=tweak
1249 &xorps
($inout1,$inout4);
1250 &movups
(&QWP
(16*0,$out),$inout0); # write output
1251 &movups
(&QWP
(16*1,$out),$inout1);
1252 &lea
($out,&DWP
(16*2,$out));
1254 &movdqa
($tweak,$inout4); # last tweak
1255 &jmp
(&label
("xts_enc_done"));
# ---- three blocks: tweaks in $inout3/$inout4/$inout5 ----
1257 &set_label
("xts_enc_three",16);
1258 &movaps
($inout5,$tweak); # put aside last tweak
1259 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1260 &movups
($inout1,&QWP
(16*1,$inp));
1261 &movups
($inout2,&QWP
(16*2,$inp));
1262 &lea
($inp,&DWP
(16*3,$inp));
1263 &xorps
($inout0,$inout3); # input^=tweak
1264 &xorps
($inout1,$inout4);
1265 &xorps
($inout2,$inout5);
1267 &call
("_aesni_encrypt3");
1269 &xorps
($inout0,$inout3); # output^=tweak
1270 &xorps
($inout1,$inout4);
1271 &xorps
($inout2,$inout5);
1272 &movups
(&QWP
(16*0,$out),$inout0); # write output
1273 &movups
(&QWP
(16*1,$out),$inout1);
1274 &movups
(&QWP
(16*2,$out),$inout2);
1275 &lea
($out,&DWP
(16*3,$out));
1277 &movdqa
($tweak,$inout5); # last tweak
1278 &jmp
(&label
("xts_enc_done"));
# ---- four blocks: first two tweaks were spilled to stack slots 0 and 1
# (because $inout3 is needed for data), the last two stay in
# $inout5/$inout4 ----
1280 &set_label
("xts_enc_four",16);
1281 &movaps
($inout4,$tweak); # put aside last tweak
1283 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1284 &movups
($inout1,&QWP
(16*1,$inp));
1285 &movups
($inout2,&QWP
(16*2,$inp));
1286 &xorps
($inout0,&QWP
(16*0,"esp")); # input^=tweak
1287 &movups
($inout3,&QWP
(16*3,$inp));
1288 &lea
($inp,&DWP
(16*4,$inp));
1289 &xorps
($inout1,&QWP
(16*1,"esp"));
1290 &xorps
($inout2,$inout5);
1291 &xorps
($inout3,$inout4);
1293 &call
("_aesni_encrypt4");
1295 &xorps
($inout0,&QWP
(16*0,"esp")); # output^=tweak
1296 &xorps
($inout1,&QWP
(16*1,"esp"));
1297 &xorps
($inout2,$inout5);
1298 &movups
(&QWP
(16*0,$out),$inout0); # write output
1299 &xorps
($inout3,$inout4);
1300 &movups
(&QWP
(16*1,$out),$inout1);
1301 &movups
(&QWP
(16*2,$out),$inout2);
1302 &movups
(&QWP
(16*3,$out),$inout3);
1303 &lea
($out,&DWP
(16*4,$out));
1305 &movdqa
($tweak,$inout4); # last tweak
1306 &jmp
(&label
("xts_enc_done"));
# ---- exit from the 6x loop with no whole blocks left: the next tweak
# is already in $tweak, so it is copied to $inout3 for the stealing
# step ----
1308 &set_label
("xts_enc_done6x",16); # $tweak is pre-calculated
1309 &mov
($len,&DWP
(16*7+0,"esp")); # restore original $len
1311 &jz
(&label
("xts_enc_ret"));
1312 &movdqa
($inout3,$tweak);
1313 &mov
(&DWP
(16*7+0,"esp"),$len); # save $len%16
1314 &jmp
(&label
("xts_enc_steal"));
# ---- exit from the short paths: $tweak holds the last tweak used, so
# one more doubling step produces the stealing tweak in $inout3 ----
1316 &set_label
("xts_enc_done",16);
1317 &mov
($len,&DWP
(16*7+0,"esp")); # restore original $len
1318 &pxor
($twtmp,$twtmp);
1320 &jz
(&label
("xts_enc_ret"));
1322 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1323 &mov
(&DWP
(16*7+0,"esp"),$len); # save $len%16
1324 &pshufd
($inout3,$twtmp,0x13);
1325 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1326 &pand
($inout3,&QWP
(16*6,"esp")); # isolate carry and residue
1327 &pxor
($inout3,$tweak);
# ---- byte loop: each pass copies one input byte over out[-16] and
# moves the byte that was at out[-16] to out[0]; $rounds and $key are
# used as scratch byte registers here ----
1329 &set_label
("xts_enc_steal");
1330 &movz
($rounds,&BP
(0,$inp));
1331 &movz
($key,&BP
(-16,$out));
1332 &lea
($inp,&DWP
(1,$inp));
1333 &mov
(&BP
(-16,$out),&LB
($rounds));
1334 &mov
(&BP
(0,$out),&LB
($key));
1335 &lea
($out,&DWP
(1,$out));
1337 &jnz
(&label
("xts_enc_steal"));
# Undo the per-byte advance of $out, restore the scratch registers and
# encrypt the rebuilt block at out[-16] with the tweak in $inout3.
1339 &sub ($out,&DWP
(16*7+0,"esp")); # rewind $out
1340 &mov
($key,$key_); # restore $key
1341 &mov
($rounds,$rounds_); # restore $rounds
1343 &movups
($inout0,&QWP
(-16,$out)); # load input
1344 &xorps
($inout0,$inout3); # input^=tweak
1346 { &aesni_inline_generate1
("enc"); }
1348 { &call
("_aesni_encrypt1"); }
1349 &xorps
($inout0,$inout3); # output^=tweak
1350 &movups
(&QWP
(-16,$out),$inout0); # write output
# Common exit: reload the stack pointer saved at 16*7+4.
1352 &set_label
("xts_enc_ret");
1353 &mov
("esp",&DWP
(16*7+4,"esp")); # restore %esp
1354 &function_end
("aesni_xts_encrypt");
# aesni_xts_decrypt emitter.
# Arguments are fetched with wparam(): 0=inp, 1=out, 2=len, 3=key1,
# 4=key2, 5=clear-text tweak.
# Structure mirrors the encrypt routine, with these visible differences:
#   - the initial tweak is still produced with the *encrypt* primitive
#     under key2, while data blocks use aesdec/_aesni_decrypt*;
#   - when a partial trailing block exists, the last full block is held
#     back (see the "if(len%16) len-=16" step) and handled at
#     xts_dec_only_one_more with the two final tweaks applied in swapped
#     order around the byte-stealing loop.
# Stack frame: slots 16*0..16*5 hold per-block tweaks, 16*6 the 0x87/1
# constant, 16*7+0 $len and 16*7+4 the value labelled "original %esp".
# NOTE(review): the conditional jumps rely on flags produced by
# arithmetic on $len; confirm those instructions against the full file.
1356 &function_begin
("aesni_xts_decrypt");
1357 &mov
($key,&wparam
(4)); # key2
1358 &mov
($inp,&wparam
(5)); # clear-text tweak
1360 &mov
($rounds,&DWP
(240,$key)); # key2->rounds
1361 &movups
($inout0,&QWP
(0,$inp));
# Tweak generation uses encryption even on the decrypt path.
1363 { &aesni_inline_generate1
("enc"); }
1365 { &call
("_aesni_encrypt1"); }
# Load the data-processing arguments; from here on $key refers to key1.
1367 &mov
($inp,&wparam
(0));
1368 &mov
($out,&wparam
(1));
1369 &mov
($len,&wparam
(2));
1370 &mov
($key,&wparam
(3)); # key1
# Reserve and align the frame.
1373 &sub ("esp",16*7+8);
1374 &and ("esp",-16); # align stack
# $rounds_ is used as a scratch register for the length adjustment; it
# is reloaded with the real round count further below.
1376 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1378 &setnz
(&LB
($rounds_));
1380 &sub ($len,$rounds_);
# Build the 128-bit constant {0x87,0,1,0} (as four dwords) at 16*6(%esp).
1382 &mov
(&DWP
(16*6+0,"esp"),0x87); # compose the magic constant
1383 &mov
(&DWP
(16*6+4,"esp"),0);
1384 &mov
(&DWP
(16*6+8,"esp"),1);
1385 &mov
(&DWP
(16*6+12,"esp"),0);
1386 &mov
(&DWP
(16*7+0,"esp"),$len); # save original $len
1387 &mov
(&DWP
(16*7+4,"esp"),$key_); # save original %esp
1389 &mov
($rounds,&DWP
(240,$key)); # key1->rounds
1390 &mov
($key_,$key); # backup $key
1391 &mov
($rounds_,$rounds); # backup $rounds
# Move the encrypted tweak out of $inout0 and prime $twtmp with the
# sign-broadcast of its dwords.
1393 &movdqa
($tweak,$inout0);
1394 &pxor
($twtmp,$twtmp);
1395 &movdqa
($twmask,&QWP
(6*16,"esp")); # 0x0...010...87
1396 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1400 &jc
(&label
("xts_dec_short"));
1403 &mov
($rounds_,$rounds);
1404 &jmp
(&label
("xts_dec_loop6"));
# ---- main loop: six blocks per iteration ----
1406 &set_label
("xts_dec_loop6",16);
# Generator-time loop: each pass stores the current tweak in slot $i and
# then doubles it (paddq) and folds the carried-out bits back in via
# $twmask.
1407 for ($i=0;$i<4;$i++) {
1408 &pshufd
($twres,$twtmp,0x13);
1409 &pxor
($twtmp,$twtmp);
1410 &movdqa
(&QWP
(16*$i,"esp"),$tweak);
1411 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1412 &pand
($twres,$twmask); # isolate carry and residue
1413 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1414 &pxor
($tweak,$twres);
# Fifth tweak goes to the slot selected by 16*$i++; the sixth is built
# in $inout5 and stored further below as the "last tweak".
1416 &pshufd
($inout5,$twtmp,0x13);
1417 &movdqa
(&QWP
(16*$i++,"esp"),$tweak);
1418 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1419 &$movekey ($rndkey0,&QWP
(0,$key_));
1420 &pand
($inout5,$twmask); # isolate carry and residue
1421 &movups
($inout0,&QWP
(0,$inp)); # load input
1422 &pxor
($inout5,$tweak);
1424 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1425 &movdqu
($inout1,&QWP
(16*1,$inp));
1426 &xorps
($inout0,$rndkey0); # input^=rndkey[0]
1427 &movdqu
($inout2,&QWP
(16*2,$inp));
1428 &pxor
($inout1,$rndkey0);
1429 &movdqu
($inout3,&QWP
(16*3,$inp));
1430 &pxor
($inout2,$rndkey0);
1431 &movdqu
($inout4,&QWP
(16*4,$inp));
1432 &pxor
($inout3,$rndkey0);
# $rndkey1 temporarily carries the sixth input block, because $inout5
# still holds the sixth tweak at this point.
1433 &movdqu
($rndkey1,&QWP
(16*5,$inp));
1434 &pxor
($inout4,$rndkey0);
1435 &lea
($inp,&DWP
(16*6,$inp));
1436 &pxor
($inout0,&QWP
(16*0,"esp")); # input^=tweak
1437 &movdqa
(&QWP
(16*$i,"esp"),$inout5); # save last tweak
1438 &pxor
($inout5,$rndkey1);
# First AES round is interleaved with the remaining input^=tweak xors;
# the rest of the rounds run inside _aesni_decrypt6_enter.
1440 &$movekey ($rndkey1,&QWP
(16,$key_));
1441 &lea
($key,&DWP
(32,$key_));
1442 &pxor
($inout1,&QWP
(16*1,"esp"));
1443 &aesdec
($inout0,$rndkey1);
1444 &pxor
($inout2,&QWP
(16*2,"esp"));
1445 &aesdec
($inout1,$rndkey1);
1446 &pxor
($inout3,&QWP
(16*3,"esp"));
1448 &aesdec
($inout2,$rndkey1);
1449 &pxor
($inout4,&QWP
(16*4,"esp"));
1450 &aesdec
($inout3,$rndkey1);
1451 &pxor
($inout5,$rndkey0);
1452 &aesdec
($inout4,$rndkey1);
1453 &$movekey ($rndkey0,&QWP
(0,$key));
1454 &aesdec
($inout5,$rndkey1);
1455 &call
(&label
("_aesni_decrypt6_enter"));
# Post-process: xor each result with its saved tweak, store it, and
# start computing the tweak for the next iteration in parallel.
1457 &movdqa
($tweak,&QWP
(16*5,"esp")); # last tweak
1458 &pxor
($twtmp,$twtmp);
1459 &xorps
($inout0,&QWP
(16*0,"esp")); # output^=tweak
1460 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1461 &xorps
($inout1,&QWP
(16*1,"esp"));
1462 &movups
(&QWP
(16*0,$out),$inout0); # write output
1463 &xorps
($inout2,&QWP
(16*2,"esp"));
1464 &movups
(&QWP
(16*1,$out),$inout1);
1465 &xorps
($inout3,&QWP
(16*3,"esp"));
1466 &movups
(&QWP
(16*2,$out),$inout2);
1467 &xorps
($inout4,&QWP
(16*4,"esp"));
1468 &movups
(&QWP
(16*3,$out),$inout3);
1469 &xorps
($inout5,$tweak);
1470 &movups
(&QWP
(16*4,$out),$inout4);
1471 &pshufd
($twres,$twtmp,0x13);
1472 &movups
(&QWP
(16*5,$out),$inout5);
1473 &lea
($out,&DWP
(16*6,$out));
# $twmask aliases $inout1, which was just used for data, so the constant
# is reloaded from the stack.
1474 &movdqa
($twmask,&QWP
(16*6,"esp")); # 0x0...010...87
1476 &pxor
($twtmp,$twtmp);
1477 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1478 &pand
($twres,$twmask); # isolate carry and residue
1479 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1480 &mov
($rounds,$rounds_); # restore $rounds
1481 &pxor
($tweak,$twres);
1484 &jnc
(&label
("xts_dec_loop6"));
1486 &lea
($rounds,&DWP
(1,"",$rounds,2)); # restore $rounds
1487 &mov
($key,$key_); # restore $key
1488 &mov
($rounds_,$rounds);
# ---- fewer than six blocks remain ----
1490 &set_label
("xts_dec_short");
1492 &jz
(&label
("xts_dec_done6x"));
# Tweaks for the short paths are parked in $inout3, $inout4, $inout5 (and
# on the stack once four or more are needed); each branch below is taken
# as soon as enough tweaks have been produced.
1494 &movdqa
($inout3,$tweak); # put aside previous tweak
1496 &jb
(&label
("xts_dec_one"));
1498 &pshufd
($twres,$twtmp,0x13);
1499 &pxor
($twtmp,$twtmp);
1500 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1501 &pand
($twres,$twmask); # isolate carry and residue
1502 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1503 &pxor
($tweak,$twres);
1504 &je
(&label
("xts_dec_two"));
1506 &pshufd
($twres,$twtmp,0x13);
1507 &pxor
($twtmp,$twtmp);
1508 &movdqa
($inout4,$tweak); # put aside previous tweak
1509 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1510 &pand
($twres,$twmask); # isolate carry and residue
1511 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1512 &pxor
($tweak,$twres);
1514 &jb
(&label
("xts_dec_three"));
1516 &pshufd
($twres,$twtmp,0x13);
1517 &pxor
($twtmp,$twtmp);
1518 &movdqa
($inout5,$tweak); # put aside previous tweak
1519 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1520 &pand
($twres,$twmask); # isolate carry and residue
1521 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1522 &pxor
($tweak,$twres);
1523 &movdqa
(&QWP
(16*0,"esp"),$inout3);
1524 &movdqa
(&QWP
(16*1,"esp"),$inout4);
1525 &je
(&label
("xts_dec_four"));
# Five-block case: all five tweaks go to stack slots 0..4 and the blocks
# are run through _aesni_decrypt6.
1527 &movdqa
(&QWP
(16*2,"esp"),$inout5);
1528 &pshufd
($inout5,$twtmp,0x13);
1529 &movdqa
(&QWP
(16*3,"esp"),$tweak);
1530 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1531 &pand
($inout5,$twmask); # isolate carry and residue
1532 &pxor
($inout5,$tweak);
1534 &movdqu
($inout0,&QWP
(16*0,$inp)); # load input
1535 &movdqu
($inout1,&QWP
(16*1,$inp));
1536 &movdqu
($inout2,&QWP
(16*2,$inp));
1537 &pxor
($inout0,&QWP
(16*0,"esp")); # input^=tweak
1538 &movdqu
($inout3,&QWP
(16*3,$inp));
1539 &pxor
($inout1,&QWP
(16*1,"esp"));
1540 &movdqu
($inout4,&QWP
(16*4,$inp));
1541 &pxor
($inout2,&QWP
(16*2,"esp"));
1542 &lea
($inp,&DWP
(16*5,$inp));
1543 &pxor
($inout3,&QWP
(16*3,"esp"));
1544 &movdqa
(&QWP
(16*4,"esp"),$inout5); # save last tweak
1545 &pxor
($inout4,$inout5);
1547 &call
("_aesni_decrypt6");
1549 &movaps
($tweak,&QWP
(16*4,"esp")); # last tweak
1550 &xorps
($inout0,&QWP
(16*0,"esp")); # output^=tweak
1551 &xorps
($inout1,&QWP
(16*1,"esp"));
1552 &xorps
($inout2,&QWP
(16*2,"esp"));
1553 &movups
(&QWP
(16*0,$out),$inout0); # write output
1554 &xorps
($inout3,&QWP
(16*3,"esp"));
1555 &movups
(&QWP
(16*1,$out),$inout1);
1556 &xorps
($inout4,$tweak);
1557 &movups
(&QWP
(16*2,$out),$inout2);
1558 &movups
(&QWP
(16*3,$out),$inout3);
1559 &movups
(&QWP
(16*4,$out),$inout4);
1560 &lea
($out,&DWP
(16*5,$out));
1561 &jmp
(&label
("xts_dec_done"));
# ---- one block: tweak is in $inout3 ----
1563 &set_label
("xts_dec_one",16);
1564 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1565 &lea
($inp,&DWP
(16*1,$inp));
1566 &xorps
($inout0,$inout3); # input^=tweak
1568 { &aesni_inline_generate1
("dec"); }
1570 { &call
("_aesni_decrypt1"); }
1571 &xorps
($inout0,$inout3); # output^=tweak
1572 &movups
(&QWP
(16*0,$out),$inout0); # write output
1573 &lea
($out,&DWP
(16*1,$out));
1575 &movdqa
($tweak,$inout3); # last tweak
1576 &jmp
(&label
("xts_dec_done"));
# ---- two blocks: tweaks in $inout3/$inout4, processed through the
# three-lane helper.
# NOTE(review): unlike xts_enc_two, $inout2 is not cleared before the
# call in this excerpt; the third lane's result is unused ----
1578 &set_label
("xts_dec_two",16);
1579 &movaps
($inout4,$tweak); # put aside last tweak
1581 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1582 &movups
($inout1,&QWP
(16*1,$inp));
1583 &lea
($inp,&DWP
(16*2,$inp));
1584 &xorps
($inout0,$inout3); # input^=tweak
1585 &xorps
($inout1,$inout4);
1587 &call
("_aesni_decrypt3");
1589 &xorps
($inout0,$inout3); # output^=tweak
1590 &xorps
($inout1,$inout4);
1591 &movups
(&QWP
(16*0,$out),$inout0); # write output
1592 &movups
(&QWP
(16*1,$out),$inout1);
1593 &lea
($out,&DWP
(16*2,$out));
1595 &movdqa
($tweak,$inout4); # last tweak
1596 &jmp
(&label
("xts_dec_done"));
# ---- three blocks: tweaks in $inout3/$inout4/$inout5 ----
1598 &set_label
("xts_dec_three",16);
1599 &movaps
($inout5,$tweak); # put aside last tweak
1600 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1601 &movups
($inout1,&QWP
(16*1,$inp));
1602 &movups
($inout2,&QWP
(16*2,$inp));
1603 &lea
($inp,&DWP
(16*3,$inp));
1604 &xorps
($inout0,$inout3); # input^=tweak
1605 &xorps
($inout1,$inout4);
1606 &xorps
($inout2,$inout5);
1608 &call
("_aesni_decrypt3");
1610 &xorps
($inout0,$inout3); # output^=tweak
1611 &xorps
($inout1,$inout4);
1612 &xorps
($inout2,$inout5);
1613 &movups
(&QWP
(16*0,$out),$inout0); # write output
1614 &movups
(&QWP
(16*1,$out),$inout1);
1615 &movups
(&QWP
(16*2,$out),$inout2);
1616 &lea
($out,&DWP
(16*3,$out));
1618 &movdqa
($tweak,$inout5); # last tweak
1619 &jmp
(&label
("xts_dec_done"));
# ---- four blocks: first two tweaks were spilled to stack slots 0 and 1
# (because $inout3 is needed for data), the last two stay in
# $inout5/$inout4 ----
1621 &set_label
("xts_dec_four",16);
1622 &movaps
($inout4,$tweak); # put aside last tweak
1624 &movups
($inout0,&QWP
(16*0,$inp)); # load input
1625 &movups
($inout1,&QWP
(16*1,$inp));
1626 &movups
($inout2,&QWP
(16*2,$inp));
1627 &xorps
($inout0,&QWP
(16*0,"esp")); # input^=tweak
1628 &movups
($inout3,&QWP
(16*3,$inp));
1629 &lea
($inp,&DWP
(16*4,$inp));
1630 &xorps
($inout1,&QWP
(16*1,"esp"));
1631 &xorps
($inout2,$inout5);
1632 &xorps
($inout3,$inout4);
1634 &call
("_aesni_decrypt4");
1636 &xorps
($inout0,&QWP
(16*0,"esp")); # output^=tweak
1637 &xorps
($inout1,&QWP
(16*1,"esp"));
1638 &xorps
($inout2,$inout5);
1639 &movups
(&QWP
(16*0,$out),$inout0); # write output
1640 &xorps
($inout3,$inout4);
1641 &movups
(&QWP
(16*1,$out),$inout1);
1642 &movups
(&QWP
(16*2,$out),$inout2);
1643 &movups
(&QWP
(16*3,$out),$inout3);
1644 &lea
($out,&DWP
(16*4,$out));
1646 &movdqa
($tweak,$inout4); # last tweak
1647 &jmp
(&label
("xts_dec_done"));
# ---- exit from the 6x loop with no whole blocks left in this pass: the
# next tweak is already in $tweak, so the extra doubling step performed
# at xts_dec_done is skipped ----
1649 &set_label
("xts_dec_done6x",16); # $tweak is pre-calculated
1650 &mov
($len,&DWP
(16*7+0,"esp")); # restore original $len
1652 &jz
(&label
("xts_dec_ret"));
1653 &mov
(&DWP
(16*7+0,"esp"),$len); # save $len%16
1654 &jmp
(&label
("xts_dec_only_one_more"));
# ---- exit from the short paths: $tweak holds the last tweak used, so
# it is advanced once here before falling into xts_dec_only_one_more ----
1656 &set_label
("xts_dec_done",16);
1657 &mov
($len,&DWP
(16*7+0,"esp")); # restore original $len
1658 &pxor
($twtmp,$twtmp);
1660 &jz
(&label
("xts_dec_ret"));
1662 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1663 &mov
(&DWP
(16*7+0,"esp"),$len); # save $len%16
1664 &pshufd
($twres,$twtmp,0x13);
1665 &pxor
($twtmp,$twtmp);
1666 &movdqa
($twmask,&QWP
(16*6,"esp"));
1667 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1668 &pand
($twres,$twmask); # isolate carry and residue
1669 &pcmpgtd
($twtmp,$tweak); # broadcast upper bits
1670 &pxor
($tweak,$twres);
# ---- held-back full block: the current tweak is kept in $inout4 and
# the following tweak is built in $inout3; the block at $inp is
# decrypted with the *following* tweak first ----
1672 &set_label
("xts_dec_only_one_more");
1673 &pshufd
($inout3,$twtmp,0x13);
1674 &movdqa
($inout4,$tweak); # put aside previous tweak
1675 &paddq
($tweak,$tweak); # &psllq($tweak,1);
1676 &pand
($inout3,$twmask); # isolate carry and residue
1677 &pxor
($inout3,$tweak);
1679 &mov
($key,$key_); # restore $key
1680 &mov
($rounds,$rounds_); # restore $rounds
1682 &movups
($inout0,&QWP
(0,$inp)); # load input
1683 &xorps
($inout0,$inout3); # input^=tweak
1685 { &aesni_inline_generate1
("dec"); }
1687 { &call
("_aesni_decrypt1"); }
1688 &xorps
($inout0,$inout3); # output^=tweak
1689 &movups
(&QWP
(0,$out),$inout0); # write output
# ---- byte loop: each pass copies the input byte at inp[16] over out[0]
# and moves the byte that was at out[0] to out[16]; $rounds and $key are
# used as scratch byte registers here ----
1691 &set_label
("xts_dec_steal");
1692 &movz
($rounds,&BP
(16,$inp));
1693 &movz
($key,&BP
(0,$out));
1694 &lea
($inp,&DWP
(1,$inp));
1695 &mov
(&BP
(0,$out),&LB
($rounds));
1696 &mov
(&BP
(16,$out),&LB
($key));
1697 &lea
($out,&DWP
(1,$out));
1699 &jnz
(&label
("xts_dec_steal"));
# Undo the per-byte advance of $out, restore the scratch registers and
# decrypt the rebuilt block at out[0] with the earlier tweak ($inout4).
1701 &sub ($out,&DWP
(16*7+0,"esp")); # rewind $out
1702 &mov
($key,$key_); # restore $key
1703 &mov
($rounds,$rounds_); # restore $rounds
1705 &movups
($inout0,&QWP
(0,$out)); # load input
1706 &xorps
($inout0,$inout4); # input^=tweak
1708 { &aesni_inline_generate1
("dec"); }
1710 { &call
("_aesni_decrypt1"); }
1711 &xorps
($inout0,$inout4); # output^=tweak
1712 &movups
(&QWP
(0,$out),$inout0); # write output
# Common exit: reload the stack pointer saved at 16*7+4.
1714 &set_label
("xts_dec_ret");
1715 &mov
("esp",&DWP
(16*7+4,"esp")); # restore %esp
1716 &function_end
("aesni_xts_decrypt");
1720 ######################################################################
1721 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
1722 # size_t length, const AES_KEY *key,
1723 # unsigned char *ivp,const int enc);
# ${PREFIX}_cbc_encrypt emitter.
# Arguments are fetched with wparam(): 0=inp, 1=out, 2=length, 3=key,
# 4=ivp, 5=enc.  wparam(5) is compared with 0 and the "je cbc_decrypt"
# below selects the decrypt path when it is zero.
# A 16-byte aligned scratch area is carved out of the stack ("alloca"):
# 0(%esp) is used for a saved IV / partial output block and 16(%esp)
# keeps the previous stack pointer, which cbc_ret reloads.
# On the way out the running IV in $ivec is written back through ivp.
# Encryption is serial (one block per iteration); decryption processes
# six blocks per iteration with dedicated tails for one to five blocks.
# NOTE(review): the conditional jumps rely on flags produced by
# compare/subtract instructions on $len; confirm against the full file.
1724 &function_begin
("${PREFIX}_cbc_encrypt");
1725 &mov
($inp,&wparam
(0));
1726 &mov
($rounds_,"esp");
1727 &mov
($out,&wparam
(1));
1729 &mov
($len,&wparam
(2));
1730 &and ($rounds_,-16);
1731 &mov
($key,&wparam
(3));
1732 &mov
($key_,&wparam
(4));
1734 &jz
(&label
("cbc_abort"));
1736 &cmp (&wparam
(5),0);
# Switch to the aligned stack; $rounds_ now holds the previous %esp.
1737 &xchg
($rounds_,"esp"); # alloca
1738 &movups
($ivec,&QWP
(0,$key_)); # load IV
1739 &mov
($rounds,&DWP
(240,$key));
1740 &mov
($key_,$key); # backup $key
1741 &mov
(&DWP
(16,"esp"),$rounds_); # save original %esp
1742 &mov
($rounds_,$rounds); # backup $rounds
1743 &je
(&label
("cbc_decrypt"));
# ---- encrypt path: $inout0 carries the chaining value ----
1745 &movaps
($inout0,$ivec);
1747 &jb
(&label
("cbc_enc_tail"));
1749 &jmp
(&label
("cbc_enc_loop"));
1751 &set_label
("cbc_enc_loop",16);
1752 &movups
($ivec,&QWP
(0,$inp)); # input actually
1753 &lea
($inp,&DWP
(16,$inp));
# Either the inlined single-block code (given $inout0 and $ivec) or an
# explicit xor followed by a call to the shared subroutine.
1755 { &aesni_inline_generate1
("enc",$inout0,$ivec); }
1757 { &xorps
($inout0,$ivec); &call
("_aesni_encrypt1"); }
1758 &mov
($rounds,$rounds_); # restore $rounds
1759 &mov
($key,$key_); # restore $key
1760 &movups
(&QWP
(0,$out),$inout0); # store output
1761 &lea
($out,&DWP
(16,$out));
1763 &jnc
(&label
("cbc_enc_loop"));
1765 &jnz
(&label
("cbc_enc_tail"));
1766 &movaps
($ivec,$inout0);
1767 &jmp
(&label
("cbc_ret"));
# ---- encrypt tail: the partial input is copied to $out with rep movsb
# and padded with zero bytes via rep stosb, then the loop is re-entered
# with $inp pointing at that block in the output buffer ----
1769 &set_label
("cbc_enc_tail");
1770 &mov
("ecx",$len); # zaps $rounds
1771 &data_word
(0xA4F3F689); # rep movsb
1772 &mov
("ecx",16); # zero tail
1774 &xor ("eax","eax"); # zaps $len
1775 &data_word
(0xAAF3F689); # rep stosb
1776 &lea
($out,&DWP
(-16,$out)); # rewind $out by 1 block
1777 &mov
($rounds,$rounds_); # restore $rounds
1778 &mov
($inp,$out); # $inp and $out are the same
1779 &mov
($key,$key_); # restore $key
1780 &jmp
(&label
("cbc_enc_loop"));
1781 ######################################################################
# ---- decrypt path ----
1782 &set_label
("cbc_decrypt",16);
1784 &jbe
(&label
("cbc_dec_tail"));
1785 &movaps
(&QWP
(0,"esp"),$ivec); # save IV
1787 &jmp
(&label
("cbc_dec_loop6_enter"));
# Six-block loop.  The sixth plaintext block of an iteration is stored
# at the top of the next one; the last ciphertext block (kept in
# $rndkey0) becomes the IV saved at 0(%esp).
1789 &set_label
("cbc_dec_loop6",16);
1790 &movaps
(&QWP
(0,"esp"),$rndkey0); # save IV
1791 &movups
(&QWP
(0,$out),$inout5);
1792 &lea
($out,&DWP
(0x10,$out));
1793 &set_label
("cbc_dec_loop6_enter");
1794 &movdqu
($inout0,&QWP
(0,$inp));
1795 &movdqu
($inout1,&QWP
(0x10,$inp));
1796 &movdqu
($inout2,&QWP
(0x20,$inp));
1797 &movdqu
($inout3,&QWP
(0x30,$inp));
1798 &movdqu
($inout4,&QWP
(0x40,$inp));
1799 &movdqu
($inout5,&QWP
(0x50,$inp));
1801 &call
("_aesni_decrypt6");
# Re-read the ciphertext blocks to xor them into the following outputs;
# $rndkey0/$rndkey1 serve as temporaries here.
1803 &movups
($rndkey1,&QWP
(0,$inp));
1804 &movups
($rndkey0,&QWP
(0x10,$inp));
1805 &xorps
($inout0,&QWP
(0,"esp")); # ^=IV
1806 &xorps
($inout1,$rndkey1);
1807 &movups
($rndkey1,&QWP
(0x20,$inp));
1808 &xorps
($inout2,$rndkey0);
1809 &movups
($rndkey0,&QWP
(0x30,$inp));
1810 &xorps
($inout3,$rndkey1);
1811 &movups
($rndkey1,&QWP
(0x40,$inp));
1812 &xorps
($inout4,$rndkey0);
1813 &movups
($rndkey0,&QWP
(0x50,$inp)); # IV
1814 &xorps
($inout5,$rndkey1);
1815 &movups
(&QWP
(0,$out),$inout0);
1816 &movups
(&QWP
(0x10,$out),$inout1);
1817 &lea
($inp,&DWP
(0x60,$inp));
1818 &movups
(&QWP
(0x20,$out),$inout2);
# The statement below is now properly terminated.  Without the
# semicolon Perl parsed it together with the next call as one
# bitwise-AND expression, which only emitted the right code by accident.
1819 &mov
($rounds,$rounds_); # restore $rounds
1820 &movups
(&QWP
(0x30,$out),$inout3);
1821 &mov
($key,$key_); # restore $key
1822 &movups
(&QWP
(0x40,$out),$inout4);
1823 &lea
($out,&DWP
(0x50,$out));
1825 &ja
(&label
("cbc_dec_loop6"));
1827 &movaps
($inout0,$inout5);
1828 &movaps
($ivec,$rndkey0);
1830 &jle
(&label
("cbc_dec_tail_collected"));
1831 &movups
(&QWP
(0,$out),$inout0);
1832 &lea
($out,&DWP
(0x10,$out));
# ---- decrypt tail: blocks are loaded one at a time, branching out as
# soon as the remaining count is covered; $in0/$in1 keep copies of the
# first two ciphertext blocks for chaining ----
1833 &set_label
("cbc_dec_tail");
1834 &movups
($inout0,&QWP
(0,$inp));
1835 &movaps
($in0,$inout0);
1837 &jbe
(&label
("cbc_dec_one"));
1839 &movups
($inout1,&QWP
(0x10,$inp));
1840 &movaps
($in1,$inout1);
1842 &jbe
(&label
("cbc_dec_two"));
1844 &movups
($inout2,&QWP
(0x20,$inp));
1846 &jbe
(&label
("cbc_dec_three"));
1848 &movups
($inout3,&QWP
(0x30,$inp));
1850 &jbe
(&label
("cbc_dec_four"));
# Five-block tail: run through the six-lane helper with a zeroed sixth
# lane.  $inout0 is reloaded from $inp before the call.
1852 &movups
($inout4,&QWP
(0x40,$inp));
1853 &movaps
(&QWP
(0,"esp"),$ivec); # save IV
1854 &movups
($inout0,&QWP
(0,$inp));
1855 &xorps
($inout5,$inout5);
1856 &call
("_aesni_decrypt6");
1857 &movups
($rndkey1,&QWP
(0,$inp));
1858 &movups
($rndkey0,&QWP
(0x10,$inp));
1859 &xorps
($inout0,&QWP
(0,"esp")); # ^= IV
1860 &xorps
($inout1,$rndkey1);
1861 &movups
($rndkey1,&QWP
(0x20,$inp));
1862 &xorps
($inout2,$rndkey0);
1863 &movups
($rndkey0,&QWP
(0x30,$inp));
1864 &xorps
($inout3,$rndkey1);
1865 &movups
($ivec,&QWP
(0x40,$inp)); # IV
1866 &xorps
($inout4,$rndkey0);
1867 &movups
(&QWP
(0,$out),$inout0);
1868 &movups
(&QWP
(0x10,$out),$inout1);
1869 &movups
(&QWP
(0x20,$out),$inout2);
1870 &movups
(&QWP
(0x30,$out),$inout3);
1871 &lea
($out,&DWP
(0x40,$out));
1872 &movaps
($inout0,$inout4);
1874 &jmp
(&label
("cbc_dec_tail_collected"));
# ---- one block ----
1876 &set_label
("cbc_dec_one",16);
1878 { &aesni_inline_generate1
("dec"); }
1880 { &call
("_aesni_decrypt1"); }
1881 &xorps
($inout0,$ivec);
1882 &movaps
($ivec,$in0);
1884 &jmp
(&label
("cbc_dec_tail_collected"));
# ---- two blocks: third lane of the three-lane helper is zeroed ----
1886 &set_label
("cbc_dec_two",16);
1887 &xorps
($inout2,$inout2);
1888 &call
("_aesni_decrypt3");
1889 &xorps
($inout0,$ivec);
1890 &xorps
($inout1,$in0);
1891 &movups
(&QWP
(0,$out),$inout0);
1892 &movaps
($inout0,$inout1);
1893 &lea
($out,&DWP
(0x10,$out));
1894 &movaps
($ivec,$in1);
1896 &jmp
(&label
("cbc_dec_tail_collected"));
# ---- three blocks ----
1898 &set_label
("cbc_dec_three",16);
1899 &call
("_aesni_decrypt3");
1900 &xorps
($inout0,$ivec);
1901 &xorps
($inout1,$in0);
1902 &xorps
($inout2,$in1);
1903 &movups
(&QWP
(0,$out),$inout0);
1904 &movaps
($inout0,$inout2);
1905 &movups
(&QWP
(0x10,$out),$inout1);
1906 &lea
($out,&DWP
(0x20,$out));
1907 &movups
($ivec,&QWP
(0x20,$inp));
1909 &jmp
(&label
("cbc_dec_tail_collected"));
# ---- four blocks: falls through into cbc_dec_tail_collected ----
1911 &set_label
("cbc_dec_four",16);
1912 &call
("_aesni_decrypt4");
1913 &movups
($rndkey1,&QWP
(0x10,$inp));
1914 &movups
($rndkey0,&QWP
(0x20,$inp));
1915 &xorps
($inout0,$ivec);
1916 &movups
($ivec,&QWP
(0x30,$inp));
1917 &xorps
($inout1,$in0);
1918 &movups
(&QWP
(0,$out),$inout0);
1919 &xorps
($inout2,$rndkey1);
1920 &movups
(&QWP
(0x10,$out),$inout1);
1921 &xorps
($inout3,$rndkey0);
1922 &movups
(&QWP
(0x20,$out),$inout2);
1923 &lea
($out,&DWP
(0x30,$out));
1924 &movaps
($inout0,$inout3);
# ---- $inout0 holds the final plaintext block: store it whole, or go
# through the stack copy when only part of it is to be written ----
1927 &set_label
("cbc_dec_tail_collected");
1929 &jnz
(&label
("cbc_dec_tail_partial"));
1930 &movups
(&QWP
(0,$out),$inout0);
1931 &jmp
(&label
("cbc_ret"));
1933 &set_label
("cbc_dec_tail_partial",16);
1934 &movaps
(&QWP
(0,"esp"),$inout0);
1938 &data_word
(0xA4F3F689); # rep movsb
# ---- common exit: restore %esp and write the IV back through ivp ----
1940 &set_label
("cbc_ret");
1941 &mov
("esp",&DWP
(16,"esp")); # pull original %esp
1942 &mov
($key_,&wparam
(4));
1943 &movups
(&QWP
(0,$key_),$ivec); # output IV
1944 &set_label
("cbc_abort");
1945 &function_end
("${PREFIX}_cbc_encrypt");
1947 ######################################################################
1948 # Mechanical port from aesni-x86_64.pl.
1950 # _aesni_set_encrypt_key is private interface,
1952 # "eax" const unsigned char *userKey
1959 &function_begin_B
("_aesni_set_encrypt_key");
1960 &test
("eax","eax");
1961 &jz
(&label
("bad_pointer"));
1963 &jz
(&label
("bad_pointer"));
1965 &movups
("xmm0",&QWP
(0,"eax")); # pull first 128 bits of *userKey
1966 &xorps
("xmm4","xmm4"); # low dword of xmm4 is assumed 0
1967 &lea
($key,&DWP
(16,$key));
1969 &je
(&label
("14rounds"));
1971 &je
(&label
("12rounds"));
1973 &jne
(&label
("bad_keybits"));
1975 &set_label
("10rounds",16);
1977 &$movekey (&QWP
(-16,$key),"xmm0"); # round 0
1978 &aeskeygenassist
("xmm1","xmm0",0x01); # round 1
1979 &call
(&label
("key_128_cold"));
1980 &aeskeygenassist
("xmm1","xmm0",0x2); # round 2
1981 &call
(&label
("key_128"));
1982 &aeskeygenassist
("xmm1","xmm0",0x04); # round 3
1983 &call
(&label
("key_128"));
1984 &aeskeygenassist
("xmm1","xmm0",0x08); # round 4
1985 &call
(&label
("key_128"));
1986 &aeskeygenassist
("xmm1","xmm0",0x10); # round 5
1987 &call
(&label
("key_128"));
1988 &aeskeygenassist
("xmm1","xmm0",0x20); # round 6
1989 &call
(&label
("key_128"));
1990 &aeskeygenassist
("xmm1","xmm0",0x40); # round 7
1991 &call
(&label
("key_128"));
1992 &aeskeygenassist
("xmm1","xmm0",0x80); # round 8
1993 &call
(&label
("key_128"));
1994 &aeskeygenassist
("xmm1","xmm0",0x1b); # round 9
1995 &call
(&label
("key_128"));
1996 &aeskeygenassist
("xmm1","xmm0",0x36); # round 10
1997 &call
(&label
("key_128"));
1998 &$movekey (&QWP
(0,$key),"xmm0");
1999 &mov
(&DWP
(80,$key),$rounds);
2003 &set_label
("key_128",16);
2004 &$movekey (&QWP
(0,$key),"xmm0");
2005 &lea
($key,&DWP
(16,$key));
2006 &set_label
("key_128_cold");
2007 &shufps
("xmm4","xmm0",0b00010000
);
2008 &xorps
("xmm0","xmm4");
2009 &shufps
("xmm4","xmm0",0b10001100
);
2010 &xorps
("xmm0","xmm4");
2011 &shufps
("xmm1","xmm1",0b11111111
); # critical path
2012 &xorps
("xmm0","xmm1");
2015 &set_label
("12rounds",16);
2016 &movq
("xmm2",&QWP
(16,"eax")); # remaining 1/3 of *userKey
2018 &$movekey (&QWP
(-16,$key),"xmm0") # round 0
2019 &aeskeygenassist
("xmm1","xmm2",0x01); # round 1,2
2020 &call
(&label
("key_192a_cold"));
2021 &aeskeygenassist
("xmm1","xmm2",0x02); # round 2,3
2022 &call
(&label
("key_192b"));
2023 &aeskeygenassist
("xmm1","xmm2",0x04); # round 4,5
2024 &call
(&label
("key_192a"));
2025 &aeskeygenassist
("xmm1","xmm2",0x08); # round 5,6
2026 &call
(&label
("key_192b"));
2027 &aeskeygenassist
("xmm1","xmm2",0x10); # round 7,8
2028 &call
(&label
("key_192a"));
2029 &aeskeygenassist
("xmm1","xmm2",0x20); # round 8,9
2030 &call
(&label
("key_192b"));
2031 &aeskeygenassist
("xmm1","xmm2",0x40); # round 10,11
2032 &call
(&label
("key_192a"));
2033 &aeskeygenassist
("xmm1","xmm2",0x80); # round 11,12
2034 &call
(&label
("key_192b"));
2035 &$movekey (&QWP
(0,$key),"xmm0");
2036 &mov
(&DWP
(48,$key),$rounds);
2040 &set_label
("key_192a",16);
2041 &$movekey (&QWP
(0,$key),"xmm0");
2042 &lea
($key,&DWP
(16,$key));
2043 &set_label
("key_192a_cold",16);
2044 &movaps
("xmm5","xmm2");
2045 &set_label
("key_192b_warm");
2046 &shufps
("xmm4","xmm0",0b00010000
);
2047 &movdqa
("xmm3","xmm2");
2048 &xorps
("xmm0","xmm4");
2049 &shufps
("xmm4","xmm0",0b10001100
);
2051 &xorps
("xmm0","xmm4");
2052 &pshufd
("xmm1","xmm1",0b01010101
); # critical path
2053 &pxor
("xmm2","xmm3");
2054 &pxor
("xmm0","xmm1");
2055 &pshufd
("xmm3","xmm0",0b11111111
);
2056 &pxor
("xmm2","xmm3");
2059 &set_label
("key_192b",16);
2060 &movaps
("xmm3","xmm0");
2061 &shufps
("xmm5","xmm0",0b01000100
);
2062 &$movekey (&QWP
(0,$key),"xmm5");
2063 &shufps
("xmm3","xmm2",0b01001110
);
2064 &$movekey (&QWP
(16,$key),"xmm3");
2065 &lea
($key,&DWP
(32,$key));
2066 &jmp
(&label
("key_192b_warm"));
2068 &set_label
("14rounds",16);
2069 &movups
("xmm2",&QWP
(16,"eax")); # remaining half of *userKey
2071 &lea
($key,&DWP
(16,$key));
2072 &$movekey (&QWP
(-32,$key),"xmm0"); # round 0
2073 &$movekey (&QWP
(-16,$key),"xmm2"); # round 1
2074 &aeskeygenassist
("xmm1","xmm2",0x01); # round 2
2075 &call
(&label
("key_256a_cold"));
2076 &aeskeygenassist
("xmm1","xmm0",0x01); # round 3
2077 &call
(&label
("key_256b"));
2078 &aeskeygenassist
("xmm1","xmm2",0x02); # round 4
2079 &call
(&label
("key_256a"));
2080 &aeskeygenassist
("xmm1","xmm0",0x02); # round 5
2081 &call
(&label
("key_256b"));
2082 &aeskeygenassist
("xmm1","xmm2",0x04); # round 6
2083 &call
(&label
("key_256a"));
2084 &aeskeygenassist
("xmm1","xmm0",0x04); # round 7
2085 &call
(&label
("key_256b"));
2086 &aeskeygenassist
("xmm1","xmm2",0x08); # round 8
2087 &call
(&label
("key_256a"));
2088 &aeskeygenassist
("xmm1","xmm0",0x08); # round 9
2089 &call
(&label
("key_256b"));
2090 &aeskeygenassist
("xmm1","xmm2",0x10); # round 10
2091 &call
(&label
("key_256a"));
2092 &aeskeygenassist
("xmm1","xmm0",0x10); # round 11
2093 &call
(&label
("key_256b"));
2094 &aeskeygenassist
("xmm1","xmm2",0x20); # round 12
2095 &call
(&label
("key_256a"));
2096 &aeskeygenassist
("xmm1","xmm0",0x20); # round 13
2097 &call
(&label
("key_256b"));
2098 &aeskeygenassist
("xmm1","xmm2",0x40); # round 14
2099 &call
(&label
("key_256a"));
2100 &$movekey (&QWP
(0,$key),"xmm0");
2101 &mov
(&DWP
(16,$key),$rounds);
2105 &set_label
("key_256a",16);
2106 &$movekey (&QWP
(0,$key),"xmm2");
2107 &lea
($key,&DWP
(16,$key));
2108 &set_label
("key_256a_cold");
2109 &shufps
("xmm4","xmm0",0b00010000
);
2110 &xorps
("xmm0","xmm4");
2111 &shufps
("xmm4","xmm0",0b10001100
);
2112 &xorps
("xmm0","xmm4");
2113 &shufps
("xmm1","xmm1",0b11111111
); # critical path
2114 &xorps
("xmm0","xmm1");
2117 &set_label
("key_256b",16);
2118 &$movekey (&QWP
(0,$key),"xmm0");
2119 &lea
($key,&DWP
(16,$key));
2121 &shufps
("xmm4","xmm2",0b00010000
);
2122 &xorps
("xmm2","xmm4");
2123 &shufps
("xmm4","xmm2",0b10001100
);
2124 &xorps
("xmm2","xmm4");
2125 &shufps
("xmm1","xmm1",0b10101010
); # critical path
2126 &xorps
("xmm2","xmm1");
2129 &set_label
("bad_pointer",4);
2132 &set_label
("bad_keybits",4);
2135 &function_end_B
("_aesni_set_encrypt_key");
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              <key schedule> *key)
#
# Public entry point for encryption key setup.  It only marshals the three
# stack arguments into the registers _aesni_set_encrypt_key works with and
# then calls it:
#	eax     <- userKey	(argument 0)
#	$rounds <- bits		(argument 1)
#	$key    <- key		(argument 2, where the schedule is written)
# The status left in eax by _aesni_set_encrypt_key is returned unchanged.
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	# The _B begin/end pair emits no epilogue, so the return has to be
	# explicit; without it execution would run on into the code that
	# follows this function.
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              <key schedule> *key)
#
# Builds the encryption key schedule with _aesni_set_encrypt_key and then
# converts it, in place, into the decryption schedule:
#   - the first and last round keys are simply exchanged;
#   - working inwards from both ends, each remaining pair is exchanged and
#     passed through aesimc;
#   - the middle round key is passed through aesimc where it stands.
# Returns 0 in eax on success.  A non-zero status from
# _aesni_set_encrypt_key is returned as is and the schedule is not touched.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));		# userKey
	&mov	($rounds,&wparam(1));		# bits
	&mov	($key,&wparam(2));		# key schedule
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));		# reload, the helper advances $key
	&shl	($rounds,4);			# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));	# non-zero status: hand it back
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	# Keep going while the descending pointer (eax) is still above the
	# ascending one ($key).  None of the instructions in the loop body
	# set flags, so the branch needs its own compare.
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();				# _B end emits no epilogue
&function_end_B("${PREFIX}_set_decrypt_key");
# Identification string passed to the generator's asciz emitter.
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");