# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Assembler helpers for Padlock engine. See even e_padlock-x86.pl for details.
# Command line: [flavour] output-file.  When only one argument is given
# (it contains a dot), it is the output file and no flavour was requested.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI is selected either explicitly by flavour or implied by a
# MASM/NASM-style ".asm" output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator next to this script or in the
# crypto/perlasm directory of the source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT is piped through the translator, which
# emits the requested assembler dialect.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
# Per-mode software prefetch distances (bytes) working around Padlock
# prefetch errata.
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32);	# prefetch errata
$PADLOCK_CHUNK=512;	# Must be a power of 2 between 32 and 2^20

# First four integer argument registers for the target calling convention.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
				 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
# padlock_capability: CPUID probe for the VIA "CentaurHauls" vendor string
# (compared 4 chars at a time, hence the reversed unpack'ed constants) and
# Padlock feature bits.
# NOTE(review): intervening instructions were lost in extraction; only the
# surviving lines are emitted here.
$code.=<<___;
.globl	padlock_capability
.type	padlock_capability,\@abi-omnipotent
	cmp	\$`"0x".unpack("H*",'tneC')`,%ebx
	cmp	\$`"0x".unpack("H*",'Hrua')`,%edx
	cmp	\$`"0x".unpack("H*",'slua')`,%ecx
	or	\$0x10,%eax		# set Nano bit#4
.size	padlock_capability,.-padlock_capability
___
# padlock_key_bswap: byte-swap AES round keys in place.
# NOTE(review): the function body was lost in extraction; only the
# directives survive.
$code.=<<___;
.globl	padlock_key_bswap
.type	padlock_key_bswap,\@abi-omnipotent,0
.size	padlock_key_bswap,.-padlock_key_bswap
___
# padlock_verify_context: public wrapper that points %rax at the module's
# saved-context slot and tail-dispatches to the internal verifier.
$code.=<<___;
.globl	padlock_verify_context
.type	padlock_verify_context,\@abi-omnipotent
padlock_verify_context:
	lea	.Lpadlock_saved_context(%rip),%rax
	call	_padlock_verify_ctx
.size	padlock_verify_context,.-padlock_verify_context
___
# _padlock_verify_ctx: internal helper used by padlock_verify_context.
# NOTE(review): the body was lost in extraction; only the directives survive.
$code.=<<___;
.type	_padlock_verify_ctx,\@abi-omnipotent
.size	_padlock_verify_ctx,.-_padlock_verify_ctx
___
# padlock_reload_key: force the xcrypt unit to reload the key schedule.
# NOTE(review): the body was lost in extraction; only the directives survive.
$code.=<<___;
.globl	padlock_reload_key
.type	padlock_reload_key,\@abi-omnipotent
.size	padlock_reload_key,.-padlock_reload_key
___
# padlock_aes_block: single-block AES via the "rep xcryptecb" insn.
# $ctx layout: control word at 16($ctx), key at 32($ctx).
# NOTE(review): prologue/epilogue lost in extraction.
$code.=<<___;
.globl	padlock_aes_block
.type	padlock_aes_block,\@function,3
	lea	32($ctx),%rbx		# key
	lea	16($ctx),$ctx		# control word
	.byte	0xf3,0x0f,0xa7,0xc8	# rep xcryptecb
.size	padlock_aes_block,.-padlock_aes_block
___
# padlock_xstore: hardware RNG read via the "xstore" instruction.
$code.=<<___;
.globl	padlock_xstore
.type	padlock_xstore,\@function,2
	.byte	0x0f,0xa7,0xc0		# xstore
.size	padlock_xstore,.-padlock_xstore
___
# padlock_sha1_oneshot: whole-message SHA-1 via "rep xsha1"; context is
# copied in from (%rdi) and copied back out afterwards.
# NOTE(review): stack setup between copy-in and xsha1 lost in extraction.
$code.=<<___;
.globl	padlock_sha1_oneshot
.type	padlock_sha1_oneshot,\@function,3
padlock_sha1_oneshot:
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movups	%xmm0,(%rdx)		# copy-out context
.size	padlock_sha1_oneshot,.-padlock_sha1_oneshot
___
# padlock_sha1_blocks: multi-block SHA-1 transform via "rep xsha1".
# NOTE(review): function label and block-count handling lost in extraction.
$code.=<<___;
.globl	padlock_sha1_blocks
.type	padlock_sha1_blocks,\@function,3
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	.byte	0xf3,0x0f,0xa6,0xc8	# rep xsha1
	movups	%xmm0,(%rdx)		# copy-out context
.size	padlock_sha1_blocks,.-padlock_sha1_blocks
___
# padlock_sha256_oneshot: whole-message SHA-256 via "rep xsha256"; the
# 32-byte context travels in %xmm0/%xmm1 and is staged on the stack
# around the instruction.
# NOTE(review): stack frame setup lost in extraction.
$code.=<<___;
.globl	padlock_sha256_oneshot
.type	padlock_sha256_oneshot,\@function,3
padlock_sha256_oneshot:
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	movups	16(%rdi),%xmm1
	movaps	%xmm1,16(%rsp)
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	16(%rsp),%xmm1
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
.size	padlock_sha256_oneshot,.-padlock_sha256_oneshot
___
# padlock_sha256_blocks: multi-block SHA-256 transform via "rep xsha256";
# same context staging as the oneshot variant.
# NOTE(review): stack frame setup lost in extraction.
$code.=<<___;
.globl	padlock_sha256_blocks
.type	padlock_sha256_blocks,\@function,3
padlock_sha256_blocks:
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	movups	16(%rdi),%xmm1
	movaps	%xmm1,16(%rsp)
	.byte	0xf3,0x0f,0xa6,0xd0	# rep xsha256
	movaps	16(%rsp),%xmm1
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
.size	padlock_sha256_blocks,.-padlock_sha256_blocks
___
# padlock_sha512_blocks: multi-block SHA-512 transform via "rep xsha512";
# the 64-byte context travels in %xmm0-%xmm3 and is staged on the stack
# around the instruction.  (Fixed comment typo: "xha512" -> "xsha512".)
# NOTE(review): stack frame setup lost in extraction.
$code.=<<___;
.globl	padlock_sha512_blocks
.type	padlock_sha512_blocks,\@function,3
padlock_sha512_blocks:
	mov	%rdi,%rdx		# put aside %rdi
	movups	(%rdi),%xmm0		# copy-in context
	movups	16(%rdi),%xmm1
	movups	32(%rdi),%xmm2
	movups	48(%rdi),%xmm3
	movaps	%xmm1,16(%rsp)
	movaps	%xmm2,32(%rsp)
	movaps	%xmm3,48(%rsp)
	.byte	0xf3,0x0f,0xa6,0xe0	# rep xsha512
	movaps	16(%rsp),%xmm1
	movaps	32(%rsp),%xmm2
	movaps	48(%rsp),%xmm3
	movups	%xmm0,(%rdx)		# copy-out context
	movups	%xmm1,16(%rdx)
	movups	%xmm2,32(%rdx)
	movups	%xmm3,48(%rdx)
.size	padlock_sha512_blocks,.-padlock_sha512_blocks
___
265 my ($mode,$opcode) = @_;
266 # int padlock_$mode_encrypt(void *out, const void *inp,
267 # struct padlock_cipher_data *ctx, size_t len);
269 .globl padlock_
${mode
}_encrypt
270 .type padlock_
${mode
}_encrypt
,\
@function,4
272 padlock_
${mode
}_encrypt
:
281 lea
.Lpadlock_saved_context
(%rip),%rax
284 call _padlock_verify_ctx
285 lea
16($ctx),$ctx # control word
288 testl \
$`1<<5`,($ctx) # align bit in control word
289 jnz
.L
${mode
}_aligned
291 setz
%al # !out_misaligned
293 setz
%bl # !inp_misaligned
295 jnz
.L
${mode
}_aligned
297 mov \
$$PADLOCK_CHUNK,$chunk
298 not %rax # out_misaligned?-1:0
301 cmovc
$len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
302 and $chunk,%rax # out_misaligned?chunk:0
305 and \
$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
307 mov \
$$PADLOCK_CHUNK,%rax
308 cmovz
%rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
310 $code.=<<___
if ($mode eq "ctr32");
312 mov
-4($ctx),%eax # pull 32-bit counter
315 and \
$`$PADLOCK_CHUNK/16-1`,%eax
316 mov \
$$PADLOCK_CHUNK,$chunk
320 cmova
%rax,$chunk # don't let counter cross PADLOCK_CHUNK
323 $code.=<<___
if ($PADLOCK_PREFETCH{$mode});
326 mov
$inp,%rax # check if prefetch crosses page
331 and \
$0xfff,%rax # distance to page boundary
332 cmp \
$$PADLOCK_PREFETCH{$mode},%rax
333 mov \
$-$PADLOCK_PREFETCH{$mode},%rax
334 cmovae
$chunk,%rax # mask=distance<prefetch?-prefetch:-1
336 jz
.L
${mode
}_unaligned_tail
342 cmp $len,$chunk # ctr32 artefact
343 cmova
$len,$chunk # ctr32 artefact
344 mov
$out,%r8 # save parameters
349 test \
$0x0f,$out # out_misaligned
351 test \
$0x0f,$inp # inp_misaligned
352 jz
.L
${mode
}_inp_aligned
354 .byte
0xf3,0x48,0xa5 # rep movsq
358 .L
${mode
}_inp_aligned
:
359 lea
-16($ctx),%rax # ivp
360 lea
16($ctx),%rbx # key
362 .byte
0xf3,0x0f,0xa7,$opcode # rep xcrypt*
364 $code.=<<___
if ($mode !~ /ecb|ctr/);
366 movdqa
%xmm0,-16($ctx) # copy [or refresh] iv
368 $code.=<<___
if ($mode eq "ctr32");
369 mov
-4($ctx),%eax # pull 32-bit counter
370 test \
$0xffff0000,%eax
371 jnz
.L
${mode
}_no_carry
379 mov
%r8,$out # restore paramters
382 jz
.L
${mode
}_out_aligned
386 .byte
0xf3,0x48,0xa5 # rep movsq
388 .L
${mode
}_out_aligned
:
394 mov \
$$PADLOCK_CHUNK,$chunk
396 if (!$PADLOCK_PREFETCH{$mode}) {
406 $code.=<<___
if ($mode eq "ctr32");
408 mov
$inp,%rax # check if prefetch crosses page
413 and \
$0xfff,%rax # distance to page boundary
414 cmp \
$$PADLOCK_PREFETCH{$mode},%rax
415 mov \
$-$PADLOCK_PREFETCH{$mode},%rax
421 .L
${mode
}_unaligned_tail
:
425 mov
$out,%r8 # save parameters
427 sub %rax,%rsp # alloca
430 .byte
0xf3,0x48,0xa5 # rep movsq
432 mov
%r8, $out # restore parameters
458 $code.=<<___
if ($mode eq "ctr32");
459 mov
-4($ctx),%eax # pull 32-bit counter
463 mov \
$`16*0x10000`,$chunk
467 cmova
%rax,$chunk # don't let counter cross 2^16
469 jbe
.L
${mode
}_aligned_skip
471 .L
${mode
}_aligned_loop
:
472 mov
$len,%r10 # save parameters
476 lea
-16($ctx),%rax # ivp
477 lea
16($ctx),%rbx # key
478 shr \
$4,$len # len/=AES_BLOCK_SIZE
479 .byte
0xf3,0x0f,0xa7,$opcode # rep xcrypt*
481 mov
-4($ctx),%eax # pull 32-bit counter
487 mov
%r10,$len # restore paramters
489 mov \
$`16*0x10000`,$chunk
492 jae
.L
${mode
}_aligned_loop
494 .L
${mode
}_aligned_skip
:
496 $code.=<<___
if ($PADLOCK_PREFETCH{$mode});
499 and \
$0xfff,%rbp # distance to page boundary
501 cmp \
$$PADLOCK_PREFETCH{$mode},%rbp
502 mov \
$$PADLOCK_PREFETCH{$mode}-1,%rbp
504 and $len,%rbp # remainder
506 jz
.L
${mode
}_aligned_tail
509 lea
-16($ctx),%rax # ivp
510 lea
16($ctx),%rbx # key
511 shr \
$4,$len # len/=AES_BLOCK_SIZE
512 .byte
0xf3,0x0f,0xa7,$opcode # rep xcrypt*
514 $code.=<<___
if ($mode !~ /ecb|ctr/);
516 movdqa
%xmm0,-16($ctx) # copy [or refresh] iv
518 $code.=<<___
if ($PADLOCK_PREFETCH{$mode});
519 test
%rbp,%rbp # check remainder
522 .L
${mode
}_aligned_tail
:
530 .byte
0xf3,0x48,0xa5 # rep movsq
544 .size padlock_
${mode
}_encrypt
,.-padlock_
${mode
}_encrypt
# Instantiate the two modes this module exports; the second argument is
# the xcrypt opcode byte emitted as "rep xcrypt*" by generate_mode.
&generate_mode("ecb",0xc8);
&generate_mode("cbc",0xd0);
#&generate_mode("cfb",0xe0);
#&generate_mode("ofb",0xe8);
#&generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...
# Module signature string plus the storage slot read by
# padlock_verify_context (lea .Lpadlock_saved_context(%rip),%rax).
# NOTE(review): alignment/data directives between these lines were lost
# in extraction.
$code.=<<___;
.asciz	"VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
.Lpadlock_saved_context:
___
562 $code =~ s/\`([^\`]*)\`/eval($1)/gem;