doc update
[gnutls.git] / devel / perlasm / e_padlock-x86_64.pl
blob4d71d06f02d1591ed577d028193de22a5fd1106a
1 #!/usr/bin/env perl
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # September 2011
12 # Assembler helpers for Padlock engine. See also e_padlock-x86.pl for
13 # details.
# Command line: [flavour] output. A first argument that contains a dot is
# taken to be the output file name, in which case no flavour was supplied.
15 $flavour = shift;
16 $output = shift;
17 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 argument order is selected for the nasm/masm/mingw64 flavours or when
# the output file name ends in .asm.
19 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate x86_64-xlate.pl next to this script, or under
# ../../crypto/perlasm/ relative to it; give up if neither file exists.
21 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
22 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
23 ( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
24 die "can't locate x86_64-xlate.pl";
# Everything printed to STDOUT from here on is piped through the translator.
# Fail immediately if the pipe cannot be started instead of silently
# producing no output.
26 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
# $code accumulates the assembly text that is printed at the end of the script.
28 $code=".text\n";
# Per-mode byte distances used by generate_mode when it checks whether the
# end of a buffer is too close to a page boundary.
30 %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
31 $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
# Fixed register assignments used throughout the generated code.
33 $ctx="%rdx";
34 $out="%rdi";
35 $inp="%rsi";
36 $len="%rcx";
37 $chunk="%rbx";
# Argument registers of the selected calling convention, for the routines
# that read their arguments directly.
39 ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
40 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
# padlock_capability: runs cpuid leaf 0 and compares ebx/edx/ecx with the
# byte strings 'tneC', 'Hrua' and 'slua' (each is turned into a hex immediate
# when the quoted expressions are evaluated at the end of the script; read
# little-endian they spell "CentaurHauls"). Any mismatch returns 0 in eax.
# Otherwise leaf 0xC0000000 is queried; if the value it reports in eax is
# below 0xC0000001 the result is 0, else the result is edx of leaf
# 0xC0000001 with bit 4 forced on. rbx is parked in r8 around the cpuid
# calls and restored before returning.
42 $code.=<<___;
43 .globl padlock_capability
44 .type padlock_capability,\@abi-omnipotent
45 .align 16
46 padlock_capability:
47 mov %rbx,%r8
48 xor %eax,%eax
49 cpuid
50 xor %eax,%eax
51 cmp \$`"0x".unpack("H*",'tneC')`,%ebx
52 jne .Lnoluck
53 cmp \$`"0x".unpack("H*",'Hrua')`,%edx
54 jne .Lnoluck
55 cmp \$`"0x".unpack("H*",'slua')`,%ecx
56 jne .Lnoluck
57 mov \$0xC0000000,%eax
58 cpuid
59 mov %eax,%edx
60 xor %eax,%eax
61 cmp \$0xC0000001,%edx
62 jb .Lnoluck
63 mov \$0xC0000001,%eax
64 cpuid
65 mov %edx,%eax
66 and \$0xffffffef,%eax
67 or \$0x10,%eax # set Nano bit#4
68 .Lnoluck:
69 mov %r8,%rbx
70 ret
71 .size padlock_capability,.-padlock_capability
# padlock_key_bswap: reads a 32-bit count from offset 240 of the structure
# passed as the first argument, then byte-swaps that many consecutive 32-bit
# words in place, starting at offset 0. The count is only tested after each
# swap, so a zero count is not treated as "nothing to do".
73 .globl padlock_key_bswap
74 .type padlock_key_bswap,\@abi-omnipotent,0
75 .align 16
76 padlock_key_bswap:
77 mov 240($arg1),%edx
78 .Lbswap_loop:
79 mov ($arg1),%eax
80 bswap %eax
81 mov %eax,($arg1)
82 lea 4($arg1),$arg1
83 sub \$1,%edx
84 jnz .Lbswap_loop
85 ret
86 .size padlock_key_bswap,.-padlock_key_bswap
# padlock_verify_context: public wrapper around _padlock_verify_ctx. It moves
# the first argument into the ctx register, pushes the flags so the helper
# can read them from its stack frame, passes the address of
# .Lpadlock_saved_context in rax, and drops the pushed flags slot again with
# the lea before returning.
88 .globl padlock_verify_context
89 .type padlock_verify_context,\@abi-omnipotent
90 .align 16
91 padlock_verify_context:
92 mov $arg1,$ctx
93 pushf
94 lea .Lpadlock_saved_context(%rip),%rax
95 call _padlock_verify_ctx
96 lea 8(%rsp),%rsp
97 ret
98 .size padlock_verify_context,.-padlock_verify_context
# _padlock_verify_ctx: internal helper. Inputs: the ctx pointer in the ctx
# register, the address of the saved-context slot in rax, and the flags word
# the caller pushed immediately before the call (read from 8 bytes above the
# return address). When bit 30 of those flags is set and the slot holds a
# different pointer, pushf/popf is executed; the slot is then updated to the
# current ctx in every case.
# NOTE(review): the pushf/popf pair is presumably what makes the unit reload
# key material (padlock_reload_key consists of the same pair) - confirm
# against the VIA PadLock documentation.
# NOTE(review): no ret is visible between the store and .size in this
# listing; confirm against the pristine file.
100 .type _padlock_verify_ctx,\@abi-omnipotent
101 .align 16
102 _padlock_verify_ctx:
103 mov 8(%rsp),%r8
104 bt \$30,%r8
105 jnc .Lverified
106 cmp (%rax),$ctx
107 je .Lverified
108 pushf
109 popf
110 .Lverified:
111 mov $ctx,(%rax)
113 .size _padlock_verify_ctx,.-_padlock_verify_ctx
# padlock_reload_key: executes pushf followed by popf and nothing else.
# NOTE(review): presumably the flags round-trip is what forces a key reload -
# confirm against the VIA PadLock documentation.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
115 .globl padlock_reload_key
116 .type padlock_reload_key,\@abi-omnipotent
117 .align 16
118 padlock_reload_key:
119 pushf
120 popf
122 .size padlock_reload_key,.-padlock_reload_key
# padlock_aes_block: declared with three arguments. It sets the block count
# to 1, points rbx at ctx+32 (key) and the ctx register at ctx+16 (control
# word), then issues a single "rep xcryptecb". The out and inp registers are
# not modified before the instruction. rbx is saved in r8 and restored
# afterwards.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
124 .globl padlock_aes_block
125 .type padlock_aes_block,\@function,3
126 .align 16
127 padlock_aes_block:
128 mov %rbx,%r8
129 mov \$1,$len
130 lea 32($ctx),%rbx # key
131 lea 16($ctx),$ctx # control word
132 .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
133 mov %r8,%rbx
135 .size padlock_aes_block,.-padlock_aes_block
# padlock_xstore: declared with two arguments. It copies the low 32 bits of
# the second argument (esi) into edx and issues the xstore instruction; rdi
# (first argument) is left untouched.
# NOTE(review): presumably rdi is the destination buffer and edx the xstore
# control value - confirm against the VIA PadLock documentation.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
137 .globl padlock_xstore
138 .type padlock_xstore,\@function,2
139 .align 16
140 padlock_xstore:
141 mov %esi,%edx
142 .byte 0x0f,0xa7,0xc0 # xstore
144 .size padlock_xstore,.-padlock_xstore
# padlock_sha1_oneshot: declared with three arguments. The third argument is
# moved to rcx and the first argument (context pointer) is parked in rdx.
# The 20-byte context (16 bytes via xmm0 plus 4 bytes via eax) is copied to
# a 128+8 byte scratch area on the stack, rdi is pointed at that copy, rax
# is cleared and "rep xsha1" is issued. Afterwards the 20 bytes are copied
# back to the caller's context and the scratch area is released. rsi (second
# argument) is not modified here.
# NOTE(review): the scratch copy is accessed with movaps, so it presumably
# exists to give the instruction a 16-byte aligned work area - confirm.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
146 .globl padlock_sha1_oneshot
147 .type padlock_sha1_oneshot,\@function,3
148 .align 16
149 padlock_sha1_oneshot:
150 mov %rdx,%rcx
151 mov %rdi,%rdx # put aside %rdi
152 movups (%rdi),%xmm0 # copy-in context
153 sub \$128+8,%rsp
154 mov 16(%rdi),%eax
155 movaps %xmm0,(%rsp)
156 mov %rsp,%rdi
157 mov %eax,16(%rsp)
158 xor %rax,%rax
159 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
160 movaps (%rsp),%xmm0
161 mov 16(%rsp),%eax
162 add \$128+8,%rsp
163 movups %xmm0,(%rdx) # copy-out context
164 mov %eax,16(%rdx)
166 .size padlock_sha1_oneshot,.-padlock_sha1_oneshot
# padlock_sha1_blocks: same sequence as padlock_sha1_oneshot (20-byte context
# copied to a stack scratch area, "rep xsha1" run on the copy, result copied
# back), except that rax is loaded with -1 instead of 0 before the
# instruction. The third argument is moved to rcx; rsi is not modified.
# NOTE(review): the rax value presumably selects between one-shot and
# block-wise operation - confirm against the VIA PadLock documentation.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
168 .globl padlock_sha1_blocks
169 .type padlock_sha1_blocks,\@function,3
170 .align 16
171 padlock_sha1_blocks:
172 mov %rdx,%rcx
173 mov %rdi,%rdx # put aside %rdi
174 movups (%rdi),%xmm0 # copy-in context
175 sub \$128+8,%rsp
176 mov 16(%rdi),%eax
177 movaps %xmm0,(%rsp)
178 mov %rsp,%rdi
179 mov %eax,16(%rsp)
180 mov \$-1,%rax
181 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
182 movaps (%rsp),%xmm0
183 mov 16(%rsp),%eax
184 add \$128+8,%rsp
185 movups %xmm0,(%rdx) # copy-out context
186 mov %eax,16(%rdx)
188 .size padlock_sha1_blocks,.-padlock_sha1_blocks
# padlock_sha256_oneshot: declared with three arguments. The third argument
# is moved to rcx and the context pointer is parked in rdx. The 32-byte
# context (xmm0 and xmm1) is copied to a 128+8 byte scratch area on the
# stack, rdi is pointed at the copy, rax is cleared and "rep xsha256" is
# issued; the 32 bytes are then copied back to the caller's context. rsi is
# not modified here.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
190 .globl padlock_sha256_oneshot
191 .type padlock_sha256_oneshot,\@function,3
192 .align 16
193 padlock_sha256_oneshot:
194 mov %rdx,%rcx
195 mov %rdi,%rdx # put aside %rdi
196 movups (%rdi),%xmm0 # copy-in context
197 sub \$128+8,%rsp
198 movups 16(%rdi),%xmm1
199 movaps %xmm0,(%rsp)
200 mov %rsp,%rdi
201 movaps %xmm1,16(%rsp)
202 xor %rax,%rax
203 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
204 movaps (%rsp),%xmm0
205 movaps 16(%rsp),%xmm1
206 add \$128+8,%rsp
207 movups %xmm0,(%rdx) # copy-out context
208 movups %xmm1,16(%rdx)
210 .size padlock_sha256_oneshot,.-padlock_sha256_oneshot
# padlock_sha256_blocks: same sequence as padlock_sha256_oneshot (32-byte
# context copied to a stack scratch area, "rep xsha256" run on the copy,
# result copied back), except that rax is loaded with -1 instead of 0 before
# the instruction. The third argument is moved to rcx; rsi is not modified.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
212 .globl padlock_sha256_blocks
213 .type padlock_sha256_blocks,\@function,3
214 .align 16
215 padlock_sha256_blocks:
216 mov %rdx,%rcx
217 mov %rdi,%rdx # put aside %rdi
218 movups (%rdi),%xmm0 # copy-in context
219 sub \$128+8,%rsp
220 movups 16(%rdi),%xmm1
221 movaps %xmm0,(%rsp)
222 mov %rsp,%rdi
223 movaps %xmm1,16(%rsp)
224 mov \$-1,%rax
225 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
226 movaps (%rsp),%xmm0
227 movaps 16(%rsp),%xmm1
228 add \$128+8,%rsp
229 movups %xmm0,(%rdx) # copy-out context
230 movups %xmm1,16(%rdx)
232 .size padlock_sha256_blocks,.-padlock_sha256_blocks
# padlock_sha512_blocks: declared with three arguments. The third argument
# is moved to rcx and the context pointer is parked in rdx. The 64-byte
# context (xmm0..xmm3) is copied to a 128+8 byte scratch area on the stack,
# rdi is pointed at the copy and the 0xf3,0x0f,0xa6,0xe0 instruction is
# issued; the 64 bytes are then copied back to the caller's context. Unlike
# the sha1/sha256 routines, rax is not initialised before the instruction.
# rsi is not modified here.
# NOTE(review): no ret is visible before .size in this listing; confirm
# against the pristine file.
234 .globl padlock_sha512_blocks
235 .type padlock_sha512_blocks,\@function,3
236 .align 16
237 padlock_sha512_blocks:
238 mov %rdx,%rcx
239 mov %rdi,%rdx # put aside %rdi
240 movups (%rdi),%xmm0 # copy-in context
241 sub \$128+8,%rsp
242 movups 16(%rdi),%xmm1
243 movups 32(%rdi),%xmm2
244 movups 48(%rdi),%xmm3
245 movaps %xmm0,(%rsp)
246 mov %rsp,%rdi
247 movaps %xmm1,16(%rsp)
248 movaps %xmm2,32(%rsp)
249 movaps %xmm3,48(%rsp)
250 .byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512
251 movaps (%rsp),%xmm0
252 movaps 16(%rsp),%xmm1
253 movaps 32(%rsp),%xmm2
254 movaps 48(%rsp),%xmm3
255 add \$128+8,%rsp
256 movups %xmm0,(%rdx) # copy-out context
257 movups %xmm1,16(%rdx)
258 movups %xmm2,32(%rdx)
259 movups %xmm3,48(%rdx)
261 .size padlock_sha512_blocks,.-padlock_sha512_blocks
# generate_mode(mode, opcode): appends to the code buffer the assembly for
# padlock_<mode>_encrypt(out, inp, ctx, len). "opcode" is the last byte of
# the "rep xcrypt*" encoding emitted for this mode. Extra fragments are
# emitted only for mode "ctr32" and for modes that have an entry in the
# PADLOCK_PREFETCH table; modes other than ecb/ctr also copy the IV back
# after each xcrypt. The generated routine returns 0 when ctx or len is not
# a multiple of 16 and 1 otherwise.
# NOTE(review): in this listing the heredoc terminator lines, the closing
# braces of the if/else and of the sub, and the final "ret" of the generated
# routine are not visible; confirm against the pristine file.
264 sub generate_mode {
265 my ($mode,$opcode) = @_;
266 # int padlock_$mode_encrypt(void *out, const void *inp,
267 # struct padlock_cipher_data *ctx, size_t len);
268 $code.=<<___;
269 .globl padlock_${mode}_encrypt
270 .type padlock_${mode}_encrypt,\@function,4
271 .align 16
272 padlock_${mode}_encrypt:
273 push %rbp
274 push %rbx
# Return 0 (eax stays cleared) when ctx or len is not a multiple of 16.
276 xor %eax,%eax
277 test \$15,$ctx
278 jnz .L${mode}_abort
279 test \$15,$len
280 jnz .L${mode}_abort
281 lea .Lpadlock_saved_context(%rip),%rax
282 pushf
# The flags pushed above are read by _padlock_verify_ctx and stay on the
# stack until the exit label drops them.
284 call _padlock_verify_ctx
285 lea 16($ctx),$ctx # control word
286 xor %eax,%eax
287 xor %ebx,%ebx
# Take the aligned path when bit 5 of the control word is set, or when both
# out and inp are 16-byte aligned.
288 testl \$`1<<5`,($ctx) # align bit in control word
289 jnz .L${mode}_aligned
290 test \$0x0f,$out
291 setz %al # !out_misaligned
292 test \$0x0f,$inp
293 setz %bl # !inp_misaligned
294 test %ebx,%eax
295 jnz .L${mode}_aligned
# Unaligned path: rbp remembers the stack pointer; when out is misaligned a
# bounce buffer of min(len, PADLOCK_CHUNK) bytes is reserved below it. The
# first pass handles len modulo PADLOCK_CHUNK (or a full chunk when that is 0).
296 neg %rax
297 mov \$$PADLOCK_CHUNK,$chunk
298 not %rax # out_misaligned?-1:0
299 lea (%rsp),%rbp
300 cmp $chunk,$len
301 cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
302 and $chunk,%rax # out_misaligned?chunk:0
303 mov $len,$chunk
304 neg %rax
305 and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
306 lea (%rax,%rbp),%rsp
307 mov \$$PADLOCK_CHUNK,%rax
308 cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
310 $code.=<<___ if ($mode eq "ctr32");
311 .L${mode}_reenter:
312 mov -4($ctx),%eax # pull 32-bit counter
313 bswap %eax
314 neg %eax
315 and \$`$PADLOCK_CHUNK/16-1`,%eax
316 mov \$$PADLOCK_CHUNK,$chunk
317 shl \$4,%eax
318 cmovz $chunk,%rax
319 cmp %rax,$len
320 cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
321 cmovbe $len,$chunk
323 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
324 cmp $chunk,$len
325 ja .L${mode}_loop
326 mov $inp,%rax # check if prefetch crosses page
327 cmp %rsp,%rbp
328 cmove $out,%rax
329 add $len,%rax
330 neg %rax
331 and \$0xfff,%rax # distance to page boundary
332 cmp \$$PADLOCK_PREFETCH{$mode},%rax
333 mov \$-$PADLOCK_PREFETCH{$mode},%rax
334 cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1
335 and %rax,$chunk
336 jz .L${mode}_unaligned_tail
338 $code.=<<___;
339 jmp .L${mode}_loop
340 .align 16
341 .L${mode}_loop:
342 cmp $len,$chunk # ctr32 artefact
343 cmova $len,$chunk # ctr32 artefact
344 mov $out,%r8 # save parameters
345 mov $inp,%r9
346 mov $len,%r10
347 mov $chunk,$len
348 mov $chunk,%r11
349 test \$0x0f,$out # out_misaligned
350 cmovnz %rsp,$out
351 test \$0x0f,$inp # inp_misaligned
352 jz .L${mode}_inp_aligned
353 shr \$3,$len
354 .byte 0xf3,0x48,0xa5 # rep movsq
355 sub $chunk,$out
356 mov $chunk,$len
357 mov $out,$inp
358 .L${mode}_inp_aligned:
359 lea -16($ctx),%rax # ivp
360 lea 16($ctx),%rbx # key
361 shr \$4,$len
362 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
364 $code.=<<___ if ($mode !~ /ecb|ctr/);
365 movdqa (%rax),%xmm0
366 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
368 $code.=<<___ if ($mode eq "ctr32");
369 mov -4($ctx),%eax # pull 32-bit counter
370 test \$0xffff0000,%eax
371 jnz .L${mode}_no_carry
372 bswap %eax
373 add \$0x10000,%eax
374 bswap %eax
375 mov %eax,-4($ctx)
376 .L${mode}_no_carry:
378 $code.=<<___;
379 mov %r8,$out # restore parameters
380 mov %r11,$chunk
381 test \$0x0f,$out
382 jz .L${mode}_out_aligned
383 mov $chunk,$len
384 lea (%rsp),$inp
385 shr \$3,$len
386 .byte 0xf3,0x48,0xa5 # rep movsq
387 sub $chunk,$out
388 .L${mode}_out_aligned:
389 mov %r9,$inp
390 mov %r10,$len
391 add $chunk,$out
392 add $chunk,$inp
393 sub $chunk,$len
394 mov \$$PADLOCK_CHUNK,$chunk
396 if (!$PADLOCK_PREFETCH{$mode}) {
397 $code.=<<___;
398 jnz .L${mode}_loop
400 } else {
401 $code.=<<___;
402 jz .L${mode}_break
403 cmp $chunk,$len
404 jae .L${mode}_loop
406 $code.=<<___ if ($mode eq "ctr32");
407 mov $len,$chunk
408 mov $inp,%rax # check if prefetch crosses page
409 cmp %rsp,%rbp
410 cmove $out,%rax
411 add $len,%rax
412 neg %rax
413 and \$0xfff,%rax # distance to page boundary
414 cmp \$$PADLOCK_PREFETCH{$mode},%rax
415 mov \$-$PADLOCK_PREFETCH{$mode},%rax
416 cmovae $chunk,%rax
417 and %rax,$chunk
418 jnz .L${mode}_loop
420 $code.=<<___;
421 .L${mode}_unaligned_tail:
422 xor %eax,%eax
423 cmp %rsp,%rbp
424 cmove $len,%rax
425 mov $out,%r8 # save parameters
426 mov $len,$chunk
427 sub %rax,%rsp # alloca
428 shr \$3,$len
429 lea (%rsp),$out
430 .byte 0xf3,0x48,0xa5 # rep movsq
431 mov %rsp,$inp
432 mov %r8, $out # restore parameters
433 mov $chunk,$len
434 jmp .L${mode}_loop
435 .align 16
436 .L${mode}_break:
# Wipe whatever was placed on the stack (the area between rsp and rbp)
# before the stack pointer is restored from rbp.
439 $code.=<<___;
440 cmp %rbp,%rsp
441 je .L${mode}_done
443 pxor %xmm0,%xmm0
444 lea (%rsp),%rax
445 .L${mode}_bzero:
446 movaps %xmm0,(%rax)
447 lea 16(%rax),%rax
448 cmp %rax,%rbp
449 ja .L${mode}_bzero
451 .L${mode}_done:
452 lea (%rbp),%rsp
453 jmp .L${mode}_exit
# Aligned path: xcrypt operates directly on the caller's buffers.
455 .align 16
456 .L${mode}_aligned:
458 $code.=<<___ if ($mode eq "ctr32");
459 mov -4($ctx),%eax # pull 32-bit counter
460 bswap %eax
461 neg %eax
462 and \$0xffff,%eax
463 mov \$`16*0x10000`,$chunk
464 shl \$4,%eax
465 cmovz $chunk,%rax
466 cmp %rax,$len
467 cmova %rax,$chunk # don't let counter cross 2^16
468 cmovbe $len,$chunk
469 jbe .L${mode}_aligned_skip
471 .L${mode}_aligned_loop:
472 mov $len,%r10 # save parameters
473 mov $chunk,$len
474 mov $chunk,%r11
476 lea -16($ctx),%rax # ivp
477 lea 16($ctx),%rbx # key
478 shr \$4,$len # len/=AES_BLOCK_SIZE
479 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
481 mov -4($ctx),%eax # pull 32-bit counter
482 bswap %eax
483 add \$0x10000,%eax
484 bswap %eax
485 mov %eax,-4($ctx)
487 mov %r10,$len # restore parameters
488 sub %r11,$len
489 mov \$`16*0x10000`,$chunk
490 jz .L${mode}_exit
491 cmp $chunk,$len
492 jae .L${mode}_aligned_loop
494 .L${mode}_aligned_skip:
# Prefetch handling on the aligned path: when the end of the input is closer
# to a page boundary than the prefetch distance, len masked with prefetch-1
# is split off as a remainder (kept in rbp) and later routed through the
# stack copy at the aligned_tail label.
496 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
497 lea ($inp,$len),%rbp
498 neg %rbp
499 and \$0xfff,%rbp # distance to page boundary
500 xor %eax,%eax
501 cmp \$$PADLOCK_PREFETCH{$mode},%rbp
502 mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
503 cmovae %rax,%rbp
504 and $len,%rbp # remainder
505 sub %rbp,$len
506 jz .L${mode}_aligned_tail
508 $code.=<<___;
509 lea -16($ctx),%rax # ivp
510 lea 16($ctx),%rbx # key
511 shr \$4,$len # len/=AES_BLOCK_SIZE
512 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
514 $code.=<<___ if ($mode !~ /ecb|ctr/);
515 movdqa (%rax),%xmm0
516 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
518 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
519 test %rbp,%rbp # check remainder
520 jz .L${mode}_exit
522 .L${mode}_aligned_tail:
523 mov $out,%r8
524 mov %rbp,$chunk
525 mov %rbp,$len
526 lea (%rsp),%rbp
527 sub $len,%rsp
528 shr \$3,$len
529 lea (%rsp),$out
530 .byte 0xf3,0x48,0xa5 # rep movsq
531 lea (%r8),$out
532 lea (%rsp),$inp
533 mov $chunk,$len
534 jmp .L${mode}_loop
# Common exit: return 1; the lea drops the flags word pushed before the call
# to _padlock_verify_ctx. The abort label is reached before that pushf, so
# it skips the lea and returns with eax still 0.
536 $code.=<<___;
537 .L${mode}_exit:
538 mov \$1,%eax
539 lea 8(%rsp),%rsp
540 .L${mode}_abort:
541 pop %rbx
542 pop %rbp
544 .size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
# Instantiate the cipher modes. Only ecb and cbc are generated; the other
# modes are kept for reference but disabled.
548 generate_mode("ecb",0xc8);
549 generate_mode("cbc",0xd0);
550 #&generate_mode("cfb",0xe0);
551 #&generate_mode("ofb",0xe8);
552 #&generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
# Trailer: identification string plus the data-section slot that
# _padlock_verify_ctx uses to remember the last context pointer.
554 $code.=<<___;
555 .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
556 .align 16
557 .data
558 .align 8
559 .Lpadlock_saved_context:
560 .quad 0
# Replace every backtick-quoted Perl expression in the accumulated text with
# its evaluated value, then hand the result to the translator pipe.
562 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
564 print $code;
# STDOUT is a pipe to the translator: closing it flushes the data and waits
# for the child, so a failure here means the generated file is incomplete
# and must not be ignored.
566 close STDOUT or die "error closing STDOUT: $!";