3 # ====================================================================
4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
6 # This module may be used under the terms of either the GNU General
7 # Public License version 2 or later, the GNU Lesser General Public
8 # License version 2.1 or later, the Mozilla Public License version
9 # 1.1 or the BSD License. The exact terms of either license are
10 # distributed along with this module. For further details see
11 # http://www.openssl.org/~appro/camellia/.
12 # ====================================================================
14 # Performance in cycles per processed byte (less is better) in
15 # 'openssl speed ...' benchmark:
17 # AMD K8 Core2 PIII P4
18 # -evp camellia-128-ecb 21.5 22.8 27.0 28.9
19 # + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64%
20 # + over icc 8.0 +48/19% +21/15% +21/17% +55/37%
22 # camellia-128-cbc 17.3 21.1 23.9 25.9
24 # 128-bit key setup 196 280 256 240 cycles/key
25 # + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40%
26 # + over icc 8.0 +18/3% +10/0% +10/3% +21/10%
28 # Pairs of numbers in "+" rows represent performance improvement over
29 # compiler generated position-independent code, PIC, and non-PIC
30 # respectively. PIC results are of greater relevance, as this module
31 # is position-independent, i.e. suitable for a shared library or PIE.
32 # Position independence "costs" one register, which is why compilers
33 # are so close with non-PIC results, they have an extra register to
34 # spare. CBC results are better than ECB ones thanks to "zero-copy"
35 # private _x86_* interface, and are ~30-40% better than with compiler
36 # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
37 # same CPU (where applicable).
39 $0 =~ m/(.*[\/\\])[^\
/\\]+$/; $dir=$1;
40 push(@INC,"${dir}","${dir}../../perlasm");
45 &asm_init
($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
47 @T=("eax","ebx","ecx","edx");
52 # stack frame layout in _x86_Camellia_* routines, frame is allocated
53 # by caller
54 $__ra=&DWP
(0,"esp"); # return address
55 $__s0=&DWP
(4,"esp"); # s0 backing store
56 $__s1=&DWP
(8,"esp"); # s1 backing store
57 $__s2=&DWP
(12,"esp"); # s2 backing store
58 $__s3=&DWP
(16,"esp"); # s3 backing store
59 $__end=&DWP
(20,"esp"); # pointer to end/start of key schedule
61 # stack frame layout in Camellia_[en|de]crypt routines, which differs from
62 # above by 4 and overlaps by pointer to end/start of key schedule
66 # const unsigned int Camellia_SBOX[4][256];
67 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
68 # and [2][] - with [3][]. This is done to optimize code size.
69 $SBOX1_1110=0; # Camellia_SBOX[0]
70 $SBOX4_4404=4; # Camellia_SBOX[1]
71 $SBOX2_0222=2048; # Camellia_SBOX[2]
72 $SBOX3_3033=2052; # Camellia_SBOX[3]
73 &static_label
("Camellia_SIGMA");
74 &static_label
("Camellia_SBOX");
76 sub Camellia_Feistel
{
78 my $seed=defined(@_[1])?
@_[1]:0;
79 my $scale=$seed<0?
-8:8;
80 my $frame=defined(@_[2])?
@_[2]:0;
82 my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
84 &xor ($t0,$idx); # t0^=key[0]
85 &xor ($t1,&DWP
($seed+$i*$scale+4,$key)); # t1^=key[1]
86 &movz
($idx,&HB
($t0)); # (t0>>8)&0xff
87 &mov
($t3,&DWP
($SBOX3_3033,$Tbl,$idx,8)); # t3=SBOX3_3033[0]
88 &movz
($idx,&LB
($t0)); # (t0>>0)&0xff
89 &xor ($t3,&DWP
($SBOX4_4404,$Tbl,$idx,8)); # t3^=SBOX4_4404[0]
91 &movz
($idx,&LB
($t1)); # (t1>>0)&0xff
92 &mov
($t2,&DWP
($SBOX1_1110,$Tbl,$idx,8)); # t2=SBOX1_1110[1]
93 &movz
($idx,&HB
($t0)); # (t0>>24)&0xff
94 &xor ($t3,&DWP
($SBOX1_1110,$Tbl,$idx,8)); # t3^=SBOX1_1110[0]
95 &movz
($idx,&HB
($t1)); # (t1>>8)&0xff
96 &xor ($t2,&DWP
($SBOX4_4404,$Tbl,$idx,8)); # t2^=SBOX4_4404[1]
98 &movz
($t0,&LB
($t0)); # (t0>>16)&0xff
99 &xor ($t3,&DWP
($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
100 &movz
($idx,&HB
($t1)); # (t1>>24)&0xff
101 &mov
($t0,&DWP
($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
102 &xor ($t2,$t3); # t2^=t3
103 &rotr
($t3,8); # t3=RightRotate(t3,8)
104 &xor ($t2,&DWP
($SBOX2_0222,$Tbl,$idx,8)); # t2^=SBOX2_0222[1]
105 &movz
($idx,&LB
($t1)); # (t1>>16)&0xff
106 &mov
($t1,&DWP
($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
107 &xor ($t3,$t0); # t3^=s3
108 &xor ($t2,&DWP
($SBOX3_3033,$Tbl,$idx,8)); # t2^=SBOX3_3033[1]
109 &mov
($idx,&DWP
($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
110 &xor ($t3,$t2); # t3^=t2
111 &mov
(&DWP
($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
112 &xor ($t2,$t1); # t2^=s2
113 &mov
(&DWP
($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
116 # void Camellia_EncryptBlock_Rounds(
117 # int grandRounds,
118 # const Byte plaintext[],
119 # const KEY_TABLE_TYPE keyTable,
120 # Byte ciphertext[])
121 &function_begin
("Camellia_EncryptBlock_Rounds");
122 &mov
("eax",&wparam
(0)); # load grandRounds
123 &mov
($idx,&wparam
(1)); # load plaintext pointer
124 &mov
($key,&wparam
(2)); # load key schedule pointer
127 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
130 # place stack frame just "above mod 1024" the key schedule
131 # this ensures that cache associativity of 2 suffices
132 &lea
("ecx",&DWP
(-64-63,$key));
135 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
137 &add
("esp",4); # 4 is reserved for callee's return address
140 &lea
("eax",&DWP
(0,$key,"eax"));
141 &mov
($_esp,"ebx"); # save %esp
142 &mov
($_end,"eax"); # save keyEnd
144 &call
(&label
("pic_point"));
145 &set_label
("pic_point");
147 &lea
($Tbl,&DWP
(&label
("Camellia_SBOX")."-".&label
("pic_point"),$Tbl));
149 &mov
(@T[0],&DWP
(0,$idx)); # load plaintext
150 &mov
(@T[1],&DWP
(4,$idx));
151 &mov
(@T[2],&DWP
(8,$idx));
153 &mov
(@T[3],&DWP
(12,$idx));
158 &call
("_x86_Camellia_encrypt");
162 &mov
($idx,&wparam
(3)); # load ciphertext pointer
166 &mov
(&DWP
(0,$idx),@T[0]); # write ciphertext
167 &mov
(&DWP
(4,$idx),@T[1]);
168 &mov
(&DWP
(8,$idx),@T[2]);
169 &mov
(&DWP
(12,$idx),@T[3]);
170 &function_end
("Camellia_EncryptBlock_Rounds");
# Camellia_EncryptBlock: thin entry stub. It rewrites its first stack
# argument (keyBitLength) in place and then jumps straight into
# Camellia_EncryptBlock_Rounds, which reads that same slot as grandRounds.
# The other arguments are not touched here.
# NOTE(review): the instructions that seed eax around the sub/adc pair are
# not visible in this listing (the embedded numbering skips 173 and 175);
# confirm against the complete file before relying on the arithmetic.
172 &function_begin_B
("Camellia_EncryptBlock");
174 &sub ("eax",&wparam
(0)); # eax -= keyBitLength (first argument)
176 &adc
("eax",0); # keyBitLength==128?3:4
177 &mov
(&wparam
(0),"eax"); # overwrite the argument slot with the round count
178 &jmp
(&label
("Camellia_EncryptBlock_Rounds")); # continue in the _Rounds variant
179 &function_end_B
("Camellia_EncryptBlock");
182 # void Camellia_encrypt(
183 # const unsigned char *in,
184 # unsigned char *out,
185 # const CAMELLIA_KEY *key)
186 &function_begin
("Camellia_encrypt");
187 &mov
($idx,&wparam
(0)); # load plaintext pointer
188 &mov
($key,&wparam
(2)); # load key schedule pointer
191 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
193 &mov
("eax",&DWP
(272,$key)); # load grandRounds counter
195 # place stack frame just "above mod 1024" the key schedule
196 # this ensures that cache associativity of 2 suffices
197 &lea
("ecx",&DWP
(-64-63,$key));
200 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
202 &add
("esp",4); # 4 is reserved for callee's return address
205 &lea
("eax",&DWP
(0,$key,"eax"));
206 &mov
($_esp,"ebx"); # save %esp
207 &mov
($_end,"eax"); # save keyEnd
209 &call
(&label
("pic_point"));
210 &set_label
("pic_point");
212 &lea
($Tbl,&DWP
(&label
("Camellia_SBOX")."-".&label
("pic_point"),$Tbl));
214 &mov
(@T[0],&DWP
(0,$idx)); # load plaintext
215 &mov
(@T[1],&DWP
(4,$idx));
216 &mov
(@T[2],&DWP
(8,$idx));
218 &mov
(@T[3],&DWP
(12,$idx));
223 &call
("_x86_Camellia_encrypt");
227 &mov
($idx,&wparam
(1)); # load ciphertext pointer
231 &mov
(&DWP
(0,$idx),@T[0]); # write ciphertext
232 &mov
(&DWP
(4,$idx),@T[1]);
233 &mov
(&DWP
(8,$idx),@T[2]);
234 &mov
(&DWP
(12,$idx),@T[3]);
235 &function_end
("Camellia_encrypt");
# _x86_Camellia_encrypt: private encryption core, reached with &call from
# Camellia_EncryptBlock_Rounds, Camellia_encrypt and Camellia_cbc_encrypt.
# Callers place the 128-bit block in @T[0..3] and the key schedule pointer
# in $key before the call, and read the result back from @T[0..3].
# Shape of the emitted code:
#   - the block is XORed with key[0..3] and spilled to the $__s* slots;
#   - "loop" emits six Camellia_Feistel rounds per pass;
#   - unless the branch to "done" is taken, a key-dependent and/or/rotate
#     step (described by the s0..s3 comments) is applied and the loop repeats;
#   - "done" swaps the halves and XORs with key material again.
# NOTE(review): several instructions are missing from this listing (the
# embedded numbering skips), including whatever sets the flags tested by
# the &je below; confirm details against the complete file.
238 &function_begin_B
("_x86_Camellia_encrypt");
239 &xor (@T[0],&DWP
(0,$key)); # ^=key[0-3]
240 &xor (@T[1],&DWP
(4,$key));
241 &xor (@T[2],&DWP
(8,$key));
242 &xor (@T[3],&DWP
(12,$key));
243 &mov
($idx,&DWP
(16,$key)); # prefetch key[4]
245 &mov
($__s0,@T[0]); # save s[0-3]
250 &set_label
("loop",16);
# Camellia_Feistel arguments: round index, key byte offset base (16),
# and the bias (4) added to the esp-relative s0..s3 slot offsets.
251 for ($i=0;$i<6;$i++) { Camellia_Feistel
($i,16,4); }
255 &je
(&label
("done")); # flags come from a compare that is not in view
257 # @T[0-1] are preloaded, $idx is preloaded with key[0]
263 &or (@T[2],&DWP
(12,$key));
264 &mov
($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
267 &mov
($idx,&DWP
(4,$key));
268 &mov
($__s2,@T[2]); # s2^=s3|key[3];
270 &and (@T[2],&DWP
(8,$key));
273 &mov
($__s0,@T[0]); # s0^=s1|key[1];
275 &mov
($idx,&DWP
(16,$key)); # prefetch key[4]
276 &mov
($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
277 &jmp
(&label
("loop"));
279 &set_label
("done",8);
280 &mov
(@T[2],@T[0]); # SwapHalf
284 &xor (@T[0],$idx); # $idx is preloaded with key[0]
285 &xor (@T[1],&DWP
(4,$key));
286 &xor (@T[2],&DWP
(8,$key));
287 &xor (@T[3],&DWP
(12,$key));
289 &function_end_B
("_x86_Camellia_encrypt");
291 # void Camellia_DecryptBlock_Rounds(
292 # int grandRounds,
293 # const Byte ciphertext[],
294 # const KEY_TABLE_TYPE keyTable,
295 # Byte plaintext[])
296 &function_begin
("Camellia_DecryptBlock_Rounds");
297 &mov
("eax",&wparam
(0)); # load grandRounds
298 &mov
($idx,&wparam
(1)); # load ciphertext pointer
299 &mov
($key,&wparam
(2)); # load key schedule pointer
302 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
305 # place stack frame just "above mod 1024" the key schedule
306 # this ensures that cache associativity of 2 suffices
307 &lea
("ecx",&DWP
(-64-63,$key));
310 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
312 &add
("esp",4); # 4 is reserved for callee's return address
315 &mov
(&DWP
(4*4,"esp"),$key); # save keyStart
316 &lea
($key,&DWP
(0,$key,"eax"));
317 &mov
(&DWP
(5*4,"esp"),"ebx");# save %esp
319 &call
(&label
("pic_point"));
320 &set_label
("pic_point");
322 &lea
($Tbl,&DWP
(&label
("Camellia_SBOX")."-".&label
("pic_point"),$Tbl));
324 &mov
(@T[0],&DWP
(0,$idx)); # load ciphertext
325 &mov
(@T[1],&DWP
(4,$idx));
326 &mov
(@T[2],&DWP
(8,$idx));
328 &mov
(@T[3],&DWP
(12,$idx));
333 &call
("_x86_Camellia_decrypt");
335 &mov
("esp",&DWP
(5*4,"esp"));
337 &mov
($idx,&wparam
(3)); # load plaintext pointer
341 &mov
(&DWP
(0,$idx),@T[0]); # write plaintext
342 &mov
(&DWP
(4,$idx),@T[1]);
343 &mov
(&DWP
(8,$idx),@T[2]);
344 &mov
(&DWP
(12,$idx),@T[3]);
345 &function_end
("Camellia_DecryptBlock_Rounds");
# Camellia_DecryptBlock: thin entry stub. It rewrites its first stack
# argument (keyBitLength) in place and then jumps straight into
# Camellia_DecryptBlock_Rounds, which reads that same slot as grandRounds.
# The other arguments are not touched here.
# NOTE(review): the instructions that seed eax around the sub/adc pair are
# not visible in this listing (the embedded numbering skips 348 and 350);
# confirm against the complete file before relying on the arithmetic.
347 &function_begin_B
("Camellia_DecryptBlock");
349 &sub ("eax",&wparam
(0)); # eax -= keyBitLength (first argument)
351 &adc
("eax",0); # keyBitLength==128?3:4
352 &mov
(&wparam
(0),"eax"); # overwrite the argument slot with the round count
353 &jmp
(&label
("Camellia_DecryptBlock_Rounds")); # continue in the _Rounds variant
354 &function_end_B
("Camellia_DecryptBlock");
357 # void Camellia_decrypt(
358 # const unsigned char *in,
359 # unsigned char *out,
360 # const CAMELLIA_KEY *key)
361 &function_begin
("Camellia_decrypt");
362 &mov
($idx,&wparam
(0)); # load ciphertext pointer
363 &mov
($key,&wparam
(2)); # load key schedule pointer
366 &sub ("esp",7*4); # place for s[0-3],keyEnd,esp and ra
368 &mov
("eax",&DWP
(272,$key)); # load grandRounds counter
370 # place stack frame just "above mod 1024" the key schedule
371 # this ensures that cache associativity of 2 suffices
372 &lea
("ecx",&DWP
(-64-63,$key));
375 &and ("ecx",0x3C0); # modulo 1024, but aligned to cache-line
377 &add
("esp",4); # 4 is reserved for callee's return address
380 &mov
(&DWP
(4*4,"esp"),$key); # save keyStart
381 &lea
($key,&DWP
(0,$key,"eax"));
382 &mov
(&DWP
(5*4,"esp"),"ebx");# save %esp
384 &call
(&label
("pic_point"));
385 &set_label
("pic_point");
387 &lea
($Tbl,&DWP
(&label
("Camellia_SBOX")."-".&label
("pic_point"),$Tbl));
389 &mov
(@T[0],&DWP
(0,$idx)); # load ciphertext
390 &mov
(@T[1],&DWP
(4,$idx));
391 &mov
(@T[2],&DWP
(8,$idx));
393 &mov
(@T[3],&DWP
(12,$idx));
398 &call
("_x86_Camellia_decrypt");
400 &mov
("esp",&DWP
(5*4,"esp"));
402 &mov
($idx,&wparam
(1)); # load plaintext pointer
406 &mov
(&DWP
(0,$idx),@T[0]); # write plaintext
407 &mov
(&DWP
(4,$idx),@T[1]);
408 &mov
(&DWP
(8,$idx),@T[2]);
409 &mov
(&DWP
(12,$idx),@T[3]);
410 &function_end
("Camellia_decrypt");
# _x86_Camellia_decrypt: private decryption core, reached with &call from
# Camellia_DecryptBlock_Rounds, Camellia_decrypt and Camellia_cbc_encrypt.
# Callers place the 128-bit block in @T[0..3] and a key schedule pointer
# in $key before the call, and read the result back from @T[0..3].
# Same shape as the encrypt core, but the Feistel rounds are emitted with a
# negative key offset base (-8), so key words are addressed below $key.
# NOTE(review): the s0..s3 comments in the and/or/rotate step carry the
# key[] indices used on the encrypt path, while the operands here are at
# offsets 4, 12 and 0 (key[1], key[3], key[0]) with $idx documented as
# key[2]; the labels look copy-pasted and should be confirmed.
# NOTE(review): several instructions are missing from this listing (the
# embedded numbering skips), including whatever sets the flags tested by
# the &je below; confirm details against the complete file.
413 &function_begin_B
("_x86_Camellia_decrypt");
414 &xor (@T[0],&DWP
(0,$key)); # ^=key[0-3]
415 &xor (@T[1],&DWP
(4,$key));
416 &xor (@T[2],&DWP
(8,$key));
417 &xor (@T[3],&DWP
(12,$key));
418 &mov
($idx,&DWP
(-8,$key)); # prefetch key[-2]
420 &mov
($__s0,@T[0]); # save s[0-3]
425 &set_label
("loop",16);
# Camellia_Feistel arguments: round index, key byte offset base (-8, which
# also selects the descending key stride), and the bias (4) added to the
# esp-relative s0..s3 slot offsets.
426 for ($i=0;$i<6;$i++) { Camellia_Feistel
($i,-8,4); }
430 &je
(&label
("done")); # flags come from a compare that is not in view
432 # @T[0-1] are preloaded, $idx is preloaded with key[2]
438 &or (@T[2],&DWP
(4,$key));
439 &mov
($__s1,@T[1]); # s1^=LeftRotate(s0&key[0],1);
442 &mov
($idx,&DWP
(12,$key));
443 &mov
($__s2,@T[2]); # s2^=s3|key[3];
445 &and (@T[2],&DWP
(0,$key));
448 &mov
($__s0,@T[0]); # s0^=s1|key[1];
450 &mov
($idx,&DWP
(-8,$key)); # prefetch key[-2]
451 &mov
($__s3,@T[3]); # s3^=LeftRotate(s2&key[2],1);
452 &jmp
(&label
("loop"));
454 &set_label
("done",8);
455 &mov
(@T[2],@T[0]); # SwapHalf
459 &xor (@T[2],$idx); # $idx is preloaded with key[2]
460 &xor (@T[3],&DWP
(12,$key));
461 &xor (@T[0],&DWP
(0,$key));
462 &xor (@T[1],&DWP
(4,$key));
464 &function_end_B
("_x86_Camellia_decrypt");
466 # shld is very slow on Intel P4 family. Even on AMD it limits
467 # instruction decode rate [because it's VectorPath] and consequently
468 # performance. PIII, PM and Core[2] seem to be the only ones which
469 # execute this code ~7% faster...
471 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
476 &shld
($i0,$i1,$rot);
477 &shld
($i1,$i2,$rot);
478 &shld
($i2,$i3,$rot);
479 &shld
($i3,$idx,$rot);
481 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
482 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
483 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
484 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
487 # ... Implementing 128-bit rotate without shld gives >3x performance
488 # improvement on P4, only ~7% degradation on other Intel CPUs and
489 # not worse performance on AMD. This is therefore preferred.
491 my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
503 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
509 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
513 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
514 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
516 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
517 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
518 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
519 &mov
(&DWP
(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
524 my ($rnd,$key,@T)=@_;
525 my $bias=int(@T[0])?
shift(@T):0;
527 &mov
(&DWP
($bias+$rnd*8+0,$key),@T[0]);
528 &mov
(&DWP
($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
529 &mov
(&DWP
($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
530 &mov
(&DWP
($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
534 my ($rnd,$key,@T)=@_;
535 my $bias=int(@T[0])?
shift(@T):0;
537 &mov
(@T[0],&DWP
($bias+$rnd*8+0,$key));
538 &mov
(@T[1],&DWP
($bias+$rnd*8+4,$key)) if ($#T>=1);
539 &mov
(@T[2],&DWP
($bias+$rnd*8+8,$key)) if ($#T>=2);
540 &mov
(@T[3],&DWP
($bias+$rnd*8+12,$key)) if ($#T>=3);
543 # void Camellia_Ekeygen(
544 # const int keyBitLength,
545 # const Byte *rawKey,
546 # KEY_TABLE_TYPE keyTable)
547 &function_begin
("Camellia_Ekeygen");
550 &stack_push
(4); # place for s[0-3]
552 &mov
($Tbl,&wparam
(0)); # load arguments
553 &mov
($idx,&wparam
(1));
554 &mov
($key,&wparam
(2));
556 &mov
(@T[0],&DWP
(0,$idx)); # load 0-127 bits
557 &mov
(@T[1],&DWP
(4,$idx));
558 &mov
(@T[2],&DWP
(8,$idx));
559 &mov
(@T[3],&DWP
(12,$idx));
566 &_saveround
(0,$key,@T); # KL<<<0
569 &je
(&label
("1st128"));
571 &mov
(@T[0],&DWP
(16,$idx)); # load 128-191 bits
572 &mov
(@T[1],&DWP
(20,$idx));
574 &je
(&label
("1st192"));
575 &mov
(@T[2],&DWP
(24,$idx)); # load 192-255 bits
576 &mov
(@T[3],&DWP
(28,$idx));
577 &jmp
(&label
("1st256"));
578 &set_label
("1st192",4);
583 &set_label
("1st256",4);
589 &_saveround
(4,$key,@T); # temporary storage for KR!
591 &xor (@T[0],&DWP
(0*8+0,$key)); # KR^KL
592 &xor (@T[1],&DWP
(0*8+4,$key));
593 &xor (@T[2],&DWP
(1*8+0,$key));
594 &xor (@T[3],&DWP
(1*8+4,$key));
596 &set_label
("1st128",4);
597 &call
(&label
("pic_point"));
598 &set_label
("pic_point");
600 &lea
($Tbl,&DWP
(&label
("Camellia_SBOX")."-".&label
("pic_point"),$Tbl));
601 &lea
($key,&DWP
(&label
("Camellia_SIGMA")."-".&label
("Camellia_SBOX"),$Tbl));
603 &mov
($idx,&DWP
($step*8,$key)); # prefetch SIGMA[0]
604 &mov
(&swtmp
(0),@T[0]); # save s[0-3]
605 &mov
(&swtmp
(1),@T[1]);
606 &mov
(&swtmp
(2),@T[2]);
607 &mov
(&swtmp
(3),@T[3]);
608 &Camellia_Feistel
($step++);
609 &Camellia_Feistel
($step++);
610 &mov
(@T[2],&swtmp
(2));
611 &mov
(@T[3],&swtmp
(3));
613 &mov
($idx,&wparam
(2));
614 &xor (@T[0],&DWP
(0*8+0,$idx)); # ^KL
615 &xor (@T[1],&DWP
(0*8+4,$idx));
616 &xor (@T[2],&DWP
(1*8+0,$idx));
617 &xor (@T[3],&DWP
(1*8+4,$idx));
619 &mov
($idx,&DWP
($step*8,$key)); # prefetch SIGMA[4]
620 &mov
(&swtmp
(0),@T[0]); # save s[0-3]
621 &mov
(&swtmp
(1),@T[1]);
622 &mov
(&swtmp
(2),@T[2]);
623 &mov
(&swtmp
(3),@T[3]);
624 &Camellia_Feistel
($step++);
625 &Camellia_Feistel
($step++);
626 &mov
(@T[2],&swtmp
(2));
627 &mov
(@T[3],&swtmp
(3));
629 &mov
($idx,&wparam
(0));
631 &jne
(&label
("2nd256"));
633 &mov
($key,&wparam
(2));
634 &lea
($key,&DWP
(128,$key)); # size optimization
637 &_saveround
(2,$key,-128,@T); # KA<<<0
638 &_rotl128
(@T,15,6,@T); # KA<<<15
639 &_rotl128
(@T,15,8,@T); # KA<<<(15+15=30)
640 &_rotl128
(@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
641 &_rotl128
(@T,15,14,@T); # KA<<<(45+15=60)
642 push (@T,shift(@T)); # rotl128(@T,32);
643 &_rotl128
(@T,2,20,@T); # KA<<<(60+32+2=94)
644 &_rotl128
(@T,17,24,@T); # KA<<<(94+17=111)
647 &_loadround
(0,$key,-128,@T); # load KL
648 &_rotl128
(@T,15,4,@T); # KL<<<15
649 &_rotl128
(@T,30,10,@T); # KL<<<(15+30=45)
650 &_rotl128
(@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
651 &_rotl128
(@T,17,16,@T); # KL<<<(60+17=77)
652 &_rotl128
(@T,17,18,@T); # KL<<<(77+17=94)
653 &_rotl128
(@T,17,22,@T); # KL<<<(94+17=111)
655 while (@T[0] ne "eax") # restore order
656 { unshift (@T,pop(@T)); }
658 &mov
("eax",3); # 3 grandRounds
659 &jmp
(&label
("done"));
661 &set_label
("2nd256",16);
662 &mov
($idx,&wparam
(2));
663 &_saveround
(6,$idx,@T); # temporary storage for KA!
665 &xor (@T[0],&DWP
(4*8+0,$idx)); # KA^KR
666 &xor (@T[1],&DWP
(4*8+4,$idx));
667 &xor (@T[2],&DWP
(5*8+0,$idx));
668 &xor (@T[3],&DWP
(5*8+4,$idx));
670 &mov
($idx,&DWP
($step*8,$key)); # prefetch SIGMA[8]
671 &mov
(&swtmp
(0),@T[0]); # save s[0-3]
672 &mov
(&swtmp
(1),@T[1]);
673 &mov
(&swtmp
(2),@T[2]);
674 &mov
(&swtmp
(3),@T[3]);
675 &Camellia_Feistel
($step++);
676 &Camellia_Feistel
($step++);
677 &mov
(@T[2],&swtmp
(2));
678 &mov
(@T[3],&swtmp
(3));
680 &mov
($key,&wparam
(2));
681 &lea
($key,&DWP
(128,$key)); # size optimization
684 &_saveround
(2,$key,-128,@T); # KB<<<0
685 &_rotl128
(@T,30,10,@T); # KB<<<30
686 &_rotl128
(@T,30,20,@T); # KB<<<(30+30=60)
687 push (@T,shift(@T)); # rotl128(@T,32);
688 &_rotl128
(@T,19,32,@T); # KB<<<(60+32+19=111)
691 &_loadround
(4,$key,-128,@T); # load KR
692 &_rotl128
(@T,15,4,@T); # KR<<<15
693 &_rotl128
(@T,15,8,@T); # KR<<<(15+15=30)
694 &_rotl128
(@T,30,18,@T); # KR<<<(30+30=60)
695 push (@T,shift(@T)); # rotl128(@T,32);
696 &_rotl128
(@T,2,26,@T); # KR<<<(60+32+2=94)
699 &_loadround
(6,$key,-128,@T); # load KA
700 &_rotl128
(@T,15,6,@T); # KA<<<15
701 &_rotl128
(@T,30,14,@T); # KA<<<(15+30=45)
702 push (@T,shift(@T)); # rotl128(@T,32);
703 &_rotl128
(@T,0,24,@T); # KA<<<(45+32+0=77)
704 &_rotl128
(@T,17,28,@T); # KA<<<(77+17=94)
707 &_loadround
(0,$key,-128,@T); # load KL
708 push (@T,shift(@T)); # rotl128(@T,32);
709 &_rotl128
(@T,13,12,@T); # KL<<<(32+13=45)
710 &_rotl128
(@T,15,16,@T); # KL<<<(45+15=60)
711 &_rotl128
(@T,17,22,@T); # KL<<<(60+17=77)
712 push (@T,shift(@T)); # rotl128(@T,32);
713 &_rotl128
(@T,2,30,@T); # KL<<<(77+32+2=111)
715 while (@T[0] ne "eax") # restore order
716 { unshift (@T,pop(@T)); }
718 &mov
("eax",4); # 4 grandRounds
720 &lea
("edx",&DWP
(272-128,$key)); # end of key schedule
723 &function_end
("Camellia_Ekeygen");
726 # int private_Camellia_set_key (
727 # const unsigned char *userKey,
730 &function_begin_B
("private_Camellia_set_key");
732 &mov
("ecx",&wparam
(0)); # pull arguments
733 &mov
("ebx",&wparam
(1));
734 &mov
("edx",&wparam
(2));
738 &jz
(&label
("done")); # userKey==NULL?
740 &jz
(&label
("done")); # key==NULL?
744 &je
(&label
("arg_ok")); # bits==256?
746 &je
(&label
("arg_ok")); # bits==192?
748 &jne
(&label
("done")); # bits!=128?
749 &set_label
("arg_ok",4);
751 &push ("edx"); # push arguments
754 &call
("Camellia_Ekeygen");
757 # eax holds grandRounds and edx points at where to put it
758 &mov
(&DWP
(0,"edx"),"eax");
760 &set_label
("done",4);
763 &function_end_B
("private_Camellia_set_key");
767 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
768 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
769 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
770 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
771 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
772 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
773 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
774 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
775 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
776 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
777 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
778 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
779 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
780 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
781 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
782 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
# S1110: table-word generator. Looks up @SBOX at the given index and
# returns a 32-bit word with that byte replicated into bits 31-24, 23-16
# and 15-8; the low byte is left zero.
sub S1110
{
	my ($index) = @_;
	my $byte = $SBOX[$index];

	return ($byte << 24) | ($byte << 16) | ($byte << 8);
}
# S4404: table-word generator. Rotates the 8-bit index left by one bit,
# looks up @SBOX at the rotated index and returns a 32-bit word with that
# byte in bits 31-24, 23-16 and 7-0; bits 15-8 are left zero.
sub S4404
{
	my ($index) = @_;
	my $rotated = (($index << 1) | ($index >> 7)) & 0xff;
	my $byte = $SBOX[$rotated];

	return ($byte << 24) | ($byte << 16) | $byte;
}
# S0222: table-word generator. Looks up @SBOX at the given index, rotates
# the resulting byte left by one bit and returns a 32-bit word with that
# byte in bits 23-16, 15-8 and 7-0; the top byte is left zero.
sub S0222
{
	my ($index) = @_;
	my $byte = $SBOX[$index];
	my $rotated = (($byte << 1) | ($byte >> 7)) & 0xff;

	return ($rotated << 16) | ($rotated << 8) | $rotated;
}
# S3033: table-word generator. Looks up @SBOX at the given index, rotates
# the resulting byte right by one bit and returns a 32-bit word with that
# byte in bits 31-24, 15-8 and 7-0; bits 23-16 are left zero.
sub S3033
{
	my ($index) = @_;
	my $byte = $SBOX[$index];
	my $rotated = (($byte >> 1) | ($byte << 7)) & 0xff;

	return ($rotated << 24) | ($rotated << 8) | $rotated;
}
789 &set_label
("Camellia_SIGMA",64);
791 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
792 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
793 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
795 &set_label
("Camellia_SBOX",64);
796 # tables are interleaved, remember?
797 for ($i=0;$i<256;$i++) { &data_word
(&S1110
($i),&S4404
($i)); }
798 for ($i=0;$i<256;$i++) { &data_word
(&S0222
($i),&S3033
($i)); }
800 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
801 # size_t length, const CAMELLIA_KEY *key,
802 # unsigned char *ivp,const int enc);
805 # -4(%esp) # return address 0(%esp)
806 # 0(%esp) # s0 4(%esp)
807 # 4(%esp) # s1 8(%esp)
808 # 8(%esp) # s2 12(%esp)
809 # 12(%esp) # s3 16(%esp)
810 # 16(%esp) # end of key schedule 20(%esp)
811 # 20(%esp) # %esp backup
812 my $_inp=&DWP
(24,"esp"); #copy of wparam(0)
813 my $_out=&DWP
(28,"esp"); #copy of wparam(1)
814 my $_len=&DWP
(32,"esp"); #copy of wparam(2)
815 my $_key=&DWP
(36,"esp"); #copy of wparam(3)
816 my $_ivp=&DWP
(40,"esp"); #copy of wparam(4)
817 my $ivec=&DWP
(44,"esp"); #ivec[16]
818 my $_tmp=&DWP
(44,"esp"); #volatile variable [yes, aliases with ivec]
819 my ($s0,$s1,$s2,$s3) = @T;
821 &function_begin
("Camellia_cbc_encrypt");
822 &mov
($s2 eq "ecx"?
$s2 : "",&wparam
(2)); # load len
824 &je
(&label
("enc_out"));
829 &mov
($s0,&wparam
(0)); # load inp
830 &mov
($s1,&wparam
(1)); # load out
831 #&mov ($s2,&wparam(2)); # load len
832 &mov
($s3,&wparam
(3)); # load key
833 &mov
($Tbl,&wparam
(4)); # load ivp
835 # allocate aligned stack frame...
836 &lea
($idx,&DWP
(-64,"esp"));
839 # place stack frame just "above mod 1024" the key schedule
840 # this ensures that cache associativity of 2 suffices
841 &lea
($key,&DWP
(-64-63,$s3));
844 &and ($key,0x3C0); # modulo 1024, but aligned to cache-line
847 &mov
($key,&wparam
(5)); # load enc
850 &add
("esp",4); # reserve for return address!
851 &mov
($_esp,$idx); # save %esp
853 &mov
($_inp,$s0); # save copy of inp
854 &mov
($_out,$s1); # save copy of out
855 &mov
($_len,$s2); # save copy of len
856 &mov
($_key,$s3); # save copy of key
857 &mov
($_ivp,$Tbl); # save copy of ivp
859 &call
(&label
("pic_point")); # make it PIC!
860 &set_label
("pic_point");
862 &lea
($Tbl,&DWP
(&label
("Camellia_SBOX")."-".&label
("pic_point"),$Tbl));
865 &set_label
("prefetch_sbox",4);
866 &mov
($s0,&DWP
(0,$Tbl));
867 &mov
($s1,&DWP
(32,$Tbl));
868 &mov
($s2,&DWP
(64,$Tbl));
869 &mov
($s3,&DWP
(96,$Tbl));
870 &lea
($Tbl,&DWP
(128,$Tbl));
872 &jnz
(&label
("prefetch_sbox"));
876 &mov
($s3,&DWP
(272,$s0)); # load grandRounds
879 &je
(&label
("DECRYPT"));
884 &lea
($s3,&DWP
(0,$s0,$s3));
887 &test
($s2,0xFFFFFFF0);
888 &jz
(&label
("enc_tail")); # short input...
890 &mov
($s0,&DWP
(0,$key)); # load iv
891 &mov
($s1,&DWP
(4,$key));
893 &set_label
("enc_loop",4);
894 &mov
($s2,&DWP
(8,$key));
895 &mov
($s3,&DWP
(12,$key));
897 &xor ($s0,&DWP
(0,$idx)); # xor input data
898 &xor ($s1,&DWP
(4,$idx));
899 &xor ($s2,&DWP
(8,$idx));
901 &xor ($s3,&DWP
(12,$idx));
903 &mov
($key,$_key); # load key
907 &call
("_x86_Camellia_encrypt");
909 &mov
($idx,$_inp); # load inp
910 &mov
($key,$_out); # load out
915 &mov
(&DWP
(0,$key),$s0); # save output data
917 &mov
(&DWP
(4,$key),$s1);
918 &mov
(&DWP
(8,$key),$s2);
919 &mov
(&DWP
(12,$key),$s3);
921 &mov
($s2,$_len); # load len
923 &lea
($idx,&DWP
(16,$idx));
924 &mov
($_inp,$idx); # save inp
926 &lea
($s3,&DWP
(16,$key));
927 &mov
($_out,$s3); # save out
930 &test
($s2,0xFFFFFFF0);
931 &mov
($_len,$s2); # save len
932 &jnz
(&label
("enc_loop"));
934 &jnz
(&label
("enc_tail"));
935 &mov
($idx,$_ivp); # load ivp
936 &mov
($s2,&DWP
(8,$key)); # restore last dwords
937 &mov
($s3,&DWP
(12,$key));
938 &mov
(&DWP
(0,$idx),$s0); # save ivec
939 &mov
(&DWP
(4,$idx),$s1);
940 &mov
(&DWP
(8,$idx),$s2);
941 &mov
(&DWP
(12,$idx),$s3);
945 &set_label
("enc_out");
947 &pushf
(); # kludge, never executed
949 &set_label
("enc_tail",4);
950 &mov
($s0,$key eq "edi" ?
$key : "");
951 &mov
($key,$_out); # load out
952 &push ($s0); # push ivp
955 &cmp ($key,$idx); # compare with inp
956 &je
(&label
("enc_in_place"));
958 &data_word
(0xA4F3F689); # rep movsb # copy input
959 &jmp
(&label
("enc_skip_in_place"));
960 &set_label
("enc_in_place");
961 &lea
($key,&DWP
(0,$key,$s2));
962 &set_label
("enc_skip_in_place");
966 &data_word
(0xAAF3F689); # rep stosb # zero tail
967 &pop ($key); # pop ivp
969 &mov
($idx,$_out); # output as input
970 &mov
($s0,&DWP
(0,$key));
971 &mov
($s1,&DWP
(4,$key));
972 &mov
($_len,16); # len=16
973 &jmp
(&label
("enc_loop")); # one more spin...
975 #----------------------------- DECRYPT -----------------------------#
976 &set_label
("DECRYPT",16);
978 &lea
($s3,&DWP
(0,$s0,$s3));
983 &je
(&label
("dec_in_place")); # in-place processing...
985 &mov
($key,$_ivp); # load ivp
988 &set_label
("dec_loop",4);
989 &mov
($s0,&DWP
(0,$idx)); # read input
990 &mov
($s1,&DWP
(4,$idx));
991 &mov
($s2,&DWP
(8,$idx));
993 &mov
($s3,&DWP
(12,$idx));
995 &mov
($key,$_key); # load key
999 &call
("_x86_Camellia_decrypt");
1001 &mov
($key,$_tmp); # load ivp
1002 &mov
($idx,$_len); # load len
1007 &xor ($s0,&DWP
(0,$key)); # xor iv
1009 &xor ($s1,&DWP
(4,$key));
1010 &xor ($s2,&DWP
(8,$key));
1011 &xor ($s3,&DWP
(12,$key));
1014 &jc
(&label
("dec_partial"));
1015 &mov
($_len,$idx); # save len
1016 &mov
($idx,$_inp); # load inp
1017 &mov
($key,$_out); # load out
1019 &mov
(&DWP
(0,$key),$s0); # write output
1020 &mov
(&DWP
(4,$key),$s1);
1021 &mov
(&DWP
(8,$key),$s2);
1022 &mov
(&DWP
(12,$key),$s3);
1024 &mov
($_tmp,$idx); # save ivp
1025 &lea
($idx,&DWP
(16,$idx));
1026 &mov
($_inp,$idx); # save inp
1028 &lea
($key,&DWP
(16,$key));
1029 &mov
($_out,$key); # save out
1031 &jnz
(&label
("dec_loop"));
1032 &mov
($key,$_tmp); # load temp ivp
1033 &set_label
("dec_end");
1034 &mov
($idx,$_ivp); # load user ivp
1035 &mov
($s0,&DWP
(0,$key)); # load iv
1036 &mov
($s1,&DWP
(4,$key));
1037 &mov
($s2,&DWP
(8,$key));
1038 &mov
($s3,&DWP
(12,$key));
1039 &mov
(&DWP
(0,$idx),$s0); # copy back to user
1040 &mov
(&DWP
(4,$idx),$s1);
1041 &mov
(&DWP
(8,$idx),$s2);
1042 &mov
(&DWP
(12,$idx),$s3);
1043 &jmp
(&label
("dec_out"));
1045 &set_label
("dec_partial",4);
1047 &mov
(&DWP
(0,$key),$s0); # dump output to stack
1048 &mov
(&DWP
(4,$key),$s1);
1049 &mov
(&DWP
(8,$key),$s2);
1050 &mov
(&DWP
(12,$key),$s3);
1051 &lea
($s2 eq "ecx" ?
$s2 : "",&DWP
(16,$idx));
1052 &mov
($idx eq "esi" ?
$idx : "",$key);
1053 &mov
($key eq "edi" ?
$key : "",$_out); # load out
1054 &data_word
(0xA4F3F689); # rep movsb # copy output
1055 &mov
($key,$_inp); # use inp as temp ivp
1056 &jmp
(&label
("dec_end"));
1058 &set_label
("dec_in_place",4);
1059 &set_label
("dec_in_place_loop");
1061 &mov
($s0,&DWP
(0,$idx)); # read input
1062 &mov
($s1,&DWP
(4,$idx));
1063 &mov
($s2,&DWP
(8,$idx));
1064 &mov
($s3,&DWP
(12,$idx));
1066 &mov
(&DWP
(0,$key),$s0); # copy to temp
1067 &mov
(&DWP
(4,$key),$s1);
1068 &mov
(&DWP
(8,$key),$s2);
1070 &mov
(&DWP
(12,$key),$s3);
1072 &mov
($key,$_key); # load key
1076 &call
("_x86_Camellia_decrypt");
1078 &mov
($key,$_ivp); # load ivp
1079 &mov
($idx,$_out); # load out
1084 &xor ($s0,&DWP
(0,$key)); # xor iv
1086 &xor ($s1,&DWP
(4,$key));
1087 &xor ($s2,&DWP
(8,$key));
1088 &xor ($s3,&DWP
(12,$key));
1090 &mov
(&DWP
(0,$idx),$s0); # write output
1091 &mov
(&DWP
(4,$idx),$s1);
1092 &mov
(&DWP
(8,$idx),$s2);
1093 &mov
(&DWP
(12,$idx),$s3);
1095 &lea
($idx,&DWP
(16,$idx));
1096 &mov
($_out,$idx); # save out
1099 &mov
($s0,&DWP
(0,$idx)); # read temp
1100 &mov
($s1,&DWP
(4,$idx));
1101 &mov
($s2,&DWP
(8,$idx));
1102 &mov
($s3,&DWP
(12,$idx));
1104 &mov
(&DWP
(0,$key),$s0); # copy iv
1105 &mov
(&DWP
(4,$key),$s1);
1106 &mov
(&DWP
(8,$key),$s2);
1107 &mov
(&DWP
(12,$key),$s3);
1109 &mov
($idx,$_inp); # load inp
1111 &lea
($idx,&DWP
(16,$idx));
1112 &mov
($_inp,$idx); # save inp
1114 &mov
($s2,$_len); # load len
1116 &jc
(&label
("dec_in_place_partial"));
1117 &mov
($_len,$s2); # save len
1118 &jnz
(&label
("dec_in_place_loop"));
1119 &jmp
(&label
("dec_out"));
1121 &set_label
("dec_in_place_partial",4);
1122 # one can argue if this is actually required...
1123 &mov
($key eq "edi" ?
$key : "",$_out);
1124 &lea
($idx eq "esi" ?
$idx : "",$ivec);
1125 &lea
($key,&DWP
(0,$key,$s2));
1126 &lea
($idx,&DWP
(16,$idx,$s2));
1127 &neg
($s2 eq "ecx" ?
$s2 : "");
1128 &data_word
(0xA4F3F689); # rep movsb # restore tail
1130 &set_label
("dec_out",4);
1133 &function_end
("Camellia_cbc_encrypt");
1136 &asciz
("Camellia for x86 by <appro\@openssl.org>");