2 * Camellia Cipher Algorithm (x86_64)
4 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
/*
 * Logical file name for the assembler, followed by declarations of the
 * eight lookup tables referenced by this file.  The tables are only
 * declared here (.extern); they are defined outside this file.  The round
 * macros below index them with a byte value scaled by 8, i.e. as arrays of
 * 8-byte entries.
 */
23 .file "camellia-x86_64-asm_64.S"
26 .extern camellia_sp10011110;
27 .extern camellia_sp22000222;
28 .extern camellia_sp03303033;
29 .extern camellia_sp00444404;
30 .extern camellia_sp02220222;
31 .extern camellia_sp30333033;
32 .extern camellia_sp44044404;
33 .extern camellia_sp11101110;
/*
 * Short local aliases for the external camellia_sp* tables, so the round
 * macros can name them without the "camellia_" prefix.
 */
35 #define sp10011110 camellia_sp10011110
36 #define sp22000222 camellia_sp22000222
37 #define sp03303033 camellia_sp03303033
38 #define sp00444404 camellia_sp00444404
39 #define sp02220222 camellia_sp02220222
40 #define sp30333033 camellia_sp30333033
41 #define sp44044404 camellia_sp44044404
42 #define sp11101110 camellia_sp11101110
/*
 * Layout constants for the cipher context addressed through CTX.
 * NOTE(review): the definition of key_table (used as a displacement off CTX
 * throughout this file) is not visible in this excerpt; presumably the
 * subkey table occupies the first CAMELLIA_TABLE_BYTE_LEN bytes - confirm.
 */
44 #define CAMELLIA_TABLE_BYTE_LEN 272
46 /* struct camellia_ctx: */
/* Byte offset of the key_length field: it sits 272 bytes into the context. */
48 #define key_length CAMELLIA_TABLE_BYTE_LEN
/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Two table lookups keyed by bytes of `ab`, XORed into `dst`:
 *   tmp2 = zero-extended byte register `ab`bl
 *   tmp1 = zero-extended byte register `ab`bh
 *   dst ^= T0[tmp2]   (8-byte entries)
 *   dst ^= T1[tmp1]   (8-byte entries)
 *
 * T0/T1 : table symbols, addressed absolutely with an index scaled by 8.
 * tmp1/tmp2 : scratch registers, overwritten (their 32-bit `d` forms are
 *             written, which zero-extends into the full register).
 * ab    : register-name stem; the `bl`/`bh` suffixes are token-pasted on.
 * dst   : 64-bit accumulator.
 *
 * NOTE(review): the name implies `ab` is also rotated right by 16 bits so
 * the next invocation sees the next byte pair, but no rotate instruction is
 * visible in this excerpt - confirm against the complete file.
 */
92 #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
93 movzbl ab ## bl, tmp2 ## d; \
94 movzbl ab ## bh, tmp1 ## d; \
96 xorq T0(, tmp2, 8), dst; \
97 xorq T1(, tmp1, 8), dst;
99 /**********************************************************************
101 **********************************************************************/
/*
 * roundsm(ab, subkey, cd) - one round step for a single block.
 *
 * Loads the 8-byte subkey at key_table + subkey * 8 (relative to CTX) into
 * RT2, then performs four xor2ror16 lookups on `ab`0.  The lookups
 * alternate their accumulator: the 1st and 3rd XOR into `cd`0, the 2nd and
 * 4th XOR into RT2.  RT0 and RT1 are used as lookup scratch.
 *
 * NOTE(review): the visible body ends with a line continuation; the final
 * step (presumably folding RT2 into `cd`0) is not visible in this excerpt.
 */
102 #define roundsm(ab, subkey, cd) \
103 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
105 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
106 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
107 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
108 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
/*
 * fls(l, r, kl, kr) - key-dependent mixing of the `l`0 and `r`0 registers
 * for a single block, using the 8-byte subkey slots kl and kr
 * (key_table + k * 8, relative to CTX).
 *
 * Visible steps, in order:
 *   RT0d = low 32 bits of slot kl, ANDed with `l`0d
 *   RT1  = full 64-bit slot kr
 *   RT2  = full 64-bit slot kl
 *   RT0d = low 32 bits of slot kr, ANDed with `r`0d
 *
 * NOTE(review): the instructions that combine these temporaries back into
 * `l`0 / `r`0 are not visible in this excerpt, and the macro's last visible
 * line ends with a continuation.  Presumably this is Camellia's FL / FL^-1
 * layer - confirm against the complete file.
 */
112 #define fls(l, r, kl, kr) \
113 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
114 andl l ## 0d, RT0d; \
118 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
123 movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
127 movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
128 andl r ## 0d, RT0d; \
/*
 * enc_rounds(i) - six consecutive single-block rounds for encryption.
 *
 * Uses subkeys i + 2 through i + 7 in ascending order.  The RAB / RCD
 * arguments swap roles on every round, so each round reads the half the
 * previous round updated.
 */
133 #define enc_rounds(i) \
134 roundsm(RAB, i + 2, RCD); \
135 roundsm(RCD, i + 3, RAB); \
136 roundsm(RAB, i + 4, RCD); \
137 roundsm(RCD, i + 5, RAB); \
138 roundsm(RAB, i + 6, RCD); \
139 roundsm(RCD, i + 7, RAB);
/*
 * NOTE(review): the line below applies fls with subkeys i + 0 and i + 1.
 * It is presumably the body of an enc_fls(i) macro (the 2-way code defines
 * enc_fls2 the same way) whose #define line is missing from this excerpt.
 */
142 fls(RAB, RCD, i + 0, i + 1);
/*
 * enc_inpack() - load one input block for encryption.
 *
 * Visible steps: load the 8 bytes at offset 8 of RIO into RCD0, and XOR the
 * first 8-byte entry of key_table (relative to CTX) into RAB0.
 *
 * NOTE(review): the load of RAB0 and any byte-order handling are not
 * visible in this excerpt.
 */
144 #define enc_inpack() \
148 movq 4*2(RIO), RCD0; \
151 xorq key_table(CTX), RAB0;
/*
 * enc_outunpack(op, max) - final key XOR and store of one encrypted block.
 *
 * op  : instruction stem pasted with `q` (the callers pass `mov` to
 *       overwrite the destination or `xor` to combine with it).
 * max : register holding the index of the last key_table entry; it is
 *       scaled by 8 to address an 8-byte entry.
 *
 * XORs key_table[max] into RCD0, then writes RCD0 at offset 0 of RIO and
 * RAB0 at offset 8 - i.e. the two halves are stored in swapped order.
 *
 * NOTE(review): instructions between the visible lines (presumably
 * byte-order handling) are missing from this excerpt.
 */
153 #define enc_outunpack(op, max) \
154 xorq key_table(CTX, max, 8), RCD0; \
157 op ## q RCD0, (RIO); \
160 op ## q RAB0, 4*2(RIO);
/*
 * dec_rounds(i) - six consecutive single-block rounds for decryption.
 *
 * Same round primitive as enc_rounds, but the subkeys are consumed in
 * descending order, i + 7 down to i + 2.  RAB / RCD swap roles each round.
 */
162 #define dec_rounds(i) \
163 roundsm(RAB, i + 7, RCD); \
164 roundsm(RCD, i + 6, RAB); \
165 roundsm(RAB, i + 5, RCD); \
166 roundsm(RCD, i + 4, RAB); \
167 roundsm(RAB, i + 3, RCD); \
168 roundsm(RCD, i + 2, RAB);
/*
 * NOTE(review): the line below applies fls with the subkey pair reversed
 * (i + 1, then i + 0) relative to the encryption path.  It is presumably
 * the body of a dec_fls(i) macro (the 2-way code defines dec_fls2 the same
 * way) whose #define line is missing from this excerpt.
 */
171 fls(RAB, RCD, i + 1, i + 0);
/*
 * dec_inpack(max) - load one input block for decryption.
 *
 * max : register holding the index of the last key_table entry (scaled
 *       by 8).
 *
 * Visible steps: load the 8 bytes at offset 8 of RIO into RCD0, and XOR
 * key_table[max] into RAB0 - decryption starts from the entry encryption
 * finishes with.
 *
 * NOTE(review): the load of RAB0 and any byte-order handling are not
 * visible in this excerpt.
 */
173 #define dec_inpack(max) \
177 movq 4*2(RIO), RCD0; \
180 xorq key_table(CTX, max, 8), RAB0;
/*
 * dec_outunpack() - final key XOR and store of one decrypted block.
 *
 * Only the first step is visible: XOR the first 8-byte entry of key_table
 * (relative to CTX) into RCD0.
 *
 * NOTE(review): the macro continues past the visible line (it ends with a
 * continuation); the stores to RIO are missing from this excerpt.
 */
182 #define dec_outunpack() \
183 xorq key_table(CTX), RCD0; \
/*
 * __camellia_enc_blk - exported single-block encryption routine.
 *
 * NOTE(review): only fragments of the body are visible in this excerpt
 * (the label, argument setup, round invocations and branches are missing).
 */
191 .global __camellia_enc_blk;
192 .type __camellia_enc_blk,@function;
/* Default index of the last key_table entry; enc_outunpack scales it by 8. */
214 movl $24, RT1d; /* max */
/*
 * Compare the byte at key_length(CTX) with 16.  The branch consuming this
 * result is not visible here.
 * NOTE(review): this is a byte-wide compare, while camellia_dec_blk uses a
 * 32-bit compare (cmpl) on the same field - confirm the field's width.
 */
216 cmpb $16, key_length(CTX);
/* Larger last-entry index; presumably used for keys longer than 16 bytes - confirm. */
221 movl $32, RT1d; /* max */
/* Set flags from RXORbl; presumably selects between the two store variants below - confirm. */
224 testb RXORbl, RXORbl;
/* Store variant: the result overwrites the destination. */
229 enc_outunpack(mov, RT1);
/* XOR variant: the result is XORed into the destination. */
235 enc_outunpack(xor, RT1);
/*
 * camellia_dec_blk - exported single-block decryption routine.
 *
 * NOTE(review): only fragments of the body are visible in this excerpt.
 */
240 .global camellia_dec_blk;
241 .type camellia_dec_blk,@function;
/* 32-bit compare of the key_length field with 16. */
249 cmpl $16, key_length(CTX);
/*
 * If key_length == 16, replace RT2d (max) with the value in RXORd.
 * NOTE(review): the instructions that preload RT2d and RXORd are not
 * visible here; they must not modify flags between the compare and this
 * conditional move.
 */
252 cmovel RXORd, RT2d; /* max */
280 /**********************************************************************
282 **********************************************************************/
/*
 * roundsm2(ab, subkey, cd) - one round step applied to two blocks.
 *
 * Loads the 8-byte subkey at key_table + subkey * 8 (relative to CTX) into
 * RT2.
 *
 * Block 0 (`ab`0 / `cd`0): four xor2ror16 lookups whose accumulator
 * alternates between `cd`0 (1st, 3rd) and RT2 (2nd, 4th), exactly as in
 * roundsm.
 *
 * Block 1 (`ab`1 / `cd`1): four xor2ror16 lookups that all accumulate
 * directly into `cd`1.
 *
 * RT0 and RT1 are lookup scratch for both blocks.
 *
 * NOTE(review): the instructions that apply RT2 to `cd`1 and fold RT2 back
 * into `cd`0 are not visible in this excerpt.
 */
283 #define roundsm2(ab, subkey, cd) \
284 movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
287 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
288 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
289 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
290 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
292 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
294 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
295 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
296 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
/*
 * fls2(l, r, kl, kr) - two-block version of fls, using the 8-byte subkey
 * slots kl and kr (key_table + k * 8, relative to CTX).
 *
 * The work for block 0 (`l`0 / `r`0) and block 1 (`l`1 / `r`1) is
 * interleaved, and the scratch register rotates RT0 -> RT1 -> RT2 from one
 * step to the next.  Visible steps, in order:
 *   RT0d = low 32 bits of kl, ANDed with `l`0d
 *   RT1  = 64-bit kr
 *   RT2d = low 32 bits of kl, ANDed with `l`1d
 *   RT0  = 64-bit kr
 *   RT1  = 64-bit kl
 *   RT2d = low 32 bits of kr, ANDed with `r`0d
 *   RT0  = 64-bit kl
 *   RT1d = low 32 bits of kr, ANDed with `r`1d
 *
 * NOTE(review): the instructions that combine each temporary back into the
 * data registers are not visible in this excerpt, and the last visible line
 * ends with a continuation.
 */
298 #define fls2(l, r, kl, kr) \
299 movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
300 andl l ## 0d, RT0d; \
304 movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
309 movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
310 andl l ## 1d, RT2d; \
314 movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
319 movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
323 movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
324 andl r ## 0d, RT2d; \
329 movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
333 movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
334 andl r ## 1d, RT1d; \
/*
 * enc_rounds2(i) - six consecutive two-block rounds for encryption.
 *
 * Uses subkeys i + 2 through i + 7 in ascending order; RAB / RCD swap
 * roles on every round.
 */
339 #define enc_rounds2(i) \
340 roundsm2(RAB, i + 2, RCD); \
341 roundsm2(RCD, i + 3, RAB); \
342 roundsm2(RAB, i + 4, RCD); \
343 roundsm2(RCD, i + 5, RAB); \
344 roundsm2(RAB, i + 6, RCD); \
345 roundsm2(RCD, i + 7, RAB);
/*
 * enc_fls2(i) - apply fls2 to both blocks on the encryption path, using
 * subkey i + 0 as kl and subkey i + 1 as kr.
 */
347 #define enc_fls2(i) \
348 fls2(RAB, RCD, i + 0, i + 1);
/*
 * enc_inpack2() - load two input blocks for encryption.
 *
 * Visible steps:
 *   block 0: RCD0 = 8 bytes at RIO + 8;  RAB0 ^= first key_table entry
 *   block 1: RAB1 = 8 bytes at RIO + 16; RCD1 = 8 bytes at RIO + 24;
 *            RAB1 ^= first key_table entry
 * Both blocks are XORed with the same key_table entry.
 *
 * NOTE(review): the load of RAB0 and any byte-order handling are not
 * visible in this excerpt.
 */
350 #define enc_inpack2() \
354 movq 4*2(RIO), RCD0; \
357 xorq key_table(CTX), RAB0; \
359 movq 8*2(RIO), RAB1; \
362 movq 12*2(RIO), RCD1; \
365 xorq key_table(CTX), RAB1;
/*
 * enc_outunpack2(op, max) - final key XOR and store of two encrypted
 * blocks.
 *
 * op  : instruction stem pasted with `q` (callers pass `mov` or `xor`).
 * max : register holding the index of the last key_table entry, scaled
 *       by 8.
 *
 * For each block, key_table[max] is XORed into the RCD half, then the RCD
 * half is written first and the RAB half second:
 *   block 0: RCD0 -> RIO + 0,  RAB0 -> RIO + 8
 *   block 1: RCD1 -> RIO + 16, RAB1 -> RIO + 24
 *
 * NOTE(review): instructions between the visible lines (presumably
 * byte-order handling) are missing from this excerpt.
 */
367 #define enc_outunpack2(op, max) \
368 xorq key_table(CTX, max, 8), RCD0; \
371 op ## q RCD0, (RIO); \
374 op ## q RAB0, 4*2(RIO); \
376 xorq key_table(CTX, max, 8), RCD1; \
379 op ## q RCD1, 8*2(RIO); \
382 op ## q RAB1, 12*2(RIO);
/*
 * dec_rounds2(i) - six consecutive two-block rounds for decryption.
 *
 * Same primitive as enc_rounds2, with subkeys consumed in descending
 * order, i + 7 down to i + 2; RAB / RCD swap roles on every round.
 */
384 #define dec_rounds2(i) \
385 roundsm2(RAB, i + 7, RCD); \
386 roundsm2(RCD, i + 6, RAB); \
387 roundsm2(RAB, i + 5, RCD); \
388 roundsm2(RCD, i + 4, RAB); \
389 roundsm2(RAB, i + 3, RCD); \
390 roundsm2(RCD, i + 2, RAB);
/*
 * dec_fls2(i) - apply fls2 to both blocks on the decryption path.  The
 * subkey pair is reversed relative to enc_fls2: i + 1 is kl, i + 0 is kr.
 */
392 #define dec_fls2(i) \
393 fls2(RAB, RCD, i + 1, i + 0);
/*
 * dec_inpack2(max) - load two input blocks for decryption.
 *
 * max : register holding the index of the last key_table entry, scaled
 *       by 8.
 *
 * Visible steps:
 *   block 0: RCD0 = 8 bytes at RIO + 8;  RAB0 ^= key_table[max]
 *   block 1: RAB1 = 8 bytes at RIO + 16; RCD1 = 8 bytes at RIO + 24;
 *            RAB1 ^= key_table[max]
 *
 * NOTE(review): the load of RAB0 and any byte-order handling are not
 * visible in this excerpt.
 */
395 #define dec_inpack2(max) \
399 movq 4*2(RIO), RCD0; \
402 xorq key_table(CTX, max, 8), RAB0; \
404 movq 8*2(RIO), RAB1; \
407 movq 12*2(RIO), RCD1; \
410 xorq key_table(CTX, max, 8), RAB1;
/*
 * dec_outunpack2() - final key XOR and store of two decrypted blocks.
 *
 * For each block, the first 8-byte key_table entry is XORed into the RCD
 * half.  Visible stores (always plain moves, there is no `op` parameter):
 *   block 0: RAB0 -> RIO + 8
 *   block 1: RCD1 -> RIO + 16, RAB1 -> RIO + 24
 *
 * NOTE(review): the store of RCD0 (presumably to RIO + 0) and any
 * byte-order handling are not visible in this excerpt.
 */
412 #define dec_outunpack2() \
413 xorq key_table(CTX), RCD0; \
419 movq RAB0, 4*2(RIO); \
421 xorq key_table(CTX), RCD1; \
424 movq RCD1, 8*2(RIO); \
427 movq RAB1, 12*2(RIO);
/*
 * __camellia_enc_blk_2way - exported routine that encrypts two blocks.
 *
 * NOTE(review): only fragments of the body are visible in this excerpt
 * (argument setup, round invocations and branches are missing).
 */
429 .global __camellia_enc_blk_2way;
430 .type __camellia_enc_blk_2way,@function;
432 __camellia_enc_blk_2way:
/* Default index of the last key_table entry; enc_outunpack2 scales it by 8. */
453 movl $24, RT2d; /* max */
/*
 * Compare the byte at key_length(CTX) with 16; the branch consuming this
 * result is not visible here.
 * NOTE(review): byte-wide compare, whereas camellia_dec_blk_2way uses a
 * 32-bit compare (cmpl) on the same field - confirm the field's width.
 */
455 cmpb $16, key_length(CTX);
/* Larger last-entry index; presumably used for keys longer than 16 bytes - confirm. */
460 movl $32, RT2d; /* max */
/* Store variant: the results overwrite the destination. */
467 enc_outunpack2(mov, RT2);
/* XOR variant: the results are XORed into the destination. */
474 enc_outunpack2(xor, RT2);
/*
 * camellia_dec_blk_2way - exported routine that decrypts two blocks.
 *
 * NOTE(review): only the first fragments of the body are visible in this
 * excerpt.
 */
480 .global camellia_dec_blk_2way;
481 .type camellia_dec_blk_2way,@function;
483 camellia_dec_blk_2way:
/* 32-bit compare of the key_length field with 16. */
489 cmpl $16, key_length(CTX);
/*
 * If key_length == 16, replace RT2d (max) with the value in RXORd.
 * NOTE(review): the instructions that preload RT2d and RXORd are not
 * visible here; they must not modify flags between the compare and this
 * conditional move.
 */
492 cmovel RXORd, RT2d; /* max */