arch/x86/crypto/twofish-i586-asm_32.S

   1 /***************************************************************************
   2 *   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
   3 *                                                                         *
   4 *   This program is free software; you can redistribute it and/or modify  *
   5 *   it under the terms of the GNU General Public License as published by  *
   6 *   the Free Software Foundation; either version 2 of the License, or     *
   7 *   (at your option) any later version.                                   *
   8 *                                                                         *
   9 *   This program is distributed in the hope that it will be useful,       *
  10 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  11 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  12 *   GNU General Public License for more details.                          *
  13 *                                                                         *
  14 *   You should have received a copy of the GNU General Public License     *
  15 *   along with this program; if not, write to the                         *
  16 *   Free Software Foundation, Inc.,                                       *
  17 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
  18 ***************************************************************************/
  19
  20 .file "twofish-i586-asm.S"
  21 .text
  22
  23 #include <asm/asm-offsets.h>
  24
  25 /* return adress at 0 */
  26
  27 #define in_blk    12  /* input byte array address parameter*/
  28 #define out_blk   8  /* output byte array address parameter*/
  29 #define tfm       4  /* Twofish context structure */
  30
  31 #define a_offset        0
  32 #define b_offset        4
  33 #define c_offset        8
  34 #define d_offset        12
  35
  36 /* Structure of the crypto context struct*/
  37
  38 #define s0      0       /* S0 Array 256 Words each */
  39 #define s1      1024    /* S1 Array */
  40 #define s2      2048    /* S2 Array */
  41 #define s3      3072    /* S3 Array */
  42 #define w       4096    /* 8 whitening keys (word) */
  43 #define k       4128    /* key 1-32 ( word ) */
  44
  45 /* define a few register aliases to allow macro substitution */
  46
  47 #define R0D    %eax
  48 #define R0B    %al
  49 #define R0H    %ah
  50
  51 #define R1D    %ebx
  52 #define R1B    %bl
  53 #define R1H    %bh
  54
  55 #define R2D    %ecx
  56 #define R2B    %cl
  57 #define R2H    %ch
  58
  59 #define R3D    %edx
  60 #define R3B    %dl
  61 #define R3H    %dh
  62
  63
  64 /* performs input whitening */
  65 #define input_whitening(src,context,offset)\
  66         xor     w+offset(context),      src;
  67
  68 /* performs input whitening */
  69 #define output_whitening(src,context,offset)\
  70         xor     w+16+offset(context),   src;
  71
  72 /*
  73  * a input register containing a (rotated 16)
  74  * b input register containing b
  75  * c input register containing c
  76  * d input register containing d (already rol $1)
  77  * operations on a and b are interleaved to increase performance
  78  */
  79 #define encrypt_round(a,b,c,d,round)\
  80         push    d ## D;\
  81         movzx   b ## B,         %edi;\
  82         mov     s1(%ebp,%edi,4),d ## D;\
  83         movzx   a ## B,         %edi;\
  84         mov     s2(%ebp,%edi,4),%esi;\
  85         movzx   b ## H,         %edi;\
  86         ror     $16,            b ## D;\
  87         xor     s2(%ebp,%edi,4),d ## D;\
  88         movzx   a ## H,         %edi;\
  89         ror     $16,            a ## D;\
  90         xor     s3(%ebp,%edi,4),%esi;\
  91         movzx   b ## B,         %edi;\
  92         xor     s3(%ebp,%edi,4),d ## D;\
  93         movzx   a ## B,         %edi;\
  94         xor     (%ebp,%edi,4),  %esi;\
  95         movzx   b ## H,         %edi;\
  96         ror     $15,            b ## D;\
  97         xor     (%ebp,%edi,4),  d ## D;\
  98         movzx   a ## H,         %edi;\
  99         xor     s1(%ebp,%edi,4),%esi;\
 100         pop     %edi;\
 101         add     d ## D,         %esi;\
 102         add     %esi,           d ## D;\
 103         add     k+round(%ebp),  %esi;\
 104         xor     %esi,           c ## D;\
 105         rol     $15,            c ## D;\
 106         add     k+4+round(%ebp),d ## D;\
 107         xor     %edi,           d ## D;
 108
 109 /*
 110  * a input register containing a (rotated 16)
 111  * b input register containing b
 112  * c input register containing c
 113  * d input register containing d (already rol $1)
 114  * operations on a and b are interleaved to increase performance
 115  * last round has different rotations for the output preparation
 116  */
 117 #define encrypt_last_round(a,b,c,d,round)\
 118         push    d ## D;\
 119         movzx   b ## B,         %edi;\
 120         mov     s1(%ebp,%edi,4),d ## D;\
 121         movzx   a ## B,         %edi;\
 122         mov     s2(%ebp,%edi,4),%esi;\
 123         movzx   b ## H,         %edi;\
 124         ror     $16,            b ## D;\
 125         xor     s2(%ebp,%edi,4),d ## D;\
 126         movzx   a ## H,         %edi;\
 127         ror     $16,            a ## D;\
 128         xor     s3(%ebp,%edi,4),%esi;\
 129         movzx   b ## B,         %edi;\
 130         xor     s3(%ebp,%edi,4),d ## D;\
 131         movzx   a ## B,         %edi;\
 132         xor     (%ebp,%edi,4),  %esi;\
 133         movzx   b ## H,         %edi;\
 134         ror     $16,            b ## D;\
 135         xor     (%ebp,%edi,4),  d ## D;\
 136         movzx   a ## H,         %edi;\
 137         xor     s1(%ebp,%edi,4),%esi;\
 138         pop     %edi;\
 139         add     d ## D,         %esi;\
 140         add     %esi,           d ## D;\
 141         add     k+round(%ebp),  %esi;\
 142         xor     %esi,           c ## D;\
 143         ror     $1,             c ## D;\
 144         add     k+4+round(%ebp),d ## D;\
 145         xor     %edi,           d ## D;
 146
 147 /*
 148  * a input register containing a
 149  * b input register containing b (rotated 16)
 150  * c input register containing c
 151  * d input register containing d (already rol $1)
 152  * operations on a and b are interleaved to increase performance
 153  */
 154 #define decrypt_round(a,b,c,d,round)\
 155         push    c ## D;\
 156         movzx   a ## B,         %edi;\
 157         mov     (%ebp,%edi,4),  c ## D;\
 158         movzx   b ## B,         %edi;\
 159         mov     s3(%ebp,%edi,4),%esi;\
 160         movzx   a ## H,         %edi;\
 161         ror     $16,            a ## D;\
 162         xor     s1(%ebp,%edi,4),c ## D;\
 163         movzx   b ## H,         %edi;\
 164         ror     $16,            b ## D;\
 165         xor     (%ebp,%edi,4),  %esi;\
 166         movzx   a ## B,         %edi;\
 167         xor     s2(%ebp,%edi,4),c ## D;\
 168         movzx   b ## B,         %edi;\
 169         xor     s1(%ebp,%edi,4),%esi;\
 170         movzx   a ## H,         %edi;\
 171         ror     $15,            a ## D;\
 172         xor     s3(%ebp,%edi,4),c ## D;\
 173         movzx   b ## H,         %edi;\
 174         xor     s2(%ebp,%edi,4),%esi;\
 175         pop     %edi;\
 176         add     %esi,           c ## D;\
 177         add     c ## D,         %esi;\
 178         add     k+round(%ebp),  c ## D;\
 179         xor     %edi,           c ## D;\
 180         add     k+4+round(%ebp),%esi;\
 181         xor     %esi,           d ## D;\
 182         rol     $15,            d ## D;
 183
 184 /*
 185  * a input register containing a
 186  * b input register containing b (rotated 16)
 187  * c input register containing c
 188  * d input register containing d (already rol $1)
 189  * operations on a and b are interleaved to increase performance
 190  * last round has different rotations for the output preparation
 191  */
 192 #define decrypt_last_round(a,b,c,d,round)\
 193         push    c ## D;\
 194         movzx   a ## B,         %edi;\
 195         mov     (%ebp,%edi,4),  c ## D;\
 196         movzx   b ## B,         %edi;\
 197         mov     s3(%ebp,%edi,4),%esi;\
 198         movzx   a ## H,         %edi;\
 199         ror     $16,            a ## D;\
 200         xor     s1(%ebp,%edi,4),c ## D;\
 201         movzx   b ## H,         %edi;\
 202         ror     $16,            b ## D;\
 203         xor     (%ebp,%edi,4),  %esi;\
 204         movzx   a ## B,         %edi;\
 205         xor     s2(%ebp,%edi,4),c ## D;\
 206         movzx   b ## B,         %edi;\
 207         xor     s1(%ebp,%edi,4),%esi;\
 208         movzx   a ## H,         %edi;\
 209         ror     $16,            a ## D;\
 210         xor     s3(%ebp,%edi,4),c ## D;\
 211         movzx   b ## H,         %edi;\
 212         xor     s2(%ebp,%edi,4),%esi;\
 213         pop     %edi;\
 214         add     %esi,           c ## D;\
 215         add     c ## D,         %esi;\
 216         add     k+round(%ebp),  c ## D;\
 217         xor     %edi,           c ## D;\
 218         add     k+4+round(%ebp),%esi;\
 219         xor     %esi,           d ## D;\
 220         ror     $1,             d ## D;
 221
 222 .align 4
 223 .global twofish_enc_blk
 224 .global twofish_dec_blk
 225
 226 twofish_enc_blk:
 227         push    %ebp                    /* save registers according to calling convention*/
 228         push    %ebx
 229         push    %esi
 230         push    %edi
 231
 232         mov     tfm + 16(%esp), %ebp    /* abuse the base pointer: set new base bointer to the crypto tfm */
 233         add     $crypto_tfm_ctx_offset, %ebp    /* ctx adress */
 234         mov     in_blk+16(%esp),%edi    /* input adress in edi */
 235
 236         mov     (%edi),         %eax
 237         mov     b_offset(%edi), %ebx
 238         mov     c_offset(%edi), %ecx
 239         mov     d_offset(%edi), %edx
 240         input_whitening(%eax,%ebp,a_offset)
 241         ror     $16,    %eax
 242         input_whitening(%ebx,%ebp,b_offset)
 243         input_whitening(%ecx,%ebp,c_offset)
 244         input_whitening(%edx,%ebp,d_offset)
 245         rol     $1,     %edx
 246
 247         encrypt_round(R0,R1,R2,R3,0);
 248         encrypt_round(R2,R3,R0,R1,8);
 249         encrypt_round(R0,R1,R2,R3,2*8);
 250         encrypt_round(R2,R3,R0,R1,3*8);
 251         encrypt_round(R0,R1,R2,R3,4*8);
 252         encrypt_round(R2,R3,R0,R1,5*8);
 253         encrypt_round(R0,R1,R2,R3,6*8);
 254         encrypt_round(R2,R3,R0,R1,7*8);
 255         encrypt_round(R0,R1,R2,R3,8*8);
 256         encrypt_round(R2,R3,R0,R1,9*8);
 257         encrypt_round(R0,R1,R2,R3,10*8);
 258         encrypt_round(R2,R3,R0,R1,11*8);
 259         encrypt_round(R0,R1,R2,R3,12*8);
 260         encrypt_round(R2,R3,R0,R1,13*8);
 261         encrypt_round(R0,R1,R2,R3,14*8);
 262         encrypt_last_round(R2,R3,R0,R1,15*8);
 263
 264         output_whitening(%eax,%ebp,c_offset)
 265         output_whitening(%ebx,%ebp,d_offset)
 266         output_whitening(%ecx,%ebp,a_offset)
 267         output_whitening(%edx,%ebp,b_offset)
 268         mov     out_blk+16(%esp),%edi;
 269         mov     %eax,           c_offset(%edi)
 270         mov     %ebx,           d_offset(%edi)
 271         mov     %ecx,           (%edi)
 272         mov     %edx,           b_offset(%edi)
 273
 274         pop     %edi
 275         pop     %esi
 276         pop     %ebx
 277         pop     %ebp
 278         mov     $1,     %eax
 279         ret
 280
 281 twofish_dec_blk:
 282         push    %ebp                    /* save registers according to calling convention*/
 283         push    %ebx
 284         push    %esi
 285         push    %edi
 286
 287
 288         mov     tfm + 16(%esp), %ebp    /* abuse the base pointer: set new base bointer to the crypto tfm */
 289         add     $crypto_tfm_ctx_offset, %ebp    /* ctx adress */
 290         mov     in_blk+16(%esp),%edi    /* input adress in edi */
 291
 292         mov     (%edi),         %eax
 293         mov     b_offset(%edi), %ebx
 294         mov     c_offset(%edi), %ecx
 295         mov     d_offset(%edi), %edx
 296         output_whitening(%eax,%ebp,a_offset)
 297         output_whitening(%ebx,%ebp,b_offset)
 298         ror     $16,    %ebx
 299         output_whitening(%ecx,%ebp,c_offset)
 300         output_whitening(%edx,%ebp,d_offset)
 301         rol     $1,     %ecx
 302
 303         decrypt_round(R0,R1,R2,R3,15*8);
 304         decrypt_round(R2,R3,R0,R1,14*8);
 305         decrypt_round(R0,R1,R2,R3,13*8);
 306         decrypt_round(R2,R3,R0,R1,12*8);
 307         decrypt_round(R0,R1,R2,R3,11*8);
 308         decrypt_round(R2,R3,R0,R1,10*8);
 309         decrypt_round(R0,R1,R2,R3,9*8);
 310         decrypt_round(R2,R3,R0,R1,8*8);
 311         decrypt_round(R0,R1,R2,R3,7*8);
 312         decrypt_round(R2,R3,R0,R1,6*8);
 313         decrypt_round(R0,R1,R2,R3,5*8);
 314         decrypt_round(R2,R3,R0,R1,4*8);
 315         decrypt_round(R0,R1,R2,R3,3*8);
 316         decrypt_round(R2,R3,R0,R1,2*8);
 317         decrypt_round(R0,R1,R2,R3,1*8);
 318         decrypt_last_round(R2,R3,R0,R1,0);
 319
 320         input_whitening(%eax,%ebp,c_offset)
 321         input_whitening(%ebx,%ebp,d_offset)
 322         input_whitening(%ecx,%ebp,a_offset)
 323         input_whitening(%edx,%ebp,b_offset)
 324         mov     out_blk+16(%esp),%edi;
 325         mov     %eax,           c_offset(%edi)
 326         mov     %ebx,           d_offset(%edi)
 327         mov     %ecx,           (%edi)
 328         mov     %edx,           b_offset(%edi)
 329
 330         pop     %edi
 331         pop     %esi
 332         pop     %ebx
 333         pop     %ebp
 334         mov     $1,     %eax
 335         ret