arch/x86/crypto/twofish-x86_64-asm_64.S

   1 /***************************************************************************
   2 *   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
   3 *                                                                         *
   4 *   This program is free software; you can redistribute it and/or modify  *
   5 *   it under the terms of the GNU General Public License as published by  *
   6 *   the Free Software Foundation; either version 2 of the License, or     *
   7 *   (at your option) any later version.                                   *
   8 *                                                                         *
   9 *   This program is distributed in the hope that it will be useful,       *
  10 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  11 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
  12 *   GNU General Public License for more details.                          *
  13 *                                                                         *
  14 *   You should have received a copy of the GNU General Public License     *
  15 *   along with this program; if not, write to the                         *
  16 *   Free Software Foundation, Inc.,                                       *
  17 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
  18 ***************************************************************************/
  19
  20 .file "twofish-x86_64-asm.S"
  21 .text
  22
  23 #include <asm/asm-offsets.h>
  24
  25 #define a_offset        0       /* byte offset of 32-bit word a within a 16-byte block */
  26 #define b_offset        4       /* byte offset of word b */
  27 #define c_offset        8       /* byte offset of word c */
  28 #define d_offset        12      /* byte offset of word d */
  29
  30 /*
      * Structure of the crypto context struct: byte offsets, relative to the
      * context base address, of the tables read by the macros below.
      * NOTE(review): these values must agree with the context structure that
      * is defined outside this file -- confirm whenever either side changes.
      */
  31
  32 #define s0      0       /* S0 array: 256 32-bit words (1024 bytes), as are S1-S3 */
  33 #define s1      1024    /* S1 Array */
  34 #define s2      2048    /* S2 Array */
  35 #define s3      3072    /* S3 Array */
  36 #define w       4096    /* 8 whitening keys (32-bit words): input at w, output at w+16 */
  37 #define k       4128    /* round keys 1-32 (32-bit words), two consecutive words per round */
  38
  39 /*
      * define a few register aliases to allow macro substitution
      *
      * The round macros paste a suffix onto the alias they are given
      * (a ## D, a ## B, a ## H) to select the 32-bit, low-byte and high-byte
      * view of the same register.  Only rax, rbx, rcx and rdx are aliased:
      * they are the registers that have a high-byte form, which the H suffix
      * requires.
      */
  40
  41 #define R0     %rax
  42 #define R0D    %eax
  43 #define R0B    %al
  44 #define R0H    %ah
  45
  46 #define R1     %rbx
  47 #define R1D    %ebx
  48 #define R1B    %bl
  49 #define R1H    %bh
  50
  51 #define R2     %rcx
  52 #define R2D    %ecx
  53 #define R2B    %cl
  54 #define R2H    %ch
  55
  56 #define R3     %rdx
  57 #define R3D    %edx
  58 #define R3B    %dl
  59 #define R3H    %dh
  60
  61
  62 /*
      * performs input whitening
      *
      * XORs src with the whitening key material stored at byte offset
      * w+offset of the context.  The invocations in this file pass a 64-bit
      * register as src, so a single use covers two adjacent 32-bit key words.
      * twofish_enc_blk applies this to the freshly loaded input;
      * twofish_dec_blk applies it to the result just before storing it.
      */
  63 #define input_whitening(src,context,offset)\
  64         xor     w+offset(context),      src;
  65
  66 /*
      * performs output whitening
      *
      * Same operation as input_whitening, but using the second group of
      * whitening keys, which starts 16 bytes after w.  twofish_enc_blk applies
      * this to the result just before storing it; twofish_dec_blk applies it
      * to the freshly loaded input.
      */
  67 #define output_whitening(src,context,offset)\
  68         xor     w+16+offset(context),   src;
  69
  70
  71 /*
      * One encryption round; used for every round except the last.
      *
  72  * a input register containing a (rotated 16)
  73  * b input register containing b
  74  * c input register containing c
  75  * d input register containing d (already rol $1)
      * round byte offset, relative to k, of the two round-key words to use
      *
  76  * operations on a and b are interleaved to increase performance
      *
      * %r11 must hold the context base address.  The four bytes of a and the
      * four bytes of b each index one of the S-box tables; the words looked
      * up for a are XORed together in %r9d, those for b in %r8d.  The two
      * results are then mixed (%r9d = ra + rb, %r8d = ra + 2 * rb), the key
      * words at k+round and k+4+round are added, and %r9d is XORed into c,
      * %r8d into d.
      *
      * On exit a has been rotated by 16, b by 31 to the right (16 + 15, the
      * same as rol $1) and c by 15 to the left after its XOR, so that using
      * the macro again with the (a,b) and (c,d) arguments swapped finds every
      * register in the form listed above.
      *
      * Clobbers %rdi, %r8, %r9 and the flags.
  77  */
  78 #define encrypt_round(a,b,c,d,round)\
  79         movzx   b ## B,         %edi;\
  80         mov     s1(%r11,%rdi,4),%r8d;\
  81         movzx   a ## B,         %edi;\
  82         mov     s2(%r11,%rdi,4),%r9d;\
  83         movzx   b ## H,         %edi;\
  84         ror     $16,            b ## D;\
  85         xor     s2(%r11,%rdi,4),%r8d;\
  86         movzx   a ## H,         %edi;\
  87         ror     $16,            a ## D;\
  88         xor     s3(%r11,%rdi,4),%r9d;\
  89         movzx   b ## B,         %edi;\
  90         xor     s3(%r11,%rdi,4),%r8d;\
  91         movzx   a ## B,         %edi;\
  92         xor     (%r11,%rdi,4),  %r9d;\
  93         movzx   b ## H,         %edi;\
  94         ror     $15,            b ## D;\
  95         xor     (%r11,%rdi,4),  %r8d;\
  96         movzx   a ## H,         %edi;\
  97         xor     s1(%r11,%rdi,4),%r9d;\
  98         add     %r8d,           %r9d;\
  99         add     %r9d,           %r8d;\
 100         add     k+round(%r11),  %r9d;\
 101         xor     %r9d,           c ## D;\
 102         rol     $15,            c ## D;\
 103         add     k+4+round(%r11),%r8d;\
 104         xor     %r8d,           d ## D;
 105
 106 /*
      * Final encryption round.
      *
 107  * a input register containing a (rotated 16)
 108  * b input register containing b
 109  * c input register containing c
 110  * d input register containing d (already rol $1)
      * round byte offset, relative to k, of the two round-key words to use
      *
 111  * operations on a and b are interleaved to increase performance
 112  * during the round a and b are prepared for the output whitening
      *
      * The S-box lookups, the mixing of the two results and the key addition
      * are the same as in encrypt_round.  The differences are:
      *  - the entry value of b is copied into the upper half of %r10 and a,
      *    once it has been rotated back by 16, is XORed into the lower half,
      *    so %r10 ends up holding the pair (b << 32) | a that the caller
      *    whitens and stores;
      *  - b is not given the extra 15-bit rotation, and c is rotated right by
      *    1 after its XOR instead of left by 15, because no further round
      *    follows.
      *
      * The macro body does not end in ';', so the invocation has to supply
      * the statement separator itself.
      *
      * %r11 must hold the context base address.
      * Clobbers %rdi, %r8, %r9, %r10 and the flags.
 113  */
 114 #define encrypt_last_round(a,b,c,d,round)\
 115         mov     b ## D,         %r10d;\
 116         shl     $32,            %r10;\
 117         movzx   b ## B,         %edi;\
 118         mov     s1(%r11,%rdi,4),%r8d;\
 119         movzx   a ## B,         %edi;\
 120         mov     s2(%r11,%rdi,4),%r9d;\
 121         movzx   b ## H,         %edi;\
 122         ror     $16,            b ## D;\
 123         xor     s2(%r11,%rdi,4),%r8d;\
 124         movzx   a ## H,         %edi;\
 125         ror     $16,            a ## D;\
 126         xor     s3(%r11,%rdi,4),%r9d;\
 127         movzx   b ## B,         %edi;\
 128         xor     s3(%r11,%rdi,4),%r8d;\
 129         movzx   a ## B,         %edi;\
 130         xor     (%r11,%rdi,4),  %r9d;\
 131         xor     a,              %r10;\
 132         movzx   b ## H,         %edi;\
 133         xor     (%r11,%rdi,4),  %r8d;\
 134         movzx   a ## H,         %edi;\
 135         xor     s1(%r11,%rdi,4),%r9d;\
 136         add     %r8d,           %r9d;\
 137         add     %r9d,           %r8d;\
 138         add     k+round(%r11),  %r9d;\
 139         xor     %r9d,           c ## D;\
 140         ror     $1,             c ## D;\
 141         add     k+4+round(%r11),%r8d;\
 142         xor     %r8d,           d ## D
 143
 144 /*
      * One decryption round; used for every round except the last.
      *
 145  * a input register containing a
 146  * b input register containing b (rotated 16)
 147  * c input register containing c (already rol $1)
 148  * d input register containing d
      * round byte offset, relative to k, of the two round-key words to use
      *
 149  * operations on a and b are interleaved to increase performance
      *
      * %r11 must hold the context base address.  The four bytes of a and the
      * four bytes of b each index one of the S-box tables; the words looked
      * up for a are XORed together in %r9d, those for b in %r8d.  The two
      * results are then mixed (%r9d = ra + rb, %r8d = ra + 2 * rb), the key
      * words at k+round and k+4+round are added, and %r9d is XORed into c,
      * %r8d into d.
      *
      * On exit a has been rotated by 31 to the right (16 + 15, the same as
      * rol $1), b by 16 and d by 15 to the left after its XOR, so that using
      * the macro again with the (a,b) and (c,d) arguments swapped finds every
      * register in the form listed above.
      *
      * Clobbers %rdi, %r8, %r9 and the flags.
 150  */
 151 #define decrypt_round(a,b,c,d,round)\
 152         movzx   a ## B,         %edi;\
 153         mov     (%r11,%rdi,4),  %r9d;\
 154         movzx   b ## B,         %edi;\
 155         mov     s3(%r11,%rdi,4),%r8d;\
 156         movzx   a ## H,         %edi;\
 157         ror     $16,            a ## D;\
 158         xor     s1(%r11,%rdi,4),%r9d;\
 159         movzx   b ## H,         %edi;\
 160         ror     $16,            b ## D;\
 161         xor     (%r11,%rdi,4),  %r8d;\
 162         movzx   a ## B,         %edi;\
 163         xor     s2(%r11,%rdi,4),%r9d;\
 164         movzx   b ## B,         %edi;\
 165         xor     s1(%r11,%rdi,4),%r8d;\
 166         movzx   a ## H,         %edi;\
 167         ror     $15,            a ## D;\
 168         xor     s3(%r11,%rdi,4),%r9d;\
 169         movzx   b ## H,         %edi;\
 170         xor     s2(%r11,%rdi,4),%r8d;\
 171         add     %r8d,           %r9d;\
 172         add     %r9d,           %r8d;\
 173         add     k+round(%r11),  %r9d;\
 174         xor     %r9d,           c ## D;\
 175         add     k+4+round(%r11),%r8d;\
 176         xor     %r8d,           d ## D;\
 177         rol     $15,            d ## D;
 178
 179 /*
      * Final decryption round.
      *
 180  * a input register containing a
 181  * b input register containing b (rotated 16)
 182  * c input register containing c (already rol $1)
 183  * d input register containing d
      * round byte offset, relative to k, of the two round-key words to use
      *
 184  * operations on a and b are interleaved to increase performance
 185  * during the round a and b are prepared for the output whitening
      *
      * b is consumed exactly as in decrypt_round (its low byte indexes S3
      * first), i.e. it arrives rotated by 16.  The S-box lookups, the mixing
      * of the two results and the key addition are the same as in
      * decrypt_round.  The differences are:
      *  - once b has been rotated back by 16 it is copied into the upper half
      *    of %r10 and a, still in its entry form, is XORed into the lower
      *    half, so %r10 ends up holding the pair (b << 32) | a that the
      *    caller whitens and stores;
      *  - a is not given the extra 15-bit rotation, and d is rotated right by
      *    1 after its XOR instead of left by 15, because no further round
      *    follows.
      *
      * %r11 must hold the context base address.
      * Clobbers %rdi, %r8, %r9, %r10 and the flags.
 186  */
 187 #define decrypt_last_round(a,b,c,d,round)\
 188         movzx   a ## B,         %edi;\
 189         mov     (%r11,%rdi,4),  %r9d;\
 190         movzx   b ## B,         %edi;\
 191         mov     s3(%r11,%rdi,4),%r8d;\
 192         movzx   b ## H,         %edi;\
 193         ror     $16,            b ## D;\
 194         xor     (%r11,%rdi,4),  %r8d;\
 195         movzx   a ## H,         %edi;\
 196         mov     b ## D,         %r10d;\
 197         shl     $32,            %r10;\
 198         xor     a,              %r10;\
 199         ror     $16,            a ## D;\
 200         xor     s1(%r11,%rdi,4),%r9d;\
 201         movzx   b ## B,         %edi;\
 202         xor     s1(%r11,%rdi,4),%r8d;\
 203         movzx   a ## B,         %edi;\
 204         xor     s2(%r11,%rdi,4),%r9d;\
 205         movzx   b ## H,         %edi;\
 206         xor     s2(%r11,%rdi,4),%r8d;\
 207         movzx   a ## H,         %edi;\
 208         xor     s3(%r11,%rdi,4),%r9d;\
 209         add     %r8d,           %r9d;\
 210         add     %r9d,           %r8d;\
 211         add     k+round(%r11),  %r9d;\
 212         xor     %r9d,           c ## D;\
 213         add     k+4+round(%r11),%r8d;\
 214         xor     %r8d,           d ## D;\
 215         ror     $1,             d ## D;
 216
 217 .align 8
     /*
      * The alignment directive above applies to the code that follows it,
      * i.e. to twofish_enc_blk; twofish_dec_blk gets no separate alignment.
      * Both entry points are made visible to the linker here.
      */
 218 .global twofish_enc_blk
 219 .global twofish_dec_blk
 220
 221 twofish_enc_blk:
             /*
              * Encrypt one 16-byte block.
              *
              * In:    %rdi = context base address (S-boxes, whitening keys,
              *               round keys at the offsets defined in this file)
              *        %rsi = output address, 16 bytes are written
              *        %rdx = input address, 16 bytes are read
              * Out:   %rax = 1
              * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
              * Saved: %rbx (R1) is pushed on entry and popped before return
              *        because it is needed as one of the four registers that
              *        have a high-byte form.
              *
              * No other stack space is used and nothing is called.
              */
 222         pushq    R1
 223
 224         /* %rdi contains the ctx address */
 225         /* %rsi contains the output address */
 226         /* %rdx contains the input address */
 227         /* ctx address is moved to %r11 to free %edi, a non-rex register,
 228         as target for the 8bit high operations (movzx from %ah etc.) */
 229         mov     %rdi,           %r11
 230
 231         movq    (R3),   R1              /* input bytes 0-7: words a and b */
 232         movq    8(R3),  R3              /* input bytes 8-15: words c and d; input pointer is dead now */
 233         input_whitening(R1,%r11,a_offset)
 234         input_whitening(R3,%r11,c_offset)
 235         mov     R1D,    R0D             /* R0D = a */
 236         rol     $16,    R0D             /* the round macro expects a rotated by 16 */
 237         shr     $32,    R1              /* R1D = b */
 238         mov     R3D,    R2D             /* R2D = c */
 239         shr     $32,    R3              /* R3D = d */
 240         rol     $1,     R3D             /* the round macro expects d already rol $1 */
 241
             /* 16 rounds; the (a,b) and (c,d) register pairs swap roles each round */
 242         encrypt_round(R0,R1,R2,R3,0);
 243         encrypt_round(R2,R3,R0,R1,8);
 244         encrypt_round(R0,R1,R2,R3,2*8);
 245         encrypt_round(R2,R3,R0,R1,3*8);
 246         encrypt_round(R0,R1,R2,R3,4*8);
 247         encrypt_round(R2,R3,R0,R1,5*8);
 248         encrypt_round(R0,R1,R2,R3,6*8);
 249         encrypt_round(R2,R3,R0,R1,7*8);
 250         encrypt_round(R0,R1,R2,R3,8*8);
 251         encrypt_round(R2,R3,R0,R1,9*8);
 252         encrypt_round(R0,R1,R2,R3,10*8);
 253         encrypt_round(R2,R3,R0,R1,11*8);
 254         encrypt_round(R0,R1,R2,R3,12*8);
 255         encrypt_round(R2,R3,R0,R1,13*8);
 256         encrypt_round(R0,R1,R2,R3,14*8);
 257         encrypt_last_round(R2,R3,R0,R1,15*8);
 258
 259
             /* %r10 holds the 64-bit pair packed by encrypt_last_round */
 260         output_whitening(%r10,%r11,a_offset)
 261         movq    %r10,   (%rsi)          /* output bytes 0-7 */
 262
             /* pack the remaining two words: R1 = (R1D << 32) | R0D */
 263         shl     $32,    R1
 264         xor     R0,     R1
 265
 266         output_whitening(R1,%r11,c_offset)
 267         movq    R1,     8(%rsi)         /* output bytes 8-15 */
 268
 269         popq    R1
             /* a 32-bit move zero-extends into %rax and has a shorter encoding than movq */
 270         movl    $1,     %eax
 271         ret
     /* mark the symbol as a function and record its size in the ELF symbol table */
     .type   twofish_enc_blk, @function
     .size   twofish_enc_blk, .-twofish_enc_blk
 272
 273 twofish_dec_blk:
             /*
              * Decrypt one 16-byte block.
              *
              * In:    %rdi = context base address (S-boxes, whitening keys,
              *               round keys at the offsets defined in this file)
              *        %rsi = output address, 16 bytes are written
              *        %rdx = input address, 16 bytes are read
              * Out:   %rax = 1
              * Clobb: %rcx, %rdx, %rdi, %r8, %r9, %r10, %r11, flags
              * Saved: %rbx (R1) is pushed on entry and popped before return
              *        because it is needed as one of the four registers that
              *        have a high-byte form.
              *
              * No other stack space is used and nothing is called.
              */
 274         pushq    R1
 275
 276         /* %rdi contains the ctx address */
 277         /* %rsi contains the output address */
 278         /* %rdx contains the input address */
 279         /* ctx address is moved to %r11 to free %edi, a non-rex register,
 280         as target for the 8bit high operations (movzx from %ah etc.) */
 281         mov     %rdi,           %r11
 282
 283         movq    (R3),   R1              /* input bytes 0-7: words a and b */
 284         movq    8(R3),  R3              /* input bytes 8-15: words c and d; input pointer is dead now */
             /* the whitening steps are applied in the reverse order of encryption */
 285         output_whitening(R1,%r11,a_offset)
 286         output_whitening(R3,%r11,c_offset)
 287         mov     R1D,    R0D             /* R0D = a */
 288         shr     $32,    R1              /* R1D = b */
 289         rol     $16,    R1D             /* the round macro expects b rotated by 16 */
 290         mov     R3D,    R2D             /* R2D = c */
 291         shr     $32,    R3              /* R3D = d */
 292         rol     $1,     R2D             /* the round macro expects c already rol $1 */
 293
             /* 16 rounds with the round keys in descending order; the (a,b)
                and (c,d) register pairs swap roles each round */
 294         decrypt_round(R0,R1,R2,R3,15*8);
 295         decrypt_round(R2,R3,R0,R1,14*8);
 296         decrypt_round(R0,R1,R2,R3,13*8);
 297         decrypt_round(R2,R3,R0,R1,12*8);
 298         decrypt_round(R0,R1,R2,R3,11*8);
 299         decrypt_round(R2,R3,R0,R1,10*8);
 300         decrypt_round(R0,R1,R2,R3,9*8);
 301         decrypt_round(R2,R3,R0,R1,8*8);
 302         decrypt_round(R0,R1,R2,R3,7*8);
 303         decrypt_round(R2,R3,R0,R1,6*8);
 304         decrypt_round(R0,R1,R2,R3,5*8);
 305         decrypt_round(R2,R3,R0,R1,4*8);
 306         decrypt_round(R0,R1,R2,R3,3*8);
 307         decrypt_round(R2,R3,R0,R1,2*8);
 308         decrypt_round(R0,R1,R2,R3,1*8);
 309         decrypt_last_round(R2,R3,R0,R1,0);
 310
             /* %r10 holds the 64-bit pair packed by decrypt_last_round */
 311         input_whitening(%r10,%r11,a_offset)
 312         movq    %r10,   (%rsi)          /* output bytes 0-7 */
 313
             /* pack the remaining two words: R1 = (R1D << 32) | R0D */
 314         shl     $32,    R1
 315         xor     R0,     R1
 316
 317         input_whitening(R1,%r11,c_offset)
 318         movq    R1,     8(%rsi)         /* output bytes 8-15 */
 319
 320         popq    R1
             /* a 32-bit move zero-extends into %rax and has a shorter encoding than movq */
 321         movl    $1,     %eax
 322         ret
     /* mark the symbol as a function and record its size in the ELF symbol table */
     .type   twofish_dec_blk, @function
     .size   twofish_dec_blk, .-twofish_dec_blk