/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
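	/*
	 * Helper macros: __tbl permutes the 16 bytes of \in through the table
	 * in \tbl using two vtbl.8 lookups (one per d-register half), taking
	 * a scratch register when the output aliases the table; __ldr loads a
	 * 16-byte constant into a q-register half by half; __adr takes the
	 * address of a local label, setting the Thumb bit when building for
	 * Thumb2.
	 */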
	.macro		__tbl, out, tbl, in, tmp
	.ifc		\out, \tbl
	.ifb		\tmp
	.error		__tbl needs temp register if out == tbl
	.endif
	vmov		\tmp, \tbl
	.endif
	vtbl.8		\out\()l, {\tbl}, \in\()l
	.ifc		\out, \tbl
	vtbl.8		\out\()h, {\tmp}, \in\()h
	.else
	vtbl.8		\out\()h, {\tbl}, \in\()h
	.endif
	.endm
	.macro		__ldr, out, sym
	vldr		\out\()l, \sym
	vldr		\out\()h, \sym + 8
	.endm
	.macro		__adr, reg, lbl
	adr		\reg, \lbl
THUMB(	orr		\reg, \reg, #1		)
	.endm
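	/*
	 * in_bs_ch/out_bs_ch and inv_in_bs_ch/inv_out_bs_ch apply the linear
	 * change of basis used before and after the shared GF(2^8) inverter,
	 * so that the same inversion code serves both the forward and the
	 * inverse S-box.
	 */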
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
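	/*
	 * GF(2^8) inversion is performed in a tower of fields, as described
	 * in the Kaesper/Schwabe paper: mul_gf4 and mul_gf4_n_gf4 are GF(2^2)
	 * multiplications (the latter doing two at once), mul_gf16_2 builds
	 * two GF(2^4) multiplications on top of them, and inv_gf256 combines
	 * these into an inversion of all eight bit slices in parallel.
	 */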
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
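	/*
	 * The forward and inverse S-box differ only in the basis changes
	 * applied around the shared inverter. The sbox macro leaves its
	 * outputs in a permuted register order (see the out_bs_ch operands),
	 * which is why the callers below refer to the state as
	 * q0, q1, q4, q6, q3, q7, q2, q5 after a pass through sbox/inv_sbox.
	 */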
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.endm
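	/*
	 * ShiftRows acts on every bit slice in the same way, so it reduces to
	 * a single byte permutation (\mask, precomputed as SR/SRM0 and
	 * ISR/ISRM0 below) applied to each slice with __tbl. The forward
	 * variant also fetches the next bit-sliced round key from [bskey] and
	 * folds it into the state along the way.
	 */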
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				    mask, t0, t1, t2, t3
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t0, \t0, \x0
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t1, \t1, \x1
	__tbl		\x0, \t0, \mask
	veor		\t2, \t2, \x2
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	veor		\t3, \t3, \x3
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	veor		\t0, \t0, \x4
	veor		\t1, \t1, \x5
	__tbl		\x4, \t0, \mask
	veor		\t2, \t2, \x6
	__tbl		\x5, \t1, \mask
	veor		\t3, \t3, \x7
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.endm
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
					mask, t0, t1, t2, t3
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.endm
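	/*
	 * In the bit-sliced representation each q register holds one bit of
	 * every byte of all eight AES states, so MixColumns and its inverse
	 * reduce to XORs of byte-rotated copies of the slices: the vext.8
	 * instructions below rotate a slice by a whole number of 32-bit
	 * words, and the rotated copies are folded together with veor.
	 */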
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12

	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
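	// Each bit-sliced round key occupies 8 q-registers (128 bytes). The
	// post-indexed loads above advance bskey by 96 bytes, and the sub
	// winds it back by 224, i.e. one full round key before where it
	// started, so consecutive decryption rounds consume the key schedule
	// in reverse order.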
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
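	/*
	 * swapmove_2x is the classic SWAPMOVE bit permutation applied to two
	 * register pairs at once:
	 *
	 *	t  = ((b >> n) ^ a) & mask
	 *	a ^= t
	 *	b ^= t << n
	 *
	 * i.e. the bits of a selected by mask are exchanged with the bits of
	 * b that sit n positions higher.
	 */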
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	veor		\t0, \t0, \a0
	veor		\t1, \t1, \a1
	vand		\t0, \t0, \mask
	vand		\t1, \t1, \mask
	veor		\a0, \a0, \t0
	vshl.u64	\t0, \t0, #\n
	veor		\a1, \a1, \t1
	vshl.u64	\t1, \t1, #\n
	veor		\b0, \b0, \t0
	veor		\b1, \b1, \t1
	.endm
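	/*
	 * bitslice transposes, per byte position, the 8x8 bit matrix formed
	 * by eight 128-bit blocks: three swapmove passes at distances 1, 2
	 * and 4, with masks 0x55, 0x33 and 0x0f, leave bit i of every byte
	 * collected in the i'th register. The transform is its own inverse,
	 * which is why the same macro also converts back to the normal
	 * representation after the last round.
	 */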
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	vmov.i8		\t0, #0x55
	vmov.i8		\t1, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	vmov.i8		\t0, #0x0f
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm
M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
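	// Convert a key schedule produced by the generic AES key expansion
	// into the layout used by the routines below: a 16-byte round 0 key,
	// one 128-byte (8 x 16 bytes, one register per bit position)
	// bit-sliced key per inner round, and a 16-byte final round key with
	// the S-box constant 0x63 XORed in to account for the affine constant
	// of SubBytes, which the bit-sliced S-box does not add itself.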
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8, #0x01		// bit masks

	vst1.8		{q7}, [r0, :128]!	// save round 0 key

	vld1.32		{q15}, [r1]!		// load next round key

	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!

	vmov.i8		q7, #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01
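	/*
	 * aesbs_encrypt8: encrypt eight AES blocks held in q0-q7 using the
	 * bit-sliced key schedule pointed to by bskey. The round 0 key is
	 * XORed in and the bytes are permuted through M0SR (by its name, the
	 * bit-slicing reorder M0 combined with a ShiftRows step) before the
	 * state is bit-sliced; the round loop then applies ShiftRows,
	 * SubBytes and MixColumns entirely on the sliced state.
	 */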
aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d
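	// SR is the ShiftRows byte permutation used by the inner rounds;
	// going by the naming shared with the OpenSSL bsaes code, SRM0 folds
	// the M0 reordering into the permutation used for the final round,
	// just before the state is un-bit-sliced.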
	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
			q13, q14, q15
	subs		rounds, rounds, #1

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
			q13, q14, q15

	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
ENDPROC(aesbs_encrypt8)
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509
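	/*
	 * aesbs_decrypt8: decrypt eight AES blocks held in q0-q7. The key
	 * schedule is consumed back to front: bskey is advanced by
	 * rounds * 128 bytes and rewound by 112, which lands on the 16-byte
	 * final round key stored at the end by aesbs_convert_key; that key
	 * provides the initial whitening, and the bit-sliced round keys are
	 * then walked in reverse (see inv_mix_cols above).
	 */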
aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d
	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
			q13, q14, q15
	subs		rounds, rounds, #1

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
			q13, q14, q15

	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
ENDPROC(aesbs_decrypt8)
	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks
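	// The "computed goto" idiom used throughout: ip is pointed at the end
	// of a run of identical per-block instructions and pulled back by the
	// per-block code size times the low three bits of the block count, so
	// that when fewer than eight blocks remain only the tail of the run
	// is executed; the same offset is applied to the matching load and
	// store runs, so the register slots used on both sides line up.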
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
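	// CBC decryption: up to eight ciphertext blocks are decrypted per
	// iteration and each result is XORed with the preceding ciphertext
	// block (the IV for the first one); q8 carries the IV for the next
	// iteration, which is written back through r6 when done.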
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv
ENDPROC(aesbs_cbc_decrypt)
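	// Reload the lanes of the counter register from the 32-bit words kept
	// in r7-r10; the increment itself is carried out on those core
	// registers elsewhere in the CTR routine below.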
	vmov.32		\q\()h[1], r10
	vmov.32		\q\()h[0], r9
	vmov.32		\q\()l[1], r8
	vmov.32		\q\()l[0], r7
	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 ctr[], u8 final[])
	 */
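	// A non-zero 'final' argument requests one extra pass through the
	// cipher for a trailing partial block: r5 is bumped so the extra
	// counter block takes the normal path, while the 'teq r4, #0' tests
	// below skip the regular load and store for that final block.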
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5-r7}		// load args 4-6
	addne		r5, r5, #1		// one extra block if final != 0

	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #5
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	ldrle		r4, [sp, #40]		// load final in the last round
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'
ENDPROC(aesbs_ctr_encrypt)
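	/*
	 * next_tweak: multiply the XTS tweak in \in by x in GF(2^128), modulo
	 * x^128 + x^7 + x^2 + x + 1. Both 64-bit halves are doubled with
	 * vadd.u64; the bit shifted out of each half is recovered by
	 * broadcasting its sign bit, masking with the tweak constant (1 in
	 * one half for the carry between halves, 0x87 in the other for the
	 * reduction), swapping the halves with vext.8 #8 and XORing the
	 * result back in.
	 */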
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */
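	// __xts_prepare8 computes the tweaks for up to eight blocks: starting
	// from the IV at [r7], successive tweaks are derived with next_tweak
	// and stashed in the scratch area reserved on the stack by
	// __xts_crypt (addressed through r4), so they can be XORed into the
	// data before and after the AES pass; the tweak following the last
	// block is written back as the next IV.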
__xts_prepare8:
	vld1.8		{q14}, [r7]		// load iv
	__ldr		q15, .Lxts_mul_x	// load tweak mask

	sub		ip, ip, r4, lsl #5
	bxlt		ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]

0:	vst1.8		{q14}, [r7]		// store next iv
ENDPROC(__xts_prepare8)
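	// __xts_crypt carves an aligned 128-byte scratch area out of the
	// stack for the eight tweaks, calls __xts_prepare8, runs the
	// eight-block en/decryption, then reloads the tweaks from the scratch
	// area and XORs them into the results before storing them.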
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes
	mov		sp, ip

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)