/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro	__tbl, out, tbl, in, tmp
	.error	"__tbl needs temp register if out == tbl"
	vtbl.8	\out\()l, {\tbl}, \in\()l
	vtbl.8	\out\()h, {\tmp}, \in\()h
	vtbl.8	\out\()h, {\tbl}, \in\()h

	.macro	__ldr, out, sym
	vldr	\out\()h, \sym + 8
	.macro	in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro	inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro	inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2

	.macro	mul_gf4, x0, x1, y0, y1, t0, t1

	.macro	mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro	mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			    y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro	inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			   t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.macro	sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
		      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3

	.macro	inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.macro	shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
			    t0, t1, t2, t3, mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x0, \t0, \mask
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x4, \t0, \mask
	__tbl		\x5, \t1, \mask
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask

	.macro	inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
				t0, t1, t2, t3, mask
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.macro	mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			  t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12
	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8

	.macro	inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			      t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8

	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.macro	swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	vshl.u64	\t0, \t0, #\n
	vshl.u64	\t1, \t1, #\n

	.macro	bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
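
	/*
	 * A minimal C model of the swapmove step used above (illustration
	 * only, not part of this file): it exchanges the bits of 'a' and 'b'
	 * selected by 'mask', at bit distance 'n'. Applied repeatedly with
	 * n = 1, 2 and 4 as in the bitslice macro, this transposes eight AES
	 * blocks into bit-sliced form, one bit position per register.
	 *
	 *	#include <stdint.h>
	 *
	 *	static inline void swapmove(uint64_t *a, uint64_t *b, int n,
	 *				    uint64_t mask)
	 *	{
	 *		uint64_t t = ((*b >> n) ^ *a) & mask;
	 *
	 *		*a ^= t;
	 *		*b ^= t << n;
	 *	}
	 */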
M0:	.quad	0x02060a0e03070b0f, 0x0004080c0105090d

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
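	/*
	 * Hedged C sketch (not the kernel glue code itself) of how a caller
	 * could produce the bit-sliced key schedule consumed by the routines
	 * below. aes_expandkey() and kernel_neon_begin()/kernel_neon_end()
	 * are generic kernel helpers; the buffer size follows the store
	 * pattern of this routine: a 16-byte round-0 key, 128 bytes per
	 * inner round, and a 16-byte final key (worst case 14 rounds).
	 *
	 *	struct crypto_aes_ctx rk;
	 *	int rounds = 6 + key_len / 4;		// 10, 12 or 14
	 *	u8 bskey[13 * 128 + 32];
	 *	int err = aes_expandkey(&rk, in_key, key_len);
	 *
	 *	if (!err) {
	 *		kernel_neon_begin();
	 *		aesbs_convert_key(bskey, rk.key_enc, rounds);
	 *		kernel_neon_end();
	 *	}
	 */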
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8, #0x01		// bit masks

	vst1.8		{q7}, [r0, :128]!	// save round 0 key

	vld1.32		{q15}, [r1]!		// load next round key

	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!

	vmov.i8		q7, #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
M0SR:	.quad	0x0a0e02060f03070b, 0x0004080c05090d01

aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1

SR:	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad	0x0304090e00050a0f, 0x01060b0c0207080d

	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \

	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11

ENDPROC(aesbs_encrypt8)
M0ISR:	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509

aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112	// rounds * 128 - 112: last round key
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1

ISR:	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad	0x01040b0e0205080f, 0x0306090c00070a0d

	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \

	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11

ENDPROC(aesbs_decrypt8)
	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
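	/*
	 * Hedged usage sketch (illustrative, not the kernel glue code): these
	 * routines loop over the input themselves, processing up to eight
	 * blocks per pass of the bit-sliced cipher, so a caller only hands
	 * over the whole run together with the key schedule produced by
	 * aesbs_convert_key(). The wrapper name below is hypothetical.
	 *
	 *	static void ecb_encrypt_all(u8 *dst, const u8 *src, int nbytes,
	 *				    const u8 *bskey, int rounds)
	 *	{
	 *		int blocks = nbytes / 16;	// AES_BLOCK_SIZE
	 *
	 *		kernel_neon_begin();
	 *		aesbs_ecb_encrypt(dst, src, bskey, rounds, blocks);
	 *		kernel_neon_end();
	 *	}
	 */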
	.macro	__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
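	/*
	 * Illustrative C model of CBC decryption (not part of this file, and
	 * sequential rather than eight blocks at a time): decryption can be
	 * parallelised because the value XORed into block i is simply the
	 * ciphertext of block i - 1, which is already known. The single-block
	 * helper aes_decrypt_one() is hypothetical.
	 *
	 *	#include <string.h>
	 *
	 *	static void cbc_decrypt_model(u8 *out, const u8 *in, int blocks,
	 *				      u8 iv[16])
	 *	{
	 *		u8 prev[16], ctext[16], tmp[16];
	 *		int i, j;
	 *
	 *		memcpy(prev, iv, 16);
	 *		for (i = 0; i < blocks; i++) {
	 *			memcpy(ctext, in + 16 * i, 16);	// out may alias in
	 *			aes_decrypt_one(tmp, ctext);	// hypothetical helper
	 *			for (j = 0; j < 16; j++)
	 *				out[16 * i + j] = tmp[j] ^ prev[j];
	 *			memcpy(prev, ctext, 16);
	 *		}
	 *		memcpy(iv, prev, 16);	// chain into the next call
	 *	}
	 */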
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv
ENDPROC(aesbs_cbc_decrypt)
	.macro	next_ctr, q
	vmov.32		\q\()h[1], r10
	vmov.32		\q\()h[0], r9
	vmov.32		\q\()l[1], r8
	vmov.32		\q\()l[0], r7
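
	/*
	 * Illustrative C model (not part of this file) of what the counter
	 * handling amounts to, assuming the usual convention that the 16-byte
	 * CTR block is a 128-bit big-endian integer: increment the last byte
	 * and let the carry ripple towards the first one.
	 *
	 *	#include <stdint.h>
	 *
	 *	static void ctr_increment_be128(uint8_t ctr[16])
	 *	{
	 *		int i;
	 *
	 *		for (i = 15; i >= 0; i--)
	 *			if (++ctr[i] != 0)
	 *				break;	// stop once a byte did not wrap
	 *	}
	 */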
	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 ctr[], u8 final[])
	 */
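	/*
	 * Hedged usage sketch (illustrative, not the kernel glue code): for a
	 * whole number of blocks, 'final' is NULL and 'ctr' holds the counter
	 * block that this routine consumes, presumably advancing it for the
	 * caller's next invocation. Judging by the 'one extra block if
	 * final != 0' handling below, a non-NULL 'final' requests one
	 * additional block of output for a trailing partial block; how it is
	 * combined with the partial data is left to the glue code.
	 *
	 *	kernel_neon_begin();
	 *	aesbs_ctr_encrypt(dst, src, bskey, rounds, blocks, ctr, NULL);
	 *	kernel_neon_end();
	 */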
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5-r7}		// load args 4-6
	addne		r5, r5, #1		// one extra block if final != 0

	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #5
	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	ldrle		r4, [sp, #40]		// load final in the last round
	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'

ENDPROC(aesbs_ctr_encrypt)
	.macro	next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
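
	/*
	 * C model of the next_tweak step above (illustration only, not part
	 * of this file): the 16-byte XTS tweak is treated as a little-endian
	 * 128-bit value and multiplied by x in GF(2^128), reducing modulo
	 * x^128 + x^7 + x^2 + x + 1; the 0x87 constant composed below is that
	 * reduction polynomial.
	 *
	 *	#include <stdint.h>
	 *	#include <string.h>
	 *
	 *	static void next_tweak_model(uint8_t out[16], const uint8_t in[16])
	 *	{
	 *		uint64_t lo, hi, carry;
	 *
	 *		memcpy(&lo, in, 8);		// assumes little-endian host
	 *		memcpy(&hi, in + 8, 8);
	 *
	 *		carry = hi >> 63;		// bit 127 set: reduce
	 *		hi = (hi << 1) | (lo >> 63);	// bit 63 carries into high half
	 *		lo = (lo << 1) ^ (carry * 0x87);
	 *
	 *		memcpy(out, &lo, 8);
	 *		memcpy(out + 8, &hi, 8);
	 *	}
	 */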
	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 */
__xts_prepare8:
	vld1.8		{q14}, [r7]		// load iv
	vmov.i32	d30, #0x87		// compose tweak mask vector
	vshr.u64	d30, d31, #7

	sub		ip, ip, r4, lsl #5
	movlt		pc, ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13

	vst1.8		{q12}, [r4, :128]

	vst1.8		{q14}, [r7]		// store next iv

ENDPROC(__xts_prepare8)
	.macro	__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args

	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	mov		ip, #0			// never reorder final tweak
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	ldr		ip, [sp, #8]		// reorder final tweak?
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)