/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro		__tbl, out, tbl, in, tmp
	.error		__tbl needs temp register if out == tbl
	vtbl.8		\out\()l, {\tbl}, \in\()l
	vtbl.8		\out\()h, {\tmp}, \in\()h
	vtbl.8		\out\()h, {\tbl}, \in\()h
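	/*
	 * Explanatory note (added, not part of the original source): __tbl
	 * performs a 16-byte table lookup by running vtbl.8 on both
	 * d-register halves of a q register.  When the output register
	 * aliases the table register, the first lookup would clobber the
	 * table before the second one runs, so a scratch register must be
	 * supplied to hold a copy of the table; hence the .error above.
	 */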
	.macro		__ldr, out, sym
	vldr		\out\()h, \sym + 8
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3

	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2

	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2

	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, s0, s1, s2, s3

	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
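	/*
	 * Explanatory note (added, not part of the original source):
	 * following the Kaesper/Schwabe construction referenced in the
	 * header, inv_gf256 computes the bitsliced multiplicative inverse
	 * in GF(2^8) via the tower field GF(((2^2)^2)^2), built from the
	 * GF(2^4) helpers above.  The sbox/inv_sbox macros below then
	 * realise
	 *
	 *	S(x) = A(x^-1) ^ 0x63
	 *
	 * i.e. the inverse followed by the AES affine transform, with the
	 * in/out basis-change macros mapping to and from the bitsliced
	 * representation.
	 */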
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3
	.endm

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.endm
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x0, \t0, \mask
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x4, \t0, \mask
	__tbl		\x5, \t1, \mask
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7, inv
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12

	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224

	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8

	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n

	vshl.u64	\t0, \t0, #\n

	vshl.u64	\t1, \t1, #\n
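	/*
	 * Explanatory sketch (added, not part of the original source): each
	 * swapmove step is the classic bit-matrix transpose primitive,
	 * roughly equivalent to the following C on every 64-bit lane:
	 *
	 *	t  = ((b >> n) ^ a) & mask;
	 *	a ^= t;
	 *	b ^= t << n;
	 *
	 * which swaps the bits of a selected by mask with the bits of b that
	 * sit n positions higher.  The bitslice macro below applies it with
	 * n = 1, 2 and 4 to move the eight AES state registers into (and
	 * out of) bitsliced form.
	 */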
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3

M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 */
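	/*
	 * Illustrative usage from C (an assumption, not part of this file):
	 * the caller passes a conventionally expanded key schedule, which
	 * this routine rewrites into the bitsliced layout consumed by
	 * aesbs_encrypt8/aesbs_decrypt8, e.g.
	 *
	 *	u32 rk[AES_MAX_KEYLENGTH_U32];	// expanded round keys
	 *	u8  bskey[13 * 128 + 32];	// bitsliced key (size assumed)
	 *	int rounds = 6 + key_len / 4;	// 10/12/14 for AES-128/192/256
	 *
	 *	aesbs_convert_key(bskey, rk, rounds);
	 */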
ENTRY(aesbs_convert_key)
	vld1.32		{q7},  [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8,  #0x01		// bit masks

	vst1.8		{q7},  [r0, :128]!	// save round 0 key

	vld1.32		{q15}, [r1]!		// load next round key

	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!

	vmov.i8		q7,  #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01

aesbs_encrypt8:
	vld1.8		{q9}, [bskey, :128]!	// round 0 key

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d

	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \

	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11

ENDPROC(aesbs_encrypt8)
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509

aesbs_decrypt8:
	add		bskey, bskey, rounds, lsl #7
	sub		bskey, bskey, #112
	vld1.8		{q9}, [bskey, :128]	// round 0 key
	sub		bskey, bskey, #128

	veor		q10, q0, q9		// xor with round0 key

	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11

	sub		rounds, rounds, #1
ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d

	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12

	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \

	subs		rounds, rounds, #1

	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \

	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key

	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11

ENDPROC(aesbs_decrypt8)
	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 */
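	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 * the routines consume the whole block count, working on up to
	 * eight blocks per pass through the bitsliced core, e.g.
	 *
	 *	aesbs_ecb_encrypt(dst, src, bskey, rounds, nblocks);
	 *	aesbs_ecb_decrypt(dst, src, bskey, rounds, nblocks);
	 *
	 * with bskey produced by aesbs_convert_key() above.
	 */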
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8
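	/*
	 * Explanatory note on the "computed goto" idiom used throughout this
	 * file (added, not part of the original source): ip is assumed to
	 * point just past a straight-line run of eight 4-byte load/store
	 * instructions and lr to hold the residual block count, so
	 * subtracting lr, lsl #2 (4 bytes per instruction) from ip and
	 * branching there when fewer than eight blocks remain skips the
	 * leading instructions and executes only the last 'blocks' of them.
	 * The variants with other shift amounts do the same where each
	 * per-block step is longer than a single instruction.
	 */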
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[])
	 */
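	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 *
	 *	aesbs_cbc_decrypt(dst, src, bskey, rounds, nblocks, iv);
	 *
	 * iv holds the 16-byte IV on entry and, per the "store next round's
	 * iv" below, is updated for the following call, so chained calls
	 * over a long message can reuse the same buffer.
	 */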
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv

ENDPROC(aesbs_cbc_decrypt)
	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int bytes, u8 ctr[])
	 */
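	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 * unlike the ECB/CBC/XTS entry points, this one takes a byte count
	 * rather than a block count (the permute-table tail handling below
	 * copes with a final partial block), e.g.
	 *
	 *	aesbs_ctr_encrypt(dst, src, bskey, rounds, nbytes, ctr);
	 *
	 * where ctr[] is the 16-byte counter block, assumed to be advanced
	 * by the routine for subsequent calls.
	 */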
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5, r6}		// load args 4-5
	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #1
	add		ip, ip, lr, lsr #2
	movle		pc, ip			// computed goto if bytes < 112

	ands		r4, r5, #15		// preserves C flag
	teqcs		r5, r5			// set Z flag if not last iteration
	sub		ip, ip, lr, lsr #2
	movcc		pc, ip			// computed goto if bytes < 128

	movcc		pc, ip			// computed goto if bytes < 128

3:	adr		lr, .Lpermute_table + 16
	cmp		r5, #16			// Z flag remains cleared
	vtbl.8		d16, {q5}, d16
	vtbl.8		d17, {q5}, d17
	bcc		4f			// have to reload prev if r5 < 16
	vtbx.8		d10, {q2}, d18
	vtbx.8		d11, {q2}, d19
	mov		pc, ip			// branch back to VST sequence

	vshr.s8		q9, q9, #7		// create mask for VBIF
	vld1.8		{q8}, [r0]		// reload
ENDPROC(aesbs_ctr_encrypt)
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
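	/*
	 * Illustrative C model (added, not part of the original source):
	 * next_tweak multiplies the 128-bit tweak by x in GF(2^128) with
	 * the XTS polynomial x^128 + x^7 + x^2 + x + 1, treating the tweak
	 * as a little-endian 128-bit integer:
	 *
	 *	void next_tweak(u8 t[16])	// hypothetical C helper
	 *	{
	 *		int carry = t[15] & 0x80;
	 *		int i;
	 *
	 *		for (i = 15; i > 0; i--)
	 *			t[i] = (t[i] << 1) | (t[i - 1] >> 7);
	 *		t[0] <<= 1;
	 *		if (carry)
	 *			t[0] ^= 0x87;
	 *	}
	 *
	 * which matches the 0x87 tweak mask composed in __xts_prepare8 below.
	 */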
	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[], int reorder_last_tweak)
	 */
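	/*
	 * Illustrative call from C (an assumption, not part of this file):
	 *
	 *	aesbs_xts_encrypt(dst, src, bskey, rounds, nblocks, iv, 0);
	 *	aesbs_xts_decrypt(dst, src, bskey, rounds, nblocks, iv, reorder);
	 *
	 * iv carries the current tweak in and the next tweak out.  The
	 * reorder_last_tweak flag is hardcoded to zero on the encrypt side
	 * (see ENTRY(aesbs_xts_encrypt) below) and read from the stack on
	 * the decrypt side, presumably so that ciphertext-stealing callers
	 * can have the final two tweaks applied in swapped order.
	 */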
__xts_prepare8:
	vld1.8		{q14}, [r7]		// load iv
	vmov.i32	d30, #0x87		// compose tweak mask vector
	vshr.u64	d30, d31, #7

	sub		ip, ip, r4, lsl #5
	movlt		pc, ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!

	next_tweak	q14, q12, q15, q13

	vst1.8		{q12}, [r4, :128]
	vst1.8		{q14}, [r7]		// store next iv
ENDPROC(__xts_prepare8)
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	movlt		pc, ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	movlt		pc, ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	mov		ip, #0			// never reorder final tweak
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	ldr		ip, [sp, #8]		// reorder final tweak?
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)