/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2017 Linaro Ltd.
 * Author: Ard Biesheuvel <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */
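/*
 * A note on the data layout used throughout this file: the core
 * transforms operate on eight AES blocks at a time. After the bitslice
 * transform, register qN no longer holds a block but a bit plane: bit N
 * of every byte of all eight inputs. The S-box can then be evaluated as
 * a fixed sequence of logic instructions on q0-q7, with no data
 * dependent table lookups or branches, which is what makes this
 * approach resistant to cache timing attacks (see the paper referenced
 * above for the full construction).
 */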

#include <linux/linkage.h>
#include <asm/assembler.h>
	.macro		__tbl, out, tbl, in, tmp
	.error		"__tbl needs temp register if out == tbl"
	vtbl.8		\out\()l, {\tbl}, \in\()l	// vtbl permutes D regs, so do each half
	vtbl.8		\out\()h, {\tmp}, \in\()h	// high half via the temp copy (out == tbl)
	vtbl.8		\out\()h, {\tbl}, \in\()h	// high half
	.macro		__ldr, out, sym
	vldr		\out\()h, \sym + 8

	.macro		__adr, reg, lbl
THUMB(	orr		\reg, \reg, #1		)	// keep the Thumb bit set for bx
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7	// S-box input linear layer

	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7	// S-box output linear layer

	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5	// inverse S-box input linear layer

	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2	// inverse S-box output linear layer
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1

	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1

	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
			y0, y1, y2, y3, t0, t1, t2, t3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3

	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
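/*
 * The AES S-box is the composition of an inversion in GF(2^8) with an
 * affine transform (and the reverse order for the inverse S-box), so
 * the bit-sliced sbox/inv_sbox below consist of a change of basis on
 * the way in, the shared inv_gf256 inversion circuit, and a change of
 * basis on the way out. The affine constant 0x63 is not applied here;
 * aesbs_convert_key below folds it into the round keys instead (note
 * the 'compose .L63' constant).
 */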
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b6, \b5, \b0, \b3, \b7, \b1, \b4, \b2, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	out_bs_ch	\b7, \b1, \b4, \b2, \b6, \b5, \b0, \b3

	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0, \b1, \b2, \b3, \b4, \b5, \b6, \b7
	inv_gf256	\b5, \b1, \b2, \b6, \b3, \b7, \b0, \b4, \
			\t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
	inv_out_bs_ch	\b3, \b7, \b0, \b4, \b5, \b1, \b2, \b6
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x0, \t0, \mask
	__tbl		\x1, \t1, \mask
	vld1.8		{\t0-\t1}, [bskey, :256]!
	__tbl		\x2, \t2, \mask
	__tbl		\x3, \t3, \mask
	vld1.8		{\t2-\t3}, [bskey, :256]!
	__tbl		\x4, \t0, \mask
	__tbl		\x5, \t1, \mask
	__tbl		\x6, \t2, \mask
	__tbl		\x7, \t3, \mask
	.macro		inv_shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, mask
	__tbl		\x0, \x0, \mask, \t0
	__tbl		\x1, \x1, \mask, \t1
	__tbl		\x2, \x2, \mask, \t2
	__tbl		\x3, \x3, \mask, \t3
	__tbl		\x4, \x4, \mask, \t0
	__tbl		\x5, \x5, \mask, \t1
	__tbl		\x6, \x6, \mask, \t2
	__tbl		\x7, \x7, \mask, \t3
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7, inv	// MixColumns (InvMixColumns when \inv is set)
	vext.8		\t0, \x0, \x0, #12
	vext.8		\t1, \x1, \x1, #12
	vext.8		\t2, \x2, \x2, #12
	vext.8		\t3, \x3, \x3, #12
	vext.8		\t4, \x4, \x4, #12
	vext.8		\t5, \x5, \x5, #12
	vext.8		\t6, \x6, \x6, #12
	vext.8		\t7, \x7, \x7, #12
	vext.8		\x0, \x0, \x0, #8
	vext.8		\x1, \x1, \x1, #8
	vext.8		\t0, \x4, \x4, #8
	vext.8		\t1, \x5, \x5, #8
	vext.8		\x4, \x3, \x3, #8
	vext.8		\x5, \x7, \x7, #8
	vext.8		\x3, \x6, \x6, #8
	vext.8		\x6, \x2, \x2, #8
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
			t0, t1, t2, t3, t4, t5, t6, t7
	vld1.8		{\t0-\t1}, [bskey, :256]!
	vld1.8		{\t2-\t3}, [bskey, :256]!
	vld1.8		{\t4-\t5}, [bskey, :256]!
	vld1.8		{\t6-\t7}, [bskey, :256]
	sub		bskey, bskey, #224
	vext.8		\t0, \x0, \x0, #8
	vext.8		\t6, \x6, \x6, #8
	vext.8		\t7, \x7, \x7, #8
	vext.8		\t1, \x1, \x1, #8
	vext.8		\t2, \x2, \x2, #8
	vext.8		\t3, \x3, \x3, #8
	vext.8		\t4, \x4, \x4, #8
	vext.8		\t5, \x5, \x5, #8
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
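/*
 * swapmove_2x below is a two-at-a-time version of the classic SWAPMOVE
 * bit permutation step, which the bitslice macro applies with shift
 * amounts 1, 2 and 4 to transpose the 8x8 bit matrix spread across the
 * eight NEON registers. As a scalar sketch (illustration only, not code
 * from this file), one SWAPMOVE of a and b by n bits under a mask is:
 *
 *	t  = ((b >> n) ^ a) & mask;
 *	a ^= t;
 *	b ^= t << n;
 *
 * i.e. the masked bits of a are exchanged with the bits n positions
 * higher up in b.
 */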
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	vshr.u64	\t0, \b0, #\n
	vshr.u64	\t1, \b1, #\n
	vshl.u64	\t0, \t0, #\n
	vshl.u64	\t1, \t1, #\n
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3

M0:	.quad		0x02060a0e03070b0f, 0x0004080c0105090d
/*
 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
 */
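/*
 * Rough usage sketch for the C glue code (aes_expandkey(), the
 * kernel_neon_begin()/kernel_neon_end() bracketing and the rk buffer
 * size are assumptions about that glue, not defined in this file):
 *
 *	struct crypto_aes_ctx gen;
 *	u8 rk[13 * 8 * AES_BLOCK_SIZE + 32];	// bit-sliced key schedule
 *	int rounds = 6 + key_len / 4;		// 10/12/14 for AES-128/192/256
 *
 *	aes_expandkey(&gen, in_key, key_len);	// standard AES key schedule
 *	kernel_neon_begin();
 *	aesbs_convert_key(rk, gen.key_enc, rounds);
 *	kernel_neon_end();
 */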
ENTRY(aesbs_convert_key)
	vld1.32		{q7}, [r1]!		// load round 0 key
	vld1.32		{q15}, [r1]!		// load round 1 key

	vmov.i8		q8, #0x01		// bit masks
	vst1.8		{q7}, [r0, :128]!	// save round 0 key
	vld1.32		{q15}, [r1]!		// load next round key
	vst1.8		{q0-q1}, [r0, :256]!
	vst1.8		{q2-q3}, [r0, :256]!
	vst1.8		{q4-q5}, [r0, :256]!
	vst1.8		{q6-q7}, [r0, :256]!
	vmov.i8		q7, #0x63		// compose .L63
	vst1.8		{q15}, [r0, :128]
ENDPROC(aesbs_convert_key)
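/*
 * Layout of the converted key schedule produced above, as consumed by
 * the encrypt8/decrypt8 routines below: a 16 byte round 0 key, then
 * eight bit-sliced 16 byte vectors (128 bytes) per inner round, then a
 * 16 byte final round key, i.e. 16 + (rounds - 1) * 128 + 16 bytes in
 * total (which is what the bskey pointer arithmetic in aesbs_decrypt8
 * relies on).
 */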
M0SR:	.quad		0x0a0e02060f03070b, 0x0004080c05090d01

	vld1.8		{q9}, [bskey, :128]!	// round 0 key
	veor		q10, q0, q9		// xor with round0 key
	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
	sub		rounds, rounds, #1

SR:	.quad		0x0504070600030201, 0x0f0e0d0c0a09080b
SRM0:	.quad		0x0304090e00050a0f, 0x01060b0c0207080d

	shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
	sbox		q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
	subs		rounds, rounds, #1
	mix_cols	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11, q12, \
	vld1.8		{q12}, [bskey, :128]	// last round key
	bitslice	q0, q1, q4, q6, q3, q7, q2, q5, q8, q9, q10, q11
ENDPROC(aesbs_encrypt8)
M0ISR:	.quad		0x0a0e0206070b0f03, 0x0004080c0d010509

	add		bskey, bskey, rounds, lsl #7	// bskey += rounds * 128
	sub		bskey, bskey, #112		// point at the last round key
	vld1.8		{q9}, [bskey, :128]		// round 0 key
	sub		bskey, bskey, #128
	veor		q10, q0, q9		// xor with round0 key
	bitslice	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11
	sub		rounds, rounds, #1

ISR:	.quad		0x0504070602010003, 0x0f0e0d0c080b0a09
ISRM0:	.quad		0x01040b0e0205080f, 0x0306090c00070a0d

	inv_shift_rows	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12
	inv_sbox	q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11, q12, \
	subs		rounds, rounds, #1
	inv_mix_cols	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11, q12, \
	add		bskey, bskey, #112
	vld1.8		{q12}, [bskey, :128]	// last round key
	bitslice	q0, q1, q6, q4, q2, q7, q3, q5, q8, q9, q10, q11
ENDPROC(aesbs_decrypt8)
/*
 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks)
 */
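/*
 * Hedged sketch of how the C glue is expected to call these (the ctx
 * layout and the kernel_neon_begin()/kernel_neon_end() bracketing are
 * assumptions about that glue, not defined in this file):
 *
 *	kernel_neon_begin();
 *	aesbs_ecb_encrypt(dst, src, ctx->rk, ctx->rounds, nr_blocks);
 *	kernel_neon_end();
 *
 * A single call handles all nr_blocks blocks, running the bit-sliced
 * core on up to eight of them per pass.
 */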
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	ldr		r5, [sp, #16]		// number of blocks

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
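/*
 * The 'computed goto' idiom used throughout this file: ip holds the
 * address of the label that ends a straight-line run of per-block
 * instructions, lr holds the remaining block count (mod 8), and
 * subtracting lr scaled by the size of one per-block step (lsl #2 for
 * a single 4 byte instruction, lsl #3 for two, and so on) before the
 * bx makes execution enter the run part-way through, so that only the
 * last 'blocks' steps are executed when fewer than eight blocks are
 * left.
 */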
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_ecb_encrypt)

ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_ecb_decrypt)
/*
 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 iv[])
 */
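/*
 * Only CBC decryption is provided here: decryption parallelizes across
 * eight blocks because each plaintext depends only on ciphertext,
 * P[i] = Dec(rk, C[i]) ^ C[i-1] (with C[-1] = iv), whereas CBC
 * encryption chains every block into the next and cannot be batched
 * this way.
 */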
ENTRY(aesbs_cbc_decrypt)
	ldm		ip, {r5-r6}		// load args 4-5

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	vld1.8		{q8}, [r1]!		// load next round's iv
2:	vst1.8		{q5}, [r0]!

	vst1.8		{q8}, [r6]		// store next round's iv
ENDPROC(aesbs_cbc_decrypt)
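/*
 * The vmov.32 sequence below belongs to the counter-update helper used
 * by aesbs_ctr_encrypt: it transfers the counter, maintained as four
 * 32-bit words in r7-r10, into the lanes of a NEON register (the
 * increment itself is carried out on the ARM registers between blocks).
 */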
	vmov.32		\q\()h[1], r10
	vmov.32		\q\()h[0], r9
	vmov.32		\q\()l[1], r8
	vmov.32		\q\()l[0], r7
/*
 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
 *		     int rounds, int blocks, u8 ctr[], u8 final[])
 */
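/*
 * In CTR mode encryption and decryption are the same operation:
 * out[i] = in[i] ^ Enc(rk, ctr + i). The final[] argument caters for a
 * trailing partial block: when it is non-NULL one extra counter block
 * is processed (note the addne below) and the caller consumes its
 * output for the leftover bytes. A hedged caller-side sketch, with the
 * ctx, 'partial' and 'tail_buf' names invented for illustration:
 *
 *	kernel_neon_begin();
 *	aesbs_ctr_encrypt(dst, src, ctx->rk, ctx->rounds, nr_blocks, ctr,
 *			  partial ? tail_buf : NULL);
 *	kernel_neon_end();
 */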
ENTRY(aesbs_ctr_encrypt)
	ldm		ip, {r5-r7}		// load args 4-6
	addne		r5, r5, #1		// one extra block if final != 0
	vld1.8		{q0}, [r6]		// load counter

	sub		ip, ip, lr, lsl #5
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	ldrle		r4, [sp, #40]		// load final in the last round
	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'
	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8

	teq		r4, #0			// skip last block if 'final'
ENDPROC(aesbs_ctr_encrypt)
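/*
 * next_tweak multiplies the 128-bit XTS tweak by x in GF(2^128),
 * reducing modulo x^128 + x^7 + x^2 + x + 1: vshr.s64 #63 turns the
 * top bit of each 64-bit half into an all-ones mask, vand selects the
 * matching reduction constant from \const (0x87 for the upper half,
 * 0x01 for the lower half, composed from the 0x87 tweak mask vector in
 * __xts_prepare8), vadd.u64 doubles both halves, and the vext/veor
 * pair feeds the carry out of each half into the other half.
 */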
	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
/*
 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[], int reorder_last_tweak)
 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		     int blocks, u8 iv[], int reorder_last_tweak)
 */
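/*
 * In XTS each block is XORed with a per-block tweak both before and
 * after the ECB core, with consecutive tweaks related by the GF(2^128)
 * doubling above. __xts_prepare8 below precomputes up to eight tweaks
 * into the stack buffer addressed by r4 and writes the follow-on tweak
 * back through the iv pointer. The reorder_last_tweak flag lets the
 * glue code swap the order of the final two tweaks when it performs
 * ciphertext stealing.
 */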
	vld1.8		{q14}, [r7]		// load iv
	vmov.i32	d30, #0x87		// compose tweak mask vector
	vshr.u64	d30, d31, #7

	sub		ip, ip, r4, lsl #5
	bxlt		ip			// computed goto if blocks < 8

	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!
	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!
	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]!
	next_tweak	q12, q14, q15, q13
	vst1.8		{q14}, [r4, :128]!
	next_tweak	q14, q12, q15, q13
	vst1.8		{q12}, [r4, :128]

	vst1.8		{q14}, [r7]		// store next iv
ENDPROC(__xts_prepare8)
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	mov		r5, sp			// preserve sp
	ldrd		r6, r7, [sp, #24]	// get blocks and iv args
	ldr		r8, [sp, #32]		// reorder final tweak?
	sub		ip, sp, #128		// make room for 8x tweak
	bic		ip, ip, #0xf		// align sp to 16 bytes

99:	bl		__xts_prepare8

	sub		ip, ip, lr, lsl #2
	bxlt		ip			// computed goto if blocks < 8
	vld1.8		{q8}, [r4, :128]!
	vld1.8		{q9}, [r4, :128]!
	vld1.8		{q10}, [r4, :128]!
	vld1.8		{q11}, [r4, :128]!
	vld1.8		{q12}, [r4, :128]!
	vld1.8		{q13}, [r4, :128]!
	vld1.8		{q14}, [r4, :128]!
	vld1.8		{q15}, [r4, :128]

	sub		ip, ip, lr, lsl #3
	bxlt		ip			// computed goto if blocks < 8
ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, q0, q1, q4, q6, q3, q7, q2, q5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, q0, q1, q6, q4, q2, q7, q3, q5
ENDPROC(aesbs_xts_decrypt)