arch/arm/crypto/crct10dif-ce-core.S

   1 //
   2 // Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
   3 //
   4 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
   5 //
   6 // This program is free software; you can redistribute it and/or modify
   7 // it under the terms of the GNU General Public License version 2 as
   8 // published by the Free Software Foundation.
   9 //
  10
  11 //
  12 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
  13 //
  14 // Copyright (c) 2013, Intel Corporation
  15 //
  16 // Authors:
  17 //     Erdinc Ozturk <erdinc.ozturk@intel.com>
  18 //     Vinodh Gopal <vinodh.gopal@intel.com>
  19 //     James Guilford <james.guilford@intel.com>
  20 //     Tim Chen <tim.c.chen@linux.intel.com>
  21 //
  22 // This software is available to you under a choice of one of two
  23 // licenses.  You may choose to be licensed under the terms of the GNU
  24 // General Public License (GPL) Version 2, available from the file
  25 // COPYING in the main directory of this source tree, or the
  26 // OpenIB.org BSD license below:
  27 //
  28 // Redistribution and use in source and binary forms, with or without
  29 // modification, are permitted provided that the following conditions are
  30 // met:
  31 //
  32 // * Redistributions of source code must retain the above copyright
  33 //   notice, this list of conditions and the following disclaimer.
  34 //
  35 // * Redistributions in binary form must reproduce the above copyright
  36 //   notice, this list of conditions and the following disclaimer in the
  37 //   documentation and/or other materials provided with the
  38 //   distribution.
  39 //
  40 // * Neither the name of the Intel Corporation nor the names of its
  41 //   contributors may be used to endorse or promote products derived from
  42 //   this software without specific prior written permission.
  43 //
  44 //
  45 // THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
  46 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  47 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  48 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
  49 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  50 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  51 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  52 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  53 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  54 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  55 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  56 //
  57 //       Function API (of the original x86 routine; the ARM entry point in this file is crc_t10dif_pmull):
  58 //       UINT16 crc_t10dif_pcl(
  59 //               UINT16 init_crc, //initial CRC value, 16 bits
  60 //               const unsigned char *buf, //buffer pointer to calculate CRC on
  61 //               UINT64 len //buffer length in bytes (64-bit data)
  62 //       );
  63 //
  64 //       Reference paper titled "Fast CRC Computation for Generic
  65 //      Polynomials Using PCLMULQDQ Instruction"
  66 //       URL: http://www.intel.com/content/dam/www/public/us/en/documents
  67 //  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
  68 //
  69 //
  70
  71 #include <linux/linkage.h>
  72 #include <asm/assembler.h>
  73
     // CPU_LE(code...) emits its argument only on little-endian builds: it
     // expands to nothing when CONFIG_CPU_ENDIAN_BE8 is defined. It is used
     // below to wrap the vrev64.8 byte swaps that are applied to data loaded
     // from the input buffer.
  74 #ifdef CONFIG_CPU_ENDIAN_BE8
  75 #define CPU_LE(code...)
  76 #else
  77 #define CPU_LE(code...)         code
  78 #endif
  79
     // Code section, and the FPU selection that lets the assembler accept the
     // NEON / Crypto Extensions instructions used below (e.g. vmull.p64).
  80         .text
  81         .fpu            crypto-neon-fp-armv8
  82
     // Argument registers of crc_t10dif_pmull:
     //   arg1_low32 - initial CRC on entry (shifted into the upper 16 bits by
     //                the function); r0 also carries the return value
     //   arg2       - current position in the input buffer
     //   arg3       - byte counter (biased by the folding loops)
  83         arg1_low32      .req    r0
  84         arg2            .req    r1
  85         arg3            .req    r2
  86
     // q13 is set to all-zero at function entry and used as a zero source for
     // the vext.8 based shifts.
  87         qzr             .req    q13
  88
     // Names for the low (l) and high (h) 64-bit halves of q0-q7, so that the
     // fold64/fold16 macros can derive them from a q register name via
     // \reg\()l and \reg\()h.
  89         q0l             .req    d0
  90         q0h             .req    d1
  91         q1l             .req    d2
  92         q1h             .req    d3
  93         q2l             .req    d4
  94         q2h             .req    d5
  95         q3l             .req    d6
  96         q3h             .req    d7
  97         q4l             .req    d8
  98         q4h             .req    d9
  99         q5l             .req    d10
 100         q5h             .req    d11
 101         q6l             .req    d12
 102         q6h             .req    d13
 103         q7l             .req    d14
 104         q7h             .req    d15
 105
     //
     // crc_t10dif_pmull - CRC-T10DIF of a buffer, folding with vmull.p64
     //
     // In:   r0 (arg1_low32) = initial CRC value, 16 bits
     //       r1 (arg2)       = buffer; the bulk loads carry a :128 alignment
     //                         qualifier, so it is expected to be 16-byte
     //                         aligned
     //       r2 (arg3)       = buffer length in bytes
     // Out:  r0              = updated CRC, 16 bits
     // Clobbers: r1, r2, ip, flags, q0-q13
     //
     // Overview: lengths >= 256 fold 128 bytes per iteration in q0-q7, then
     // collapse the eight vectors into q7; shorter lengths start with a single
     // vector in q7. The remaining data is folded 16 bytes at a time, a
     // partial tail is merged with a table driven byte shift, and the final
     // 128-bit value is reduced to 32 bits (64b fold, 32b fold, Barrett).
     //
     // NOTE(review): the length is compared with signed conditions (blt/bge),
     // so a length with bit 31 set is treated as negative.
     // NOTE(review): q4-q7 (d8-d15) are overwritten without being saved here;
     // presumably the caller preserves NEON state around the call - confirm.
     // NOTE(review): for len < 16 a full 16-byte vector is still loaded from
     // buf, so up to 15 bytes past the end of the data are read (and then
     // discarded).
     //
 106 ENTRY(crc_t10dif_pmull)
 107         vmov.i8         qzr, #0                 // init zero register
 108
 109         // adjust the 16-bit initial_crc value, scale it to 32 bits
 110         lsl             arg1_low32, arg1_low32, #16
 111
 112         // check if smaller than 256
 113         cmp             arg3, #256
 114
 115         // for sizes less than 256, we can't fold 128B at a time...
 116         blt             _less_than_128
 117
 118         // load the initial crc value
 119         // crc value does not need to be byte-reflected, but it needs
 120         // to be moved to the high part of the register.
 121         // because data will be byte-reflected and will align with
 122         // initial crc at correct place.
 123         vmov            s0, arg1_low32          // initial crc
 124         vext.8          q10, qzr, q0, #4        // q10 = crc in its top 32 bits, zero elsewhere
 125
 126         // receive the initial 128B data, xor the initial crc value
 127         vld1.64         {q0-q1}, [arg2, :128]!
 128         vld1.64         {q2-q3}, [arg2, :128]!
 129         vld1.64         {q4-q5}, [arg2, :128]!
 130         vld1.64         {q6-q7}, [arg2, :128]!
 131 CPU_LE( vrev64.8        q0, q0                  )
 132 CPU_LE( vrev64.8        q1, q1                  )
 133 CPU_LE( vrev64.8        q2, q2                  )
 134 CPU_LE( vrev64.8        q3, q3                  )
 135 CPU_LE( vrev64.8        q4, q4                  )
 136 CPU_LE( vrev64.8        q5, q5                  )
 137 CPU_LE( vrev64.8        q6, q6                  )
 138 CPU_LE( vrev64.8        q7, q7                  )
 139
             // swap the 64-bit halves; together with the vrev64.8 above this
             // reverses all 16 bytes of each vector on little-endian builds
 140         vswp            d0, d1
 141         vswp            d2, d3
 142         vswp            d4, d5
 143         vswp            d6, d7
 144         vswp            d8, d9
 145         vswp            d10, d11
 146         vswp            d12, d13
 147         vswp            d14, d15
 148
 149         // XOR the initial_crc value
 150         veor.8          q0, q0, q10
 151
 152         adr             ip, rk3
 153         vld1.64         {q10}, [ip, :128]       // q10 has rk3 and rk4
 154
 155         //
 156         // we subtract 256 instead of 128 to save one instruction from the loop
 157         //
 158         sub             arg3, arg3, #256
 159
 160         // at this section of the code, there is 128*x+y (0<=y<128) bytes of
 161         // buffer. The _fold_64_B_loop will fold 128B at a time
 162         // until we have 128+y Bytes of buffer
 163
 164
 165         // fold 128B at a time. This section of the code folds 8 vector
 166         // registers in parallel (two per fold64 invocation)
 167 _fold_64_B_loop:
 168
             // fold64 reg1, reg2: load the next 32 input bytes into q11/q12,
             // multiply both halves of reg1 and reg2 by the constants in
             // d20/d21 and xor the products with the new data.
             // Clobbers q8, q9, q11, q12; advances arg2 by 32.
 169         .macro          fold64, reg1, reg2
 170         vld1.64         {q11-q12}, [arg2, :128]!
 171
 172         vmull.p64       q8, \reg1\()h, d21
 173         vmull.p64       \reg1, \reg1\()l, d20
 174         vmull.p64       q9, \reg2\()h, d21
 175         vmull.p64       \reg2, \reg2\()l, d20
 176
 177 CPU_LE( vrev64.8        q11, q11                )
 178 CPU_LE( vrev64.8        q12, q12                )
 179         vswp            d22, d23
 180         vswp            d24, d25
 181
 182         veor.8          \reg1, \reg1, q8
 183         veor.8          \reg2, \reg2, q9
 184         veor.8          \reg1, \reg1, q11
 185         veor.8          \reg2, \reg2, q12
 186         .endm
 187
 188         fold64          q0, q1
 189         fold64          q2, q3
 190         fold64          q4, q5
 191         fold64          q6, q7
 192
 193         subs            arg3, arg3, #128
 194
 195         // check if there is another 128B in the buffer to be able to fold
 196         bge             _fold_64_B_loop
 197
 198         // at this point, the buffer pointer is pointing at the last y Bytes
 199         // of the buffer; the 128B of folded data is in 8 of the vector
 200         // registers: q0 - q7
 201
 202         // fold the 8 vector registers to 1 vector register with different
 203         // constants
 204
 205         adr             ip, rk9
 206         vld1.64         {q10}, [ip, :128]!
 207
             // fold16 reg[, rk]: multiply both halves of reg by the constants
             // currently in d20/d21 and xor the products into q7. When the
             // second argument is present, the next pair of constants is
             // loaded from [ip] (post-incremented); its value only names the
             // pair for the reader, so the table must keep rk9..rk20, rk1, rk2
             // contiguous in that order. Clobbers q8 and reg.
 208         .macro          fold16, reg, rk
 209         vmull.p64       q8, \reg\()l, d20
 210         vmull.p64       \reg, \reg\()h, d21
 211         .ifnb           \rk
 212         vld1.64         {q10}, [ip, :128]!
 213         .endif
 214         veor.8          q7, q7, q8
 215         veor.8          q7, q7, \reg
 216         .endm
 217
 218         fold16          q0, rk11
 219         fold16          q1, rk13
 220         fold16          q2, rk15
 221         fold16          q3, rk17
 222         fold16          q4, rk19
 223         fold16          q5, rk1
 224         fold16          q6
 225
 226         // instead of 128, we add 112 (128-16) to the loop counter to save 1
 227         // instruction from the loop; instead of a cmp instruction, we use
 228         // the flags set by adds with the blt instruction
 229         adds            arg3, arg3, #(128-16)
 230         blt             _final_reduction_for_128
 231
 232         // now we have 16+y bytes left to reduce. 16 Bytes is in register q7
 233         // and the rest is in memory. We can fold 16 bytes at a time if y>=16
 234         // continue folding 16B at a time (q10 still holds rk1 and rk2)
 235
 236 _16B_reduction_loop:
 237         vmull.p64       q8, d14, d20
 238         vmull.p64       q7, d15, d21
 239         veor.8          q7, q7, q8
 240
 241         vld1.64         {q0}, [arg2, :128]!
 242 CPU_LE( vrev64.8        q0, q0          )
 243         vswp            d0, d1
 244         veor.8          q7, q7, q0
 245         subs            arg3, arg3, #16
 246
 247         // instead of a cmp instruction, we utilize the flags with the
 248         // bge instruction; equivalent of: cmp arg3, 16-16
 249         // check if there is any more 16B in the buffer to be able to fold
 250         bge             _16B_reduction_loop
 251
 252         // now we have 16+z bytes left to reduce, where 0<= z < 16.
 253         // first, we reduce the data in the q7 register
 254
 255 _final_reduction_for_128:
 256         // check if any more data to fold. If not, compute the CRC of
 257         // the final 128 bits
 258         adds            arg3, arg3, #16
 259         beq             _128_done
 260
 261         // here we are getting data that is less than 16 bytes.
 262         // since we know that there was data before the pointer, we can
 263         // offset the input pointer before the actual point, to receive
 264         // exactly 16 bytes. after that the registers need to be adjusted.
 265 _get_last_two_regs:
 266         add             arg2, arg2, arg3
 267         sub             arg2, arg2, #16
 268         vld1.64         {q1}, [arg2]
 269 CPU_LE( vrev64.8        q1, q1                  )
 270         vswp            d2, d3
 271
 272         // get rid of the extra data that was loaded before
 273         // load the shift constant
 274         adr             ip, tbl_shf_table + 16
 275         sub             ip, ip, arg3
 276         vld1.8          {q0}, [ip]
 277
 278         // q2 = q7 shifted to the left by arg3 bytes
 279         vtbl.8          d4, {d14-d15}, d0
 280         vtbl.8          d5, {d14-d15}, d1
 281
 282         // q9 = q7 shifted to the right by 16-arg3 bytes
 283         vmov.i8         q9, #0x80
 284         veor.8          q0, q0, q9
 285         vtbl.8          d18, {d14-d15}, d0
 286         vtbl.8          d19, {d14-d15}, d1
 287
 288         // blend: take q2 where the mask is set and q1 (new data) elsewhere
 289         vshr.s8         q0, q0, #7              // convert to 8-bit mask
 290         vbsl.8          q0, q2, q1
 291
 292         // fold 16 Bytes
 293         vmull.p64       q8, d18, d20
 294         vmull.p64       q7, d19, d21
 295         veor.8          q7, q7, q8
 296         veor.8          q7, q7, q0
 297
 298 _128_done:
 299         // compute crc of a 128-bit value
 300         vldr            d20, rk5
 301         vldr            d21, rk6                // rk5 and rk6 in q10
 302
 303         // 64b fold
 304         vext.8          q0, qzr, q7, #8
 305         vmull.p64       q7, d15, d20
 306         veor.8          q7, q7, q0
 307
 308         // 32b fold
 309         vext.8          q0, q7, qzr, #12
 310         vmov            s31, s3                 // clear the top 32 bits of q7 (s3 is zero here)
 311         vmull.p64       q0, d0, d21
 312         veor.8          q7, q0, q7
 313
 314         // barrett reduction
             // expects the value to reduce in bits 32..95 of q7 and leaves the
             // 32-bit remainder in bits 32..63 (s29)
 315 _barrett:
 316         vldr            d20, rk7
 317         vldr            d21, rk8
 318
 319         vmull.p64       q0, d15, d20
 320         vext.8          q0, qzr, q0, #12
 321         vmull.p64       q0, d1, d21
 322         vext.8          q0, qzr, q0, #12
 323         veor.8          q7, q7, q0
 324         vmov            r0, s29
 325
 326 _cleanup:
 327         // scale the result back to 16 bits
 328         lsr             r0, r0, #16
 329         bx              lr
 330
             // entered for every length below 256 (the label keeps its
             // historical name)
 331 _less_than_128:
             // zero length: return the initial crc unchanged
 332         teq             arg3, #0
 333         beq             _cleanup
 334
 335         vmov.i8         q0, #0
 336         vmov            s3, arg1_low32          // get the initial crc value
 337
 338         vld1.64         {q7}, [arg2, :128]!
 339 CPU_LE( vrev64.8        q7, q7          )
 340         vswp            d14, d15
 341         veor.8          q7, q7, q0
 342
 343         cmp             arg3, #16
 344         beq             _128_done               // exactly 16 left
 345         blt             _less_than_16_left
 346
 347         // more than 16 bytes: load the folding constants
 348         vldr            d20, rk1
 349         vldr            d21, rk2                // rk1 and rk2 in q10
 350
 351         // check if there is enough buffer to be able to fold 16B at a time
             // below 32 bytes, arg3 becomes the size of the partial tail
             // (len - 16); addlt leaves the flags of the subs untouched
 352         subs            arg3, arg3, #32
 353         addlt           arg3, arg3, #16
 354         blt             _get_last_two_regs
 355         b               _16B_reduction_loop
 356
 357 _less_than_16_left:
             // a single input byte cannot go through the shift below, see
             // _only_less_than_2
             cmp             arg3, #1
             beq             _only_less_than_2
     //
 358         // shift q7 right by 16-arg3 bytes (zero filled), which drops the
             // bytes that were loaded beyond the end of the data
 359         adr             ip, tbl_shf_table + 16
 360         sub             ip, ip, arg3
 361         vld1.8          {q0}, [ip]
 362         vmov.i8         q9, #0x80
 363         veor.8          q0, q0, q9
 364         vtbl.8          d18, {d14-d15}, d0
 365         vtbl.8          d15, {d14-d15}, d1
 366         vmov            d14, d18
 367         b               _128_done
     //
     _only_less_than_2:
             // Exactly one input byte b. The top byte of q7 holds b ^ crc_hi,
             // but the byte below it mixes crc_lo with data from beyond the
             // end of the buffer, and the 15-byte shift used for the other
             // short lengths would drop crc_lo altogether. Build the operand
             // of the Barrett reduction directly instead:
             //   bits 64..71 = b ^ crc_hi, bits 56..63 = crc_lo
             // so that the result is ((b ^ crc_hi) * x^16 mod P) ^ (crc_lo << 8).
             vshr.u64        d15, d15, #56           // d15 = b ^ crc_hi
             lsl             arg1_low32, arg1_low32, #8 // crc_lo in bits 24..31
             mov             ip, #0
             vmov            d14, ip, arg1_low32     // d14 = crc_lo << 56
             b               _barrett
 368 ENDPROC(crc_t10dif_pmull)
 369
 370 // precomputed constants
 371 // these constants are precomputed from the poly:
 372 // 0x8bb70000 (0x8bb7 scaled to 32 bits)
     //
     // Layout constraints imposed by the code above:
     //  - pairs are loaded as one 128-bit vector with a :128 qualified vld1
     //    (rk3/rk4, and rk9/rk10 onwards through a post-incremented pointer),
     //    so the table has to start 16-byte aligned (.align 4 is a power-of-two
     //    alignment on ARM, i.e. 16 bytes) and every pair has to stay adjacent
     //  - the sequence rk9 .. rk20, rk1, rk2 is walked in memory order by the
     //    fold16 macro, which is why rk1/rk2 sit at the end of the table
     //  - rk1, rk2 and rk5 .. rk8 are additionally read one at a time (vldr)
 373         .align          4
 374 // Q = 0x18BB70000
 375 // rk1 = 2^(32*3) mod Q << 32
 376 // rk2 = 2^(32*5) mod Q << 32
 377 // rk3 = 2^(32*15) mod Q << 32
 378 // rk4 = 2^(32*17) mod Q << 32
 379 // rk5 = 2^(32*3) mod Q << 32
 380 // rk6 = 2^(32*2) mod Q << 32
 381 // rk7 = floor(2^64/Q)
 382 // rk8 = Q
 383
 384 rk3:    .quad           0x9d9d000000000000
 385 rk4:    .quad           0x7cf5000000000000
 386 rk5:    .quad           0x2d56000000000000
 387 rk6:    .quad           0x1368000000000000
 388 rk7:    .quad           0x00000001f65a57f8
 389 rk8:    .quad           0x000000018bb70000
 390 rk9:    .quad           0xceae000000000000
 391 rk10:   .quad           0xbfd6000000000000
 392 rk11:   .quad           0x1e16000000000000
 393 rk12:   .quad           0x713c000000000000
 394 rk13:   .quad           0xf7f9000000000000
 395 rk14:   .quad           0x80a6000000000000
 396 rk15:   .quad           0x044c000000000000
 397 rk16:   .quad           0xe658000000000000
 398 rk17:   .quad           0xad18000000000000
 399 rk18:   .quad           0xa497000000000000
 400 rk19:   .quad           0x6ee3000000000000
 401 rk20:   .quad           0xe7b5000000000000
 402 rk1:    .quad           0x2d56000000000000
 403 rk2:    .quad           0x06df000000000000
 404
 405 tbl_shf_table:
 406 // use these values for shift constants for the tbl/tbx instruction
 407 // different alignments result in values as shown:
 408 //      DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
 409 //      DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
 410 //      DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
 411 //      DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
 412 //      DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
 413 //      DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
 414 //      DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
 415 //      DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
 416 //      DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
 417 //      DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
 418 //      DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
 419 //      DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
 420 //      DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
 421 //      DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
 422 //      DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
     //
     // The code reads a 16-byte window starting at tbl_shf_table + 16 - n
     // (n = number of tail bytes) and uses it as the index vector of vtbl.8
     // over {d14-d15}. Indices with bit 7 set are out of range for the
     // two-register table and produce zero bytes, so the window as loaded
     // shifts q7 left by n bytes; xoring the window with 0x80 swaps which
     // entries are in range and shifts q7 right by 16-n bytes instead.
 423
 424         .byte            0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
 425         .byte           0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
 426         .byte            0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
 427         .byte            0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0