/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2; the reference can be found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *          Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
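/*
 * For reference, the folded result computed below is meant to match a plain
 * bit-at-a-time CRC32-LE over the same data with the same starting value.
 * A minimal C sketch of that reference (illustrative only, not part of the
 * kernel sources; the function name is made up):
 *
 *	static unsigned int crc32_le_ref(unsigned int crc,
 *					 const unsigned char *p, size_t len)
 *	{
 *		int i;
 *
 *		while (len--) {
 *			crc ^= *p++;
 *			for (i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
 *		}
 *		return crc;
 *	}
 *
 * The CRC32C variant is identical except for the polynomial:
 * 0x1EDC6F41 (BE) / 0x82F63B78 (LE).
 */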
#include <linux/linkage.h>
#include <asm/assembler.h>

	.section	".rodata", "a"
	.cpu		generic+crypto+crc
.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]' << 1  = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1  = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.octa		0x00000001c6e415960000000154442bd4
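	/*
	 * R2:R1 above are the "fold by 4" coefficients: x^(4*128-32) and
	 * x^(4*128+32) reduced mod P(x), stored as one 128-bit little-endian
	 * value with R1 in the low 64 bits and R2 in the high 64 bits. They
	 * are used by loop_64 below to fold each 128-bit lane into the data
	 * 64 bytes further on.
	 */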
	/*
	 * [(x128+32 mod P(x) << 32)]' << 1  = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]' << 1  = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.octa		0x00000000ccaa009e00000001751997d0
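	/*
	 * R4:R3 (x^(128-32) and x^(128+32) mod P(x), same layout as above)
	 * are the "fold by 1" coefficients: they advance a single 128-bit
	 * value by 16 bytes and are used by less_64 and loop_16 below.
	 */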
	/*
	 * [(x64 mod P(x) << 32)]' << 1  = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF
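	/*
	 * R5 = x^64 mod P(x) is used for the final 32-bit fold; the
	 * 0x00000000ffffffff quadword that follows is a mask used to isolate
	 * the low 32 bits of a lane (it is loaded into v3 for the final
	 * folds).
	 */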
	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.octa		0x00000001F701164100000001DB710641
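	/*
	 * The Barrett pair: the low 64 bits hold P'(x), the bit-reversed CRC32
	 * polynomial including the x^32 term, and the high 64 bits hold
	 * u' = (x^64 / P(x))', also bit-reversed. Together they reduce the
	 * remaining 64-bit value to the final 32-bit CRC without a division.
	 */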
.Lcrc32c_constants:
	.octa		0x000000009e4addf800000000740eef02
	.octa		0x000000014cd00bd600000000f20c0dfe
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.octa		0x00000000dea713f10000000105ec76f0
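	/*
	 * The same five constants as above, recomputed for the CRC32C
	 * (Castagnoli) polynomial 0x1EDC6F41 (BE) / 0x82F63B78 (LE), and laid
	 * out identically so both entry points can index them with the same
	 * offsets from CONST.
	 */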
	/*
	 * Calculate CRC32/CRC32C by 128-bit polynomial folding:
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 *
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 */
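	/*
	 * A hypothetical call from C glue code (illustrative only; the real
	 * callers are not shown in this file):
	 *
	 *	crc = crc32_pmull_le(buf, len, crc);	// len % 16 == 0, len > 63
	 *	crc = crc32c_pmull_le(buf, len, crc);	// same constraints
	 */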
ENTRY(crc32_pmull_le)
	adr_l		x3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr_l		x3, .Lcrc32c_constants
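	/*
	 * Shared body for both entry points: x3 holds the address of the
	 * selected constant table, which the body accesses through CONST.
	 * 64 bytes of input are loaded into v1-v4 and the seed CRC, held in
	 * the low lanes of vCONSTANT, is XORed into the first 16 bytes;
	 * vCONSTANT is then loaded with the fold-by-4 pair R2:R1.
	 */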
0:	ld1		{v1.16b-v4.16b}, [BUF], #0x40
	eor		v1.16b, v1.16b, vCONSTANT.16b

	ldr		qCONSTANT, [CONST]
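	/*
	 * Main loop: each of the four 128-bit accumulators has its high half
	 * carry-less multiplied by R2 and its low half by R1, the two
	 * products are XORed together, and the result is XORed with the next
	 * 64 bytes of input. This folds the CRC state forward by one cache
	 * line per iteration.
	 */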
loop_64:	/* 64 bytes Full cache line folding */

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull2		v6.1q, v2.2d, vCONSTANT.2d
	pmull2		v7.1q, v3.2d, vCONSTANT.2d
	pmull2		v8.1q, v4.2d, vCONSTANT.2d

	pmull		v1.1q, v1.1d, vCONSTANT.1d
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	pmull		v3.1q, v3.1d, vCONSTANT.1d
	pmull		v4.1q, v4.1d, vCONSTANT.1d

	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [BUF], #0x10
	eor		v2.16b, v2.16b, v6.16b
	ld1		{v6.16b}, [BUF], #0x10
	eor		v3.16b, v3.16b, v7.16b
	ld1		{v7.16b}, [BUF], #0x10
	eor		v4.16b, v4.16b, v8.16b
	ld1		{v8.16b}, [BUF], #0x10

	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v8.16b
	/* yield the NEON unit to the scheduler if required, preserving state */
	if_will_cond_yield_neon
	stp		q1, q2, [sp, #.Lframe_local_offset]
	stp		q3, q4, [sp, #.Lframe_local_offset + 32]
	do_cond_yield_neon
	ldp		q1, q2, [sp, #.Lframe_local_offset]
	ldp		q3, q4, [sp, #.Lframe_local_offset + 32]
	ldr		qCONSTANT, [CONST]
	endif_yield_neon
	b		loop_64
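	/*
	 * Fewer than 64 bytes remain: collapse the 512 bits of state in v1-v4
	 * into v1 alone by folding v1 into v2, then v3, then v4, using the
	 * fold-by-1 pair R4:R3 loaded from CONST + 16.
	 */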
less_64:	/* Folding cache line into 128 bits */
	ldr		qCONSTANT, [CONST, #16]

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v3.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v4.16b
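	/*
	 * Any remaining 16-byte blocks are folded into v1 one at a time with
	 * the same R4:R3 pair.
	 */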
loop_16:	/* Folding rest of buffer into 128 bits */

	ld1		{v2.16b}, [BUF], #0x10
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b
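	/*
	 * Final reduction: a last 64-bit fold that appends 32 zero bits (the
	 * x^32 factor in CRC = M(x) * x^32 mod P(x)), a 32-bit fold using
	 * R5 = x^64 mod P(x), and a Barrett reduction down to 32 bits.
	 */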
	/*
	 * Perform the last 64-bit fold; this also appends 32 zero bits to the
	 * input stream.
	 */
	ext		v2.16b, v1.16b, v1.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	ext		v1.16b, v1.16b, vzr.16b, #8
	eor		v1.16b, v1.16b, v2.16b
	/* final 32-bit fold */
	ldr		dCONSTANT, [CONST, #32]
	ldr		d3, [CONST, #40]

	ext		v2.16b, v1.16b, vzr.16b, #4
	and		v1.16b, v1.16b, v3.16b
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
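	/*
	 * The low 32 bits of v1 are isolated with the mask in v3, carry-less
	 * multiplied by R5 and XORed with the rest of the remainder (v1
	 * shifted down by 32 bits in v2), leaving a 64-bit value.
	 */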
	/* Finish up with the bit-reversed Barrett reduction, 64 -> 32 bits */
	ldr		qCONSTANT, [CONST, #48]

	and		v2.16b, v1.16b, v3.16b
	ext		v2.16b, vzr.16b, v2.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	and		v2.16b, v2.16b, v3.16b
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
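	/*
	 * Bit-reflected Barrett reduction: T1 = (v1 mod x^32) * u', then
	 * T2 = (T1 mod x^32) * P', and v1 ^= T2. The low 32 bits cancel and
	 * the 32-bit CRC ends up in bits [63:32] of v1 (lane v1.s[1]).
	 */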
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
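/*
 * The remaining code uses the ARMv8 CRC32 extension instructions rather than
 * polynomial multiplication. The macro below consumes 16 bytes per iteration
 * with a pair of 64-bit CRC instructions and handles any 8/4/2/1-byte tail
 * with the narrower forms; the \c argument selects between the crc32 and
 * crc32c flavours of the instructions. CPU_BE byte-reverses loaded values
 * first, since the CRC instructions treat the register contents as
 * little-endian data.
 */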
	.macro		__crc32, c
	ldp		x3, x4, [x1], #16
	crc32\c\()x	w0, w0, x3
	crc32\c\()x	w0, w0, x4

	crc32\c\()x	w0, w0, x3

	crc32\c\()w	w0, w0, w3

CPU_BE(	rev16		w3, w3		)
	crc32\c\()h	w0, w0, w3

	crc32\c\()b	w0, w0, w3
	.endm
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)

ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)
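/*
 * Unlike crc32_pmull_le/crc32c_pmull_le above, which expect a length that is
 * a multiple of 16 bytes and larger than 63, these CRC-instruction based
 * routines handle buffers of any length, so callers can presumably fall back
 * to them for short or odd-sized inputs (an assumption about the glue code,
 * which is not part of this file).
 */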