/*
 * Accelerated CRC32(C) using arm64 CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
#include <linux/linkage.h>
#include <asm/assembler.h>

	.section	".rodata", "a"
	.align		6
	.cpu		generic+crypto+crc
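/*
 * The fold constants below follow the scheme in Intel's "Fast CRC
 * Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 * white paper: each value is x^N mod P(x) for the fold distance N in
 * bits, bit-reflected (the ' notation) and shifted left by one so the
 * reflected pmull products line up correctly.
 */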
.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]' << 1	= 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1	= 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.octa		0x00000001c6e415960000000154442bd4
	/*
	 * [(x128+32 mod P(x) << 32)]' << 1	= 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]' << 1	= 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.octa		0x00000000ccaa009e00000001751997d0
	/*
	 * [(x64 mod P(x) << 32)]' << 1		= 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF
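	/*
	 * The all-ones quadword above is not a fold constant: it is
	 * loaded into d3 later and used to mask values down to 32
	 * bits before the final fold and the Barrett reduction.
	 */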
	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 *
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.octa		0x00000001F701164100000001DB710641
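/*
 * The CRC32C (Castagnoli, polynomial 0x1EDC6F41) constants below use
 * the same five-value layout, so both entry points index their table
 * with identical offsets from x3.
 */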
.Lcrc32c_constants:
	.octa		0x000000009e4addf800000000740eef02
	.octa		0x000000014cd00bd600000000f20c0dfe
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.octa		0x00000000dea713f10000000105ec76f0
	vCONSTANT	.req	v0
	dCONSTANT	.req	d0
	qCONSTANT	.req	q0

	BUF		.req	x0
	LEN		.req	x1
	CRC		.req	x2

	vzr		.req	v9

	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return crc32 in w0
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *			size_t len, uint crc32)
	 */
	.text
ENTRY(crc32_pmull_le)
	adr_l		x3, .Lcrc32_constants
	b		0f

ENTRY(crc32c_pmull_le)
	adr_l		x3, .Lcrc32c_constants
0:	bic		LEN, LEN, #15
	ld1		{v1.16b-v4.16b}, [BUF], #0x40
	movi		vzr.16b, #0
	fmov		dCONSTANT, CRC
	eor		v1.16b, v1.16b, vCONSTANT.16b
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	b.lt		less_64

	ldr		qCONSTANT, [x3]
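	/*
	 * The seed CRC was moved into d0 and XORed into the first 16
	 * data bytes above; from here on v0 holds the R1/R2 fold
	 * constants instead. The loop below keeps four independent
	 * 128-bit accumulators in flight to hide the pmull latency.
	 */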
loop_64:	/* fold 64 bytes (a full cache line) per iteration */
	sub		LEN, LEN, #0x40
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull2		v6.1q, v2.2d, vCONSTANT.2d
	pmull2		v7.1q, v3.2d, vCONSTANT.2d
	pmull2		v8.1q, v4.2d, vCONSTANT.2d

	pmull		v1.1q, v1.1d, vCONSTANT.1d
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	pmull		v3.1q, v3.1d, vCONSTANT.1d
	pmull		v4.1q, v4.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	ld1		{v5.16b}, [BUF], #0x10
	eor		v2.16b, v2.16b, v6.16b
	ld1		{v6.16b}, [BUF], #0x10
	eor		v3.16b, v3.16b, v7.16b
	ld1		{v7.16b}, [BUF], #0x10
	eor		v4.16b, v4.16b, v8.16b
	ld1		{v8.16b}, [BUF], #0x10

	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b
	eor		v4.16b, v4.16b, v8.16b

	cmp		LEN, #0x40
	b.ge		loop_64
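	/* the ld1 loads are interleaved with the eors above so the
	 * loads of the next 64 bytes start as early as possible */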
less_64:	/* fold the four accumulators into one 128-bit value */
	ldr		qCONSTANT, [x3, #16]
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v3.16b

	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v4.16b

	cbz		LEN, fold_64
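	/*
	 * The three fold steps above merged v2, v3 and v4 into v1
	 * using the x^128 constants (R3/R4); any 16-byte blocks left
	 * in the buffer are folded in one at a time below.
	 */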
loop_16:	/* fold the remaining 16-byte blocks into the 128-bit value */
	subs		LEN, LEN, #0x10

	ld1		{v2.16b}, [BUF], #0x10
	pmull2		v5.1q, v1.2d, vCONSTANT.2d
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v5.16b
	eor		v1.16b, v1.16b, v2.16b

	b.ne		loop_16

fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	ext		v2.16b, v1.16b, v1.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	ext		v1.16b, v1.16b, vzr.16b, #8
	eor		v1.16b, v1.16b, v2.16b
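	/*
	 * The first ext above swapped the 64-bit halves of v1 so
	 * pmull2 could multiply the original low half by R4, while
	 * the second ext shifted the high half down, padding with
	 * zeroes from vzr before the eor recombined the two.
	 */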
	/* final 32-bit fold */
	ldr		dCONSTANT, [x3, #32]
	ldr		d3, [x3, #40]
	ext		v2.16b, v1.16b, vzr.16b, #4
	and		v1.16b, v1.16b, v3.16b
	pmull		v1.1q, v1.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
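	/*
	 * ext shifted v1 right by 32 bits while the masked low word
	 * was multiplied by R5 (x^64 mod P(x)); after the eor a
	 * 64-bit value remains, ready for Barrett reduction.
	 */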
	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
	ldr		qCONSTANT, [x3, #48]
	and		v2.16b, v1.16b, v3.16b
	ext		v2.16b, vzr.16b, v2.16b, #8
	pmull2		v2.1q, v2.2d, vCONSTANT.2d
	and		v2.16b, v2.16b, v3.16b
	pmull		v2.1q, v2.1d, vCONSTANT.1d
	eor		v1.16b, v1.16b, v2.16b
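	/*
	 * The two multiplies above implement the reflected Barrett
	 * scheme: the masked low word is scaled by u = (x^64 / P(x)),
	 * masked again and multiplied by P(x); the eor leaves the
	 * 32-bit remainder in lane 1 of v1.
	 */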
	mov		w0, v1.s[1]

	ret
ENDPROC(crc32_pmull_le)
ENDPROC(crc32c_pmull_le)
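/*
 * For reference, a minimal sketch of how C code might drive the two
 * paths in this file. The asm symbol names match this file, but
 * crc32_update() and its threshold handling are illustrative
 * assumptions, not the actual kernel glue code:
 *
 *	asmlinkage u32 crc32_pmull_le(const u8 *buf, size_t len, u32 crc);
 *	asmlinkage u32 crc32_armv8_le(u32 crc, const u8 *buf, size_t len);
 *
 *	static u32 crc32_update(u32 crc, const u8 *data, size_t len)
 *	{
 *		if (len > 63 && may_use_simd()) {
 *			size_t fold = len & ~15UL;  // PMULL path wants 16-byte multiples
 *
 *			kernel_neon_begin();
 *			crc = crc32_pmull_le(data, fold, crc);
 *			kernel_neon_end();
 *			data += fold;
 *			len -= fold;
 *		}
 *		// scalar CRC32 instructions consume any remaining tail
 *		return crc32_armv8_le(crc, data, len);
 *	}
 */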
	.macro		__crc32, c
0:	subs		x2, x2, #16
	b.mi		8f
	ldp		x3, x4, [x1], #16
CPU_BE(	rev		x3, x3		)
CPU_BE(	rev		x4, x4		)
	crc32\c\()x	w0, w0, x3
	crc32\c\()x	w0, w0, x4
	b		0b

8:	tbz		x2, #3, 4f
	ldr		x3, [x1], #8
CPU_BE(	rev		x3, x3		)
	crc32\c\()x	w0, w0, x3
4:	tbz		x2, #2, 2f
	ldr		w3, [x1], #4
CPU_BE(	rev		w3, w3		)
	crc32\c\()w	w0, w0, w3
2:	tbz		x2, #1, 1f
	ldrh		w3, [x1], #2
CPU_BE(	rev16		w3, w3		)
	crc32\c\()h	w0, w0, w3
1:	tbz		x2, #0, 0f
	ldrb		w3, [x1]
	crc32\c\()b	w0, w0, w3
0:	ret
	.endm
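/*
 * Note on the tail handling in __crc32: the final subs leaves
 * (len mod 16) in the low four bits of x2 even once x2 has gone
 * negative, so the 8/4/2/1 tbz ladder tests exactly one tail-size
 * bit per step and consumes any 0-15 byte remainder without a loop.
 */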
	.align		5
ENTRY(crc32_armv8_le)
	__crc32
ENDPROC(crc32_armv8_le)
	.align		5
ENTRY(crc32c_armv8_le)
	__crc32		c
ENDPROC(crc32c_armv8_le)