arch/arm/crypto/crc32-ce-core.S

   1 /*
   2  * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
   3  *
   4  * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 2 as
   8  * published by the Free Software Foundation.
   9  */
  10
  11 /* GPL HEADER START
  12  *
  13  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  14  *
  15  * This program is free software; you can redistribute it and/or modify
  16  * it under the terms of the GNU General Public License version 2 only,
  17  * as published by the Free Software Foundation.
  18  *
  19  * This program is distributed in the hope that it will be useful, but
  20  * WITHOUT ANY WARRANTY; without even the implied warranty of
  21  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22  * General Public License version 2 for more details (a copy is included
  23  * in the LICENSE file that accompanied this code).
  24  *
  25  * You should have received a copy of the GNU General Public License
  26  * version 2 along with this program; If not, see http://www.gnu.org/licenses
  27  *
  28  * Please  visit http://www.xyratex.com/contact if you need additional
  29  * information or have any questions.
  30  *
  31  * GPL HEADER END
  32  */
  33
  34 /*
  35  * Copyright 2012 Xyratex Technology Limited
  36  *
  37  * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
  38  * calculation.
  39  * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
  40  * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
  41  * at:
  42  * http://www.intel.com/products/processor/manuals/
  43  * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
  44  * Volume 2B: Instruction Set Reference, N-Z
  45  *
  46  * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
  47  *            Alexander Boyko <Alexander_Boyko@xyratex.com>
  48  */
  49
  50 #include <linux/linkage.h>
  51 #include <asm/assembler.h>
  52
  53         .text
  54         .align          6               /* ARM .align takes a power of two: 64-byte boundary */
  55         .arch           armv8-a
  56         .arch_extension crc             /* enables the crc32 and crc32c instructions used by __crc32 */
  57         .fpu            crypto-neon-fp-armv8    /* enables NEON and vmull.p64 */
  58
  59 .Lcrc32_constants:      /* table addressed through r3 by crc32_pmull_le */
  60         /*
  61          * [(x4*128+32 mod P(x) << 32)]' << 1   = 0x154442bd4
  62          * #define CONSTANT_R1  0x154442bd4LL
  63          *
  64          * [(x4*128-32 mod P(x) << 32)]' << 1   = 0x1c6e41596
  65          * #define CONSTANT_R2  0x1c6e41596LL
  66          */
  67         .quad           0x0000000154442bd4      /* +0:  R1, d0 during loop_64 */
  68         .quad           0x00000001c6e41596      /* +8:  R2, d1 during loop_64 */
  69
  70         /*
  71          * [(x128+32 mod P(x) << 32)]'   << 1   = 0x1751997d0
  72          * #define CONSTANT_R3  0x1751997d0LL
  73          *
  74          * [(x128-32 mod P(x) << 32)]'   << 1   = 0x0ccaa009e
  75          * #define CONSTANT_R4  0x0ccaa009eLL
  76          */
  77         .quad           0x00000001751997d0      /* +16: R3, d0 from less_64 onwards */
  78         .quad           0x00000000ccaa009e      /* +24: R4, d1 from less_64 through fold_64 */
  79
  80         /*
  81          * [(x64 mod P(x) << 32)]'       << 1   = 0x163cd6124
  82          * #define CONSTANT_R5  0x163cd6124LL
  83          */
  84         .quad           0x0000000163cd6124      /* +32: R5, final 32-bit fold */
  85         .quad           0x00000000FFFFFFFF      /* +40: mask keeping the low 32 bits */
  86
  87         /*
  88          * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
  89          *
  90          * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
  91          *                                                      = 0x1F7011641LL
  92          * #define CONSTANT_RU  0x1F7011641LL
  93          */
  94         .quad           0x00000001DB710641      /* +48: CRCPOLY_TRUE_LE_FULL, Barrett reduction */
  95         .quad           0x00000001F7011641      /* +56: RU, Barrett reduction */
  96
  97 .Lcrc32c_constants:     /* table addressed through r3 by crc32c_pmull_le */
  98         .quad           0x00000000740eef02      /* +0:  read into d0 for loop_64 */
  99         .quad           0x000000009e4addf8      /* +8:  read into d1 for loop_64 */
 100         .quad           0x00000000f20c0dfe      /* +16: d0 from less_64 onwards */
 101         .quad           0x000000014cd00bd6      /* +24: d1 from less_64 through fold_64 */
 102         .quad           0x00000000dd45aab8      /* +32: final 32-bit fold */
 103         .quad           0x00000000FFFFFFFF      /* +40: mask keeping the low 32 bits */
 104         .quad           0x0000000105ec76f0      /* +48: Barrett reduction, multiplied second */
 105         .quad           0x00000000dea713f1      /* +56: Barrett reduction, multiplied first */
 106
 107         dCONSTANTl      .req    d0      /* low 64 bits of qCONSTANT */
 108         dCONSTANTh      .req    d1      /* high 64 bits of qCONSTANT */
 109         qCONSTANT       .req    q0      /* current pair of folding constants */
 110
 111         BUF             .req    r0      /* data pointer, post-incremented by every load */
 112         LEN             .req    r1      /* bytes still to be consumed */
 113         CRC             .req    r2      /* initial CRC value */
 114
 115         qzr             .req    q9      /* all-zero vector, cleared at function entry */
 116
 117         /**
 118          * Calculate crc32 (crc32c when entered via crc32c_pmull_le)
 119          * BUF (r0) - buffer; loads use :128 hints, so it must be 16-byte aligned
 120          * LEN (r1) - sizeof buffer, rounded down to a multiple of 16; must be > 63
 121          * CRC (r2) - initial crc32
 122          * Returns the crc32 in r0. Clobbers r1, r3, flags and q0-q9 (q4-q7 are
 123          * callee-saved under AAPCS; presumably callers save NEON state - confirm).
 124          * uint crc32_pmull_le(unsigned char const *buffer, size_t len, uint crc32)
 125          */
 126 ENTRY(crc32_pmull_le)
 127         adr             r3, .Lcrc32_constants   /* r3 = CRC32 constant table */
 128         b               0f
 129
 130 ENTRY(crc32c_pmull_le)
 131         adr             r3, .Lcrc32c_constants  /* r3 = CRC32C constant table */
 132
 133 0:      bic             LEN, LEN, #15           /* shared body: round LEN down to 16 */
 134         vld1.8          {q1-q2}, [BUF, :128]!   /* first 64 bytes go to q1..q4 */
 135         vld1.8          {q3-q4}, [BUF, :128]!
 136         vmov.i8         qzr, #0
 137         vmov.i8         qCONSTANT, #0
 138         vmov.32         dCONSTANTl[0], CRC      /* d0 = CRC in lane 0, zero above */
 139         veor.8          d2, d2, dCONSTANTl      /* xor CRC into the low 32 bits of q1 */
 140         sub             LEN, LEN, #0x40         /* account for the 64 bytes just loaded */
 141         cmp             LEN, #0x40
 142         blt             less_64                 /* under 64 bytes left: skip the 4-way loop */
 143
 144         vld1.64         {qCONSTANT}, [r3]       /* d0 = R1 slot, d1 = R2 slot */
 145
 146 loop_64:                /* 64 bytes Full cache line folding */
 147         sub             LEN, LEN, #0x40
 148
 149         vmull.p64       q5, d3, dCONSTANTh      /* high halves of q1..q4 times d1 */
 150         vmull.p64       q6, d5, dCONSTANTh
 151         vmull.p64       q7, d7, dCONSTANTh
 152         vmull.p64       q8, d9, dCONSTANTh
 153
 154         vmull.p64       q1, d2, dCONSTANTl      /* low halves of q1..q4 times d0 */
 155         vmull.p64       q2, d4, dCONSTANTl
 156         vmull.p64       q3, d6, dCONSTANTl
 157         vmull.p64       q4, d8, dCONSTANTl
 158
 159         veor.8          q1, q1, q5              /* combine each product pair ... */
 160         vld1.8          {q5}, [BUF, :128]!      /* ... while loading the next 64 bytes */
 161         veor.8          q2, q2, q6
 162         vld1.8          {q6}, [BUF, :128]!
 163         veor.8          q3, q3, q7
 164         vld1.8          {q7}, [BUF, :128]!
 165         veor.8          q4, q4, q8
 166         vld1.8          {q8}, [BUF, :128]!
 167
 168         veor.8          q1, q1, q5              /* xor the new data into the folded state */
 169         veor.8          q2, q2, q6
 170         veor.8          q3, q3, q7
 171         veor.8          q4, q4, q8
 172
 173         cmp             LEN, #0x40
 174         bge             loop_64                 /* another full 64 bytes available */
 175
 176 less_64:                /* Folding cache line into 128bit */
 177         vldr            dCONSTANTl, [r3, #16]   /* d0 = R3 slot */
 178         vldr            dCONSTANTh, [r3, #24]   /* d1 = R4 slot */
 179
 180         vmull.p64       q5, d3, dCONSTANTh      /* q1 = fold(q1) ^ q2 */
 181         vmull.p64       q1, d2, dCONSTANTl
 182         veor.8          q1, q1, q5
 183         veor.8          q1, q1, q2
 184
 185         vmull.p64       q5, d3, dCONSTANTh      /* q1 = fold(q1) ^ q3 */
 186         vmull.p64       q1, d2, dCONSTANTl
 187         veor.8          q1, q1, q5
 188         veor.8          q1, q1, q3
 189
 190         vmull.p64       q5, d3, dCONSTANTh      /* q1 = fold(q1) ^ q4 */
 191         vmull.p64       q1, d2, dCONSTANTl
 192         veor.8          q1, q1, q5
 193         veor.8          q1, q1, q4
 194
 195         teq             LEN, #0
 196         beq             fold_64                 /* no 16-byte blocks left */
 197
 198 loop_16:                /* Folding rest buffer into 128bit */
 199         subs            LEN, LEN, #0x10         /* flags are tested by the bne at the loop end */
 200
 201         vld1.8          {q2}, [BUF, :128]!
 202         vmull.p64       q5, d3, dCONSTANTh      /* q1 = fold(q1) ^ next 16 bytes */
 203         vmull.p64       q1, d2, dCONSTANTl
 204         veor.8          q1, q1, q5
 205         veor.8          q1, q1, q2
 206
 207         bne             loop_16
 208
 209 fold_64:
 210         /* perform the last 64 bit fold, also adds 32 zeroes
 211          * to the input stream */
 212         vmull.p64       q2, d2, dCONSTANTh      /* low 64 bits of q1 times the R4 slot */
 213         vext.8          q1, q1, qzr, #8         /* q1 = its own high 64 bits, zero above */
 214         veor.8          q1, q1, q2
 215
 216         /* final 32-bit fold */
 217         vldr            dCONSTANTl, [r3, #32]   /* d0 = R5 slot */
 218         vldr            d6, [r3, #40]           /* d6 = 0xFFFFFFFF mask */
 219         vmov.i8         d7, #0                  /* q3 = mask in the low half, zero above */
 220
 221         vext.8          q2, q1, qzr, #4         /* q2 = q1 shifted down by 32 bits */
 222         vand.8          d2, d2, d6              /* keep only the low 32 bits in d2 */
 223         vmull.p64       q1, d2, dCONSTANTl
 224         veor.8          q1, q1, q2
 225
 226         /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
 227         vldr            dCONSTANTl, [r3, #48]   /* d0 = table entry at offset 48 */
 228         vldr            dCONSTANTh, [r3, #56]   /* d1 = table entry at offset 56 */
 229
 230         vand.8          q2, q1, q3              /* q2 = low 32 bits of q1 */
 231         vext.8          q2, qzr, q2, #8         /* move them into the high half (d5) */
 232         vmull.p64       q2, d5, dCONSTANTh
 233         vand.8          q2, q2, q3              /* keep the low 32 bits of the product */
 234         vmull.p64       q2, d4, dCONSTANTl
 235         veor.8          q1, q1, q2
 236         vmov            r0, s5                  /* result = bits 32..63 of q1 */
 237
 238         bx              lr
 239 ENDPROC(crc32_pmull_le)
 240 ENDPROC(crc32c_pmull_le)
 241
 242         .macro          __crc32, c      /* in: r0 = crc, r1 = data, r2 = length; out: r0 */
 243         subs            ip, r2, #8      /* ip = length - 8 */
 244         bmi             .Ltail\c        /* fewer than 8 bytes in total */
 245
 246         tst             r1, #3
 247         bne             .Lunaligned\c   /* r1 not 4-byte aligned: consume 1-3 bytes first */
 248
 249         teq             ip, #0          /* tst changed Z; bxeq below needs Z = (ip == 0) */
 250 .Laligned8\c:
 251         ldrd            r2, r3, [r1], #8        /* next 8 bytes as two words */
 252 ARM_BE8(rev             r2, r2          )
 253 ARM_BE8(rev             r3, r3          )
 254         crc32\c\()w     r0, r0, r2
 255         crc32\c\()w     r0, r0, r3
 256         bxeq            lr              /* ip was 0: those were exactly the last 8 bytes */
 257         subs            ip, ip, #8
 258         bpl             .Laligned8\c    /* at least 8 more bytes remain */
 259
 260 .Ltail\c:               /* ip = bytes left - 8, so its low 3 bits give the 0..7 bytes left */
 261         tst             ip, #4
 262         beq             2f
 263         ldr             r3, [r1], #4
 264 ARM_BE8(rev             r3, r3          )
 265         crc32\c\()w     r0, r0, r3
 266
 267 2:      tst             ip, #2
 268         beq             1f
 269         ldrh            r3, [r1], #2
 270 ARM_BE8(rev16           r3, r3          )
 271         crc32\c\()h     r0, r0, r3
 272
 273 1:      tst             ip, #1
 274         bxeq            lr              /* no odd byte left */
 275         ldrb            r3, [r1]
 276         crc32\c\()b     r0, r0, r3
 277         bx              lr
 278
 279 .Lunaligned\c:          /* only reached with length >= 8, so r2 cannot go negative here */
 280         tst             r1, #1
 281         beq             2f              /* even address: only the halfword step is needed */
 282         ldrb            r3, [r1], #1
 283         subs            r2, r2, #1
 284         crc32\c\()b     r0, r0, r3
 285
 286         tst             r1, #2
 287         beq             0f              /* already 4-byte aligned after the single byte */
 288 2:      ldrh            r3, [r1], #2
 289         subs            r2, r2, #2
 290 ARM_BE8(rev16           r3, r3          )
 291         crc32\c\()h     r0, r0, r3
 292
 293 0:      subs            ip, r2, #8      /* recompute ip; Z and N feed bpl and the later bxeq */
 294         bpl             .Laligned8\c
 295         b               .Ltail\c
 296         .endm
 297
 298         .align          5
 299 ENTRY(crc32_armv8_le)   /* crc32b/h/w variant: r0 = crc, r1 = data, r2 = length; result in r0 */
 300         __crc32
 301 ENDPROC(crc32_armv8_le)
 302
 303         .align          5
 304 ENTRY(crc32c_armv8_le)  /* crc32cb/h/w variant: r0 = crc, r1 = data, r2 = length; result in r0 */
 305         __crc32         c
 306 ENDPROC(crc32c_armv8_le)