2 // Accelerated CRC-T10DIF using ARM NEON and Crypto Extensions instructions
4 // Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
6 // This program is free software; you can redistribute it and/or modify
7 // it under the terms of the GNU General Public License version 2 as
8 // published by the Free Software Foundation.
12 // Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
14 // Copyright (c) 2013, Intel Corporation
17 // Erdinc Ozturk <erdinc.ozturk@intel.com>
18 // Vinodh Gopal <vinodh.gopal@intel.com>
19 // James Guilford <james.guilford@intel.com>
20 // Tim Chen <tim.c.chen@linux.intel.com>
22 // This software is available to you under a choice of one of two
23 // licenses. You may choose to be licensed under the terms of the GNU
24 // General Public License (GPL) Version 2, available from the file
25 // COPYING in the main directory of this source tree, or the
26 // OpenIB.org BSD license below:
28 // Redistribution and use in source and binary forms, with or without
29 // modification, are permitted provided that the following conditions are
32 // * Redistributions of source code must retain the above copyright
33 // notice, this list of conditions and the following disclaimer.
35 // * Redistributions in binary form must reproduce the above copyright
36 // notice, this list of conditions and the following disclaimer in the
37 // documentation and/or other materials provided with the
40 // * Neither the name of the Intel Corporation nor the names of its
41 // contributors may be used to endorse or promote products derived from
42 // this software without specific prior written permission.
45 // THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY
46 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
49 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
50 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
51 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
52 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
53 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
54 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
55 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
58 // UINT16 crc_t10dif_pcl(
59 // UINT16 init_crc, //initial CRC value, 16 bits
60 // const unsigned char *buf, //buffer pointer to calculate CRC on
61 // UINT64 len //buffer length in bytes (64-bit data)
64 // Reference paper titled "Fast CRC Computation for Generic
65 // Polynomials Using PCLMULQDQ Instruction"
66 // URL: http://www.intel.com/content/dam/www/public/us/en/documents
67 // /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
71 #include <linux/linkage.h>
72 #include <asm/assembler.h>
// CPU_LE(code...) wraps instructions that are only wanted on little-endian
// builds: when CONFIG_CPU_ENDIAN_BE8 is defined it expands to nothing, the
// other definition expands to its argument unchanged.
// NOTE(review): the #else/#endif that separate the two definitions are not
// part of this excerpt.
74 #ifdef CONFIG_CPU_ENDIAN_BE8
75 #define CPU_LE(code...)
77 #define CPU_LE(code...) code
// Tell the assembler to accept ARMv8 NEON + Crypto Extensions instructions;
// the vmull.p64 instructions used below depend on it.
81 .fpu crypto-neon-fp-armv8
// crc_t10dif_pmull: CRC-T10DIF of a byte buffer, computed by folding the
// data with vmull.p64 (64x64 -> 128 bit polynomial multiply).
//
// Inputs, as used by the code below (the register aliases themselves are
// defined outside this excerpt):
//   arg1_low32 - initial 16-bit CRC; shifted left by 16 on entry
//   arg2       - pointer to the data; the loads shown here carry the :128
//                alignment qualifier, so the pointer must be 16-byte aligned
//   arg3       - number of bytes still to be processed
// qzr is a q register that is cleared on entry; ip is used as a pointer
// into the rk constant table and into the shuffle table.
//
// NOTE(review): this excerpt omits lines (labels, branches and some
// instructions), so the comments only describe the lines that are shown.
// Part of the commentary was inherited from the x86 PCLMULQDQ version.
106 ENTRY(crc_t10dif_pmull)
107 vmov.i8 qzr, #0 // init zero register
109 // adjust the 16-bit initial_crc value, scale it to 32 bits
110 lsl arg1_low32, arg1_low32, #16
112 // check if smaller than 256
115 // for sizes less than 128, we can't fold 64B at a time...
118 // load the initial crc value
119 // crc value does not need to be byte-reflected, but it needs
120 // to be moved to the high part of the register.
121 // because data will be byte-reflected and will align with
122 // initial crc at correct place.
123 vmov s0, arg1_low32 // initial crc
124 vext.8 q10, qzr, q0, #4 // q10 = crc in the top 32 bits, zero below
126 // load the initial 8 x 16 = 128 bytes of data into q0-q7, xor the
// initial crc value
127 vld1.64 {q0-q1}, [arg2, :128]!
128 vld1.64 {q2-q3}, [arg2, :128]!
129 vld1.64 {q4-q5}, [arg2, :128]!
130 vld1.64 {q6-q7}, [arg2, :128]!
// little-endian only: reverse the bytes inside each 64-bit half
131 CPU_LE( vrev64.8 q0, q0 )
132 CPU_LE( vrev64.8 q1, q1 )
133 CPU_LE( vrev64.8 q2, q2 )
134 CPU_LE( vrev64.8 q3, q3 )
135 CPU_LE( vrev64.8 q4, q4 )
136 CPU_LE( vrev64.8 q5, q5 )
137 CPU_LE( vrev64.8 q6, q6 )
138 CPU_LE( vrev64.8 q7, q7 )
149 // XOR the initial_crc value
153 vld1.64 {q10}, [ip, :128] // q10 has rk3 and rk4
156 // we subtract 256 instead of 128 to save one instruction from the loop
160 // at this section of the code, there is 64*x+y (0<=y<64) bytes of
161 // buffer. The _fold_64_B_loop will fold 64B at a time
162 // until we have 64+y Bytes of buffer
// NOTE(review): the 64B figures above look inherited from the x86 code;
// the length counter below is decremented by 128 per pass.
165 // fold 64B at a time. This section of the code folds 4 vector
166 // registers in parallel
// fold64 reg1, reg2: fold two accumulator registers over the next 32
// bytes of input. As far as the lines shown here go:
//   reg = (reg.h * d21) ^ (reg.l * d20) ^ newly loaded data
// where * is the carry-less vmull.p64 and d20/d21 are the halves of q10.
// Clobbers q8, q9, q11, q12 and advances arg2 by 32.
// NOTE(review): .endm and possibly further lines are outside this excerpt.
169 .macro fold64, reg1, reg2
170 vld1.64 {q11-q12}, [arg2, :128]!
172 vmull.p64 q8, \reg1\()h, d21
173 vmull.p64 \reg1, \reg1\()l, d20
174 vmull.p64 q9, \reg2\()h, d21
175 vmull.p64 \reg2, \reg2\()l, d20
177 CPU_LE( vrev64.8 q11, q11 )
178 CPU_LE( vrev64.8 q12, q12 )
182 veor.8 \reg1, \reg1, q8
183 veor.8 \reg2, \reg2, q9
184 veor.8 \reg1, \reg1, q11
185 veor.8 \reg2, \reg2, q12
193 subs arg3, arg3, #128 // arg3 -= 128, flags updated for the loop test
195 // check if there is another 64B in the buffer to be able to fold
198 // at this point, the buffer pointer is pointing at the last y Bytes
199 // of the buffer; the 128B of folded data is in 8 of the vector
200 // registers: q0-q7
202 // fold the 8 vector registers to 1 vector register with different
206 vld1.64 {q10}, [ip, :128]! // q10 = next pair of constants, ip += 16
// fold16 reg, rk: multiply the low half of \reg by d20 (result in q8) and
// the high half by d21 (result back in \reg), then fetch the next pair
// of constants into q10.
// NOTE(review): the rest of the macro (use of \rk, the xor into the
// accumulator, .endm) is outside this excerpt.
208 .macro fold16, reg, rk
209 vmull.p64 q8, \reg\()l, d20
210 vmull.p64 \reg, \reg\()h, d21
212 vld1.64 {q10}, [ip, :128]!
226 // instead of 128, we add 112 (128-16) to the loop counter to save 1
227 // instruction from the loop; instead of a cmp instruction, we use the
228 // negative flag with the blt instruction
229 adds arg3, arg3, #(128-16)
230 blt _final_reduction_for_128
232 // now we have 16+y bytes left to reduce. 16 Bytes is in register q7
233 // and the rest is in memory. We can fold 16 bytes at a time if y>=16
234 // continue folding 16B at a time
237 vmull.p64 q8, d14, d20 // low half of q7 times d20
238 vmull.p64 q7, d15, d21 // high half of q7 times d21
241 vld1.64 {q0}, [arg2, :128]!
242 CPU_LE( vrev64.8 q0, q0 )
247 // instead of a cmp instruction, we utilize the flags with the
248 // bge instruction; equivalent of: cmp arg3, 16-16
249 // check if there is any more 16B in the buffer to be able to fold
250 bge _16B_reduction_loop
252 // now we have 16+z bytes left to reduce, where 0<= z < 16.
253 // first, we reduce the data in the q7 register
255 _final_reduction_for_128:
256 // check if any more data to fold. If not, compute the CRC of
257 // the final 128 bits
261 // here we are getting data that is less than 16 bytes.
262 // since we know that there was data before the pointer, we can
263 // offset the input pointer before the actual point, to receive
264 // exactly 16 bytes. after that the registers need to be adjusted.
269 CPU_LE( vrev64.8 q1, q1 )
272 // get rid of the extra data that was loaded before
273 // load the shift constant
274 adr ip, tbl_shf_table + 16
278 // shift q7 to the left by arg3 bytes, result in q2 (d4/d5)
279 vtbl.8 d4, {d14-d15}, d0
280 vtbl.8 d5, {d14-d15}, d1
282 // shift q7 to the right by 16-arg3 bytes, result in q9 (d18/d19)
285 vtbl.8 d18, {d14-d15}, d0
286 vtbl.8 d19, {d14-d15}, d1
289 vshr.s8 q0, q0, #7 // convert to 8-bit mask (sign bit replicated: 0x00 or 0xff per byte)
293 vmull.p64 q8, d18, d20
294 vmull.p64 q7, d19, d21
299 // compute crc of a 128-bit value
301 vldr d21, rk6 // rk5 and rk6 in q10
304 vext.8 q0, qzr, q7, #8 // q0 = low 64 bits of q7 moved to the high half, low half zero
305 vmull.p64 q7, d15, d20
309 vext.8 q0, q7, qzr, #12 // q0 = top 32 bits of q7 moved to the bottom, rest zero
311 vmull.p64 q0, d0, d21
319 vmull.p64 q0, d15, d20
320 vext.8 q0, qzr, q0, #12 // q0 shifted left by 4 bytes, zero-filled
321 vmull.p64 q0, d1, d21
322 vext.8 q0, qzr, q0, #12 // q0 shifted left by 4 bytes, zero-filled
327 // scale the result back to 16 bits
336 vmov s3, arg1_low32 // get the initial crc value (s3 = top 32 bits of q0)
338 vld1.64 {q7}, [arg2, :128]!
339 CPU_LE( vrev64.8 q7, q7 )
344 beq _128_done // exactly 16 left
345 blt _less_than_16_left
347 // now if there is, load the constants
349 vldr d21, rk2 // rk1 and rk2 in q10
351 // check if there is enough buffer to be able to fold 16B at a time
353 addlt arg3, arg3, #16
354 blt _get_last_two_regs
355 b _16B_reduction_loop
359 adr ip, tbl_shf_table + 16
364 vtbl.8 d18, {d14-d15}, d0
365 vtbl.8 d15, {d14-d15}, d1
368 ENDPROC(crc_t10dif_pmull)
370 // precomputed constants
371 // these constants are precomputed from the poly:
372 // 0x8bb70000 (0x8bb7 scaled to 32 bits)
375 // rk1 = 2^(32*3) mod Q << 32
376 // rk2 = 2^(32*5) mod Q << 32
377 // rk3 = 2^(32*15) mod Q << 32
378 // rk4 = 2^(32*17) mod Q << 32
379 // rk5 = 2^(32*3) mod Q << 32
380 // rk6 = 2^(32*2) mod Q << 32
381 // rk7 = floor(2^64/Q)
// 64-bit folding constants. The code loads them in 16-byte pairs into q10
// with vld1.64 {q10}, [ip, :128] (first quad of a pair -> d20, second ->
// d21) or individually with vldr d21, rkN.
// NOTE(review): the :128 qualifier on those loads needs each pair to be
// 16-byte aligned; the alignment directive is not part of this excerpt.
384 rk3: .quad 0x9d9d000000000000
385 rk4: .quad 0x7cf5000000000000
386 rk5: .quad 0x2d56000000000000
387 rk6: .quad 0x1368000000000000
388 rk7: .quad 0x00000001f65a57f8 // floor(2^64/Q), see the formula list above
389 rk8: .quad 0x000000018bb70000 // 0x8bb70000 with bit 32 set; presumably the polynomial Q
390 rk9: .quad 0xceae000000000000
391 rk10: .quad 0xbfd6000000000000
392 rk11: .quad 0x1e16000000000000
393 rk12: .quad 0x713c000000000000
394 rk13: .quad 0xf7f9000000000000
395 rk14: .quad 0x80a6000000000000
396 rk15: .quad 0x044c000000000000
397 rk16: .quad 0xe658000000000000
398 rk17: .quad 0xad18000000000000
399 rk18: .quad 0xa497000000000000
400 rk19: .quad 0x6ee3000000000000
401 rk20: .quad 0xe7b5000000000000
// rk1/rk2 are stored after rk20, presumably so that the post-incremented
// constant loads in fold16 finish with rk1/rk2 in q10 - confirm against
// the fold16 invocations, which are not part of this excerpt.
402 rk1: .quad 0x2d56000000000000
403 rk2: .quad 0x06df000000000000
406 // use these values for shift constants for the tbl/tbx instruction
407 // different alignments result in values as shown:
408 // DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
409 // DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
410 // DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
411 // DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
412 // DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
413 // DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
414 // DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7
415 // DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8
416 // DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9
417 // DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10
418 // DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11
419 // DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12
420 // DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13
421 // DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14
422 // DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15
// 32 bytes of vtbl index values, referenced in the code as
// tbl_shf_table (the label line itself is not part of this excerpt).
// The code uses them as indices into the two-register table {d14-d15}:
// indices 0x00-0x0f select a source byte, while the 0x81-0x8f entries are
// out of range for a 16-byte table, so vtbl yields zero for them.
424 .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
425 .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
426 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
427 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0