//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//	Erdinc Ozturk <erdinc.ozturk@intel.com>
//	Vinodh Gopal <vinodh.gopal@intel.com>
//	James Guilford <james.guilford@intel.com>
//	Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//	Function API:
//
//	UINT16 crc_t10dif_pcl(
//		UINT16 init_crc,		// initial CRC value, 16 bits
//		const unsigned char *buf,	// buffer pointer to calculate CRC on
//		UINT64 len			// buffer length in bytes (64-bit data)
//	);
//
//	Reference paper titled "Fast CRC Computation for Generic
//	Polynomials Using PCLMULQDQ Instruction"
//	URL: http://www.intel.com/content/dam/www/public/us/en/documents
//	/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
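//
// For reference, the value this routine computes matches a plain
// bit-at-a-time CRC-T10DIF (polynomial 0x8bb7, no bit reflection, the
// 16-bit CRC carried between calls). This is an illustrative sketch only,
// not part of this file's build, and crc_t10dif_ref is a hypothetical name:
//
//	#include <stdint.h>
//
//	uint16_t crc_t10dif_ref(uint16_t crc, const unsigned char *buf,
//				uint64_t len)
//	{
//		while (len--) {
//			int i;
//
//			crc ^= (uint16_t)(*buf++) << 8;
//			for (i = 0; i < 8; i++)
//				crc = (crc << 1) ^
//				      ((crc & 0x8000) ? 0x8bb7 : 0);
//		}
//		return crc;
//	}
//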
#include <linux/linkage.h>
#include <asm/assembler.h>
ENTRY(crc_t10dif_pmull)
	movi		vzr.16b, #0		// init zero register

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16
	// check if smaller than 256
	cmp		arg3, #256

	// for sizes less than 256, we can't fold 128B at a time
	b.lt		_less_than_128
	// load the initial crc value
	// crc value does not need to be byte-reflected, but it needs
	// to be moved to the high part of the register, because the data
	// will be byte-reflected and will then line up with the initial
	// crc in the correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc
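
	// A sketch of why this works: CRC-T10DIF of a message M(x) with an
	// initial value is (init * x^(8*len) + M(x) * x^16) mod P(x). The
	// buffer is processed most-significant byte first (hence the
	// rev64/ext byte reversal below), so xor'ing (init << 16) into the
	// top 32 bits of the first 16-byte block adds init * x^(8*len - 16)
	// to M(x), which after the trailing multiply by x^16 is exactly the
	// init * x^(8*len) term.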
	// receive the initial 128B of data and xor in the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
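
	// NEON has no single instruction to byte-reverse a full 128-bit
	// register: rev64 reverses the bytes within each 64-bit half, and
	// the ext by #8 swaps the two halves, so each pair amounts to a
	// full 16-byte reversal. Big-endian kernels already load the data
	// in this layout, hence the CPU_LE() guards.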
	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8	// v10 has rk3 and rk4; whether the
					// pmull or pmull2 form is used
					// selects which constant applies
	// we subtract 256 instead of 128 to save one instruction from the loop
	sub		arg3, arg3, #256

	// at this section of the code, there is 128*x+y (0 <= y < 128) bytes
	// of buffer. The _fold_64_B_loop will fold 128B at a time
	// until we have 128+y bytes of buffer

	// fold 128B at a time. This section of the code folds 8 vector
	// registers in parallel, two per fold64 invocation
	.macro		fold64, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	pmull2		v8.1q, \reg1\().2d, v10.2d
	pmull		\reg1\().1q, \reg1\().1d, v10.1d

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	pmull2		v9.1q, \reg2\().2d, v10.2d
	pmull		\reg2\().1q, \reg2\().1d, v10.1d

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
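
	// The fold performed above for one 16-byte accumulator, in rough C
	// intrinsics form (an illustrative sketch only, assuming
	// <arm_neon.h> with the crypto extensions; fold_step is a
	// hypothetical helper, not part of this file). Each half of the
	// accumulator is carry-less multiplied by one of the two 64-bit
	// constants held in v10 (pmull uses lane 0, pmull2 lane 1), and the
	// next 16 data bytes are xor'ed in:
	//
	//	static uint64x2_t fold_step(uint64x2_t acc, uint64x2_t data,
	//				    poly64x2_t k)
	//	{
	//		poly64x2_t a = vreinterpretq_p64_u64(acc);
	//		poly128_t lo = vmull_p64(vgetq_lane_p64(a, 0),
	//					 vgetq_lane_p64(k, 0));
	//		poly128_t hi = vmull_high_p64(a, k);
	//
	//		acc = veorq_u64(vreinterpretq_u64_p128(lo),
	//				vreinterpretq_u64_p128(hi));
	//		return veorq_u64(acc, data);
	//	}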
_fold_64_B_loop:
	fold64		v0, v1
	fold64		v2, v3
	fold64		v4, v5
	fold64		v6, v7

	subs		arg3, arg3, #128

	// check if there is another 128B in the buffer to be able to fold
	b.lt		_fold_64_B_end
	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	endif_yield_neon

	b		_fold_64_B_loop

_fold_64_B_end:
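
	// The if_will_cond_yield_neon/do_cond_yield_neon/endif_yield_neon
	// macros (from asm/assembler.h) let this long-running loop yield
	// the CPU when a reschedule is pending. Only v0-v7 carry live fold
	// state across the yield, so only those are spilled to the frame;
	// v10 and vzr are simply regenerated afterwards.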
	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128B of folded data are in 8 of the vector
	// registers: v0-v7

	// fold the 8 vector registers to 1 vector register with different
	// constants
	.macro		fold16, reg, rk
	pmull		v8.1q, \reg\().1d, v10.1d
	pmull2		\reg\().1q, \reg\().2d, v10.2d
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm

	// fold the accumulators into v7, one at a time, reloading the next
	// constant pair on the fly (see the note after this block)
	ldr_l		q10, rk9, x8

	fold16		v0, rk11
	fold16		v1, rk13
	fold16		v2, rk15
	fold16		v3, rk17
	fold16		v4, rk19
	fold16		v5, rk1
	fold16		v6
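
	// Each fold16 invocation folds one accumulator into v7 using the
	// constant pair for its byte distance from v7 (112 down to 16
	// bytes). Passing the label of the *next* constant pair lets the
	// macro issue the ldr_l while the multiplies are still in flight;
	// the final invocation omits it, which the .ifnb test handles.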
	// the 128 bytes of folded data have been reduced to 16 bytes in v7,
	// so add 128-16 = 112 back to the loop counter; using adds instead
	// of a separate cmp lets us branch on the negative flag with b.lt
	adds		arg3, arg3, #(128-16)
	b.lt		_final_reduction_for_128
	// now we have 16+y bytes left to reduce. 16 bytes are in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16:
	// continue folding 16B at a time

_16B_reduction_loop:
	pmull		v8.1q, v7.1d, v10.1d
	pmull2		v7.1q, v7.2d, v10.2d
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [arg2], #0x10
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b

	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags from the subs
	// above, the equivalent of: cmp arg3, #(16-16)
	// check if there is any more 16B in the buffer to be able to fold
	b.ge		_16B_reduction_loop
	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the v7 register

_final_reduction_for_128:
	// check if any more data to fold. If not, compute the CRC of
	// the final 128 bits
	adds		arg3, arg3, #16
	b.eq		_128_done
	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer back before the actual end, to receive
	// exactly 16 bytes. after that, the registers need to be adjusted.
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]
	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b
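
	// Worked example of the tbl trick, for arg3 = 3 leftover bytes: the
	// load from tbl_shf_table + 16 - 3 yields the indexes
	//	{ 0x8d, 0x8e, 0x8f, 0x00, 0x01, ..., 0x0c }
	// tbl returns 0 for any index >= 16, so the first tbl moves v7 up
	// by 3 bytes. Xor'ing #0x80 flips which indexes are valid, giving
	//	{ 0x0d, 0x0e, 0x0f, 0x80, 0x81, ..., 0x8c }
	// so the second tbl keeps only the top 3 bytes of v7, shifted down
	// by 13. The same sign bits, extended by sshr, then drive bsl to
	// take the low 3 bytes from the freshly loaded v1 and the rest
	// from v2.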
	pmull		v8.1q, v7.1d, v10.1d
	pmull2		v7.1q, v7.2d, v10.2d
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b
_128_done:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	pmull		v7.1q, v7.1d, v10.1d
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	pmull2		v0.1q, v0.2d, v10.2d
	eor		v7.16b, v7.16b, v0.16b
	// barrett reduction
_barrett:
	ldr_l		q10, rk7, x8
	mov		v0.d[0], v7.d[1]

	pmull		v0.1q, v0.1d, v10.1d
	ext		v0.16b, vzr.16b, v0.16b, #12
	pmull2		v0.1q, v0.2d, v10.2d
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]
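
	// Barrett reduction, roughly: rk7 = floor(2^64/Q) gives an estimate
	// q of the quotient R(x)/Q(x) of the remaining value R; the second
	// multiply computes q * Q (Q is stored in the upper half of the rk7
	// constant), and xor'ing that into R cancels the high part, leaving
	// the remainder R mod Q in the 32-bit lane read out into w0.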
_cleanup:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

_less_than_128:
	cbz		arg3, _cleanup

	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
	cmp		arg3, #16
	b.eq		_128_done		// exactly 16 left
	b.lt		_less_than_16_left

	ldr_l		q10, rk1, x8		// rk1 and rk2 in v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		_16B_reduction_loop
_less_than_16_left:
	// load the shift constant and drop the bytes we did not receive
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		_128_done
ENDPROC(crc_t10dif_pmull)
// precomputed constants
// these constants are precomputed from the poly:
// 0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4

// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
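//
// The rk values can be reproduced with a straightforward GF(2) remainder
// computation, e.g. (an illustrative sketch; gf2_pow_mod is a hypothetical
// helper, and Q is taken as the scaled polynomial 0x18bb70000 including
// the implicit x^32 bit):
//
//	#include <stdint.h>
//
//	uint64_t gf2_pow_mod(unsigned int n)	/* x^n mod Q */
//	{
//		uint64_t r = 1;
//
//		while (n--) {
//			r <<= 1;
//			if (r & (1ULL << 32))
//				r ^= 0x18bb70000ULL;
//		}
//		return r;
//	}
//
// so that, per the formulas above, rk1 == gf2_pow_mod(32 * 3) << 32 and
// rk2 == gf2_pow_mod(32 * 5) << 32, with each .octa below packing one
// (rk, rk+1) pair as its low and high 64 bits.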
rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000
// use these values for shift constants for the tbl/tbx instruction
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1  (16-15) / shr15
tbl_shf_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0