// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>

static void xor_arm64_neon_2(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}

static void xor_arm64_neon_3(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_neon_4(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_neon_5(unsigned long bytes, unsigned long * __restrict p1,
	const unsigned long * __restrict p2,
	const unsigned long * __restrict p3,
	const unsigned long * __restrict p4,
	const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

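/*
 * Template descriptor picked up by the generic XOR code, which benchmarks the
 * registered implementations and selects the fastest one for xor_blocks().
 */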
struct xor_block_template xor_block_inner_neon __ro_after_init = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);

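/*
 * EOR3 (ARMv8.2 SHA3 extension) XORs three vectors in a single instruction,
 * reducing the number of XOR instructions needed per block. The
 * ".arch_extension sha3" directive requires assembler support, which is what
 * CONFIG_AS_HAS_SHA3 gates below.
 */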
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));

	return res;
}

static void xor_arm64_eor3_3(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_4(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

static void xor_arm64_eor3_5(unsigned long bytes,
			     unsigned long * __restrict p1,
			     const unsigned long * __restrict p2,
			     const unsigned long * __restrict p3,
			     const unsigned long * __restrict p4,
			     const unsigned long * __restrict p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 ^ p3 */
		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
			  vld1q_u64(dp3 + 0));
		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
			  vld1q_u64(dp3 + 2));
		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
			  vld1q_u64(dp3 + 4));
		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
			  vld1q_u64(dp3 + 6));

		/* p1 ^= p4 ^ p5 */
		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

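/*
 * If both the assembler and the CPU support the SHA3 extension, upgrade the
 * 3/4/5-source routines to the EOR3-based variants at init time.
 */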
static int __init xor_neon_init(void)
{
	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
	}
	return 0;
}
module_init(xor_neon_init);

static void __exit xor_neon_exit(void)
{
}
module_exit(xor_neon_exit);

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");