// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/lib/xor-neon.c
 *
 * Authors: Jackie Liu <liuyun01@kylinos.cn>
 * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
 */

#include <linux/raid/xor.h>
#include <linux/module.h>
#include <asm/neon-intrinsics.h>
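
/*
 * Note on the common structure below: each routine XORs two to five source
 * buffers into p1, 64 bytes per loop iteration, using four 128-bit
 * uint64x2_t vectors (sizeof(uint64x2_t) * 4 == 64). The do/while with no
 * tail handling assumes 'bytes' is a non-zero multiple of 64, which the
 * RAID xor_blocks() callers are expected to guarantee.
 */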

void xor_arm64_neon_2(unsigned long bytes, unsigned long *p1,
	unsigned long *p2)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		/* advance by the 8 uint64_t consumed this iteration */
		dp1 += 8;
		dp2 += 8;
	} while (--lines > 0);
}
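
/*
 * For reference, xor_arm64_neon_2() computes the same result as this
 * scalar loop (an illustrative sketch only, not part of the build):
 *
 *	for (i = 0; i < bytes / sizeof(unsigned long); i++)
 *		p1[i] ^= p2[i];
 */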

void xor_arm64_neon_3(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_4(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
	} while (--lines > 0);
}

void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
	unsigned long *p2, unsigned long *p3,
	unsigned long *p4, unsigned long *p5)
{
	uint64_t *dp1 = (uint64_t *)p1;
	uint64_t *dp2 = (uint64_t *)p2;
	uint64_t *dp3 = (uint64_t *)p3;
	uint64_t *dp4 = (uint64_t *)p4;
	uint64_t *dp5 = (uint64_t *)p5;

	register uint64x2_t v0, v1, v2, v3;
	long lines = bytes / (sizeof(uint64x2_t) * 4);

	do {
		/* p1 ^= p2 */
		v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
		v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
		v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
		v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));

		/* p1 ^= p3 */
		v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));

		/* p1 ^= p4 */
		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));

		/* p1 ^= p5 */
		v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
		v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
		v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
		v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));

		/* store */
		vst1q_u64(dp1 + 0, v0);
		vst1q_u64(dp1 + 2, v1);
		vst1q_u64(dp1 + 4, v2);
		vst1q_u64(dp1 + 6, v3);

		dp1 += 8;
		dp2 += 8;
		dp3 += 8;
		dp4 += 8;
		dp5 += 8;
	} while (--lines > 0);
}

struct xor_block_template const xor_block_inner_neon = {
	.name	= "__inner_neon__",
	.do_2	= xor_arm64_neon_2,
	.do_3	= xor_arm64_neon_3,
	.do_4	= xor_arm64_neon_4,
	.do_5	= xor_arm64_neon_5,
};
EXPORT_SYMBOL(xor_block_inner_neon);
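
/*
 * NEON registers cannot be used from arbitrary kernel context, so callers
 * are expected to bracket uses of this template with kernel_neon_begin()/
 * kernel_neon_end(). A minimal sketch of such a wrapper (hypothetical
 * naming, mirroring what the arch xor.h glue typically does):
 *
 *	static void xor_neon_2(unsigned long bytes, unsigned long *p1,
 *			       unsigned long *p2)
 *	{
 *		kernel_neon_begin();
 *		xor_block_inner_neon.do_2(bytes, p1, p2);
 *		kernel_neon_end();
 *	}
 */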

MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
MODULE_DESCRIPTION("ARMv8 XOR Extensions");
MODULE_LICENSE("GPL");