; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; These tests should allow scheduling of the loads before the stores, since the
; accesses are to adjacent, non-overlapping scalable-vector-sized blocks of the
; noalias pointer.

define void @scalable_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.b, p0/m, z2.b, z0.b
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.b, p0/m, z3.b, z1.b
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: st1b { z1.b }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v8i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.h, p0/m, z2.h, z0.h
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.h, p0/m, z3.h, z1.h
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1h { z0.h }, p0, [x0]
; CHECK-NEXT: st1h { z1.h }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 8 x i16>, ptr %l0, align 16
  %l5 = mul <vscale x 8 x i16> %l3, %l3
  %l6 = xor <vscale x 8 x i16> %l5, %l3
  store <vscale x 8 x i16> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 8 x i16>, ptr %l9, align 16
  %l13 = mul <vscale x 8 x i16> %l11, %l11
  %l14 = xor <vscale x 8 x i16> %l13, %l11
  store <vscale x 8 x i16> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1w { z0.s }, p0, [x0]
; CHECK-NEXT: st1w { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 4 x i32>, ptr %l0, align 16
  %l5 = mul <vscale x 4 x i32> %l3, %l3
  %l6 = xor <vscale x 4 x i32> %l5, %l3
  store <vscale x 4 x i32> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i32>, ptr %l9, align 16
  %l13 = mul <vscale x 4 x i32> %l11, %l11
  %l14 = xor <vscale x 4 x i32> %l13, %l11
  store <vscale x 4 x i32> %l14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i64(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 2 x i64>, ptr %l0, align 16
  %l5 = mul <vscale x 2 x i64> %l3, %l3
  %l6 = xor <vscale x 2 x i64> %l5, %l3
  store <vscale x 2 x i64> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i64>, ptr %l9, align 16
  %l13 = mul <vscale x 2 x i64> %l11, %l11
  %l14 = xor <vscale x 2 x i64> %l13, %l11
  store <vscale x 2 x i64> %l14, ptr %l9, align 16
  ret void
}

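; The narrower element types below use extending loads and truncating stores.
; The second access still starts a full stored-vector's width after the first,
; so the loads can again be scheduled before the stores.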
define void @scalable_v8i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.h, p0/m, z2.h, z0.h
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.h, p0/m, z3.h, z1.h
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1b { z0.h }, p0, [x0]
; CHECK-NEXT: st1b { z1.h }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 8 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 8 x i8> %l3 to <vscale x 8 x i16>
  %l5 = mul <vscale x 8 x i16> %s3, %s3
  %l6 = xor <vscale x 8 x i16> %l5, %s3
  %t6 = trunc <vscale x 8 x i16> %l6 to <vscale x 8 x i8>
  store <vscale x 8 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 8 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 8 x i8> %l11 to <vscale x 8 x i16>
  %l13 = mul <vscale x 8 x i16> %s11, %s11
  %l14 = xor <vscale x 8 x i16> %l13, %s11
  %t14 = trunc <vscale x 8 x i16> %l14 to <vscale x 8 x i8>
  store <vscale x 8 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1sb { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1b { z0.s }, p0, [x0]
; CHECK-NEXT: st1b { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 4 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 4 x i8> %l3 to <vscale x 4 x i32>
  %l5 = mul <vscale x 4 x i32> %s3, %s3
  %l6 = xor <vscale x 4 x i32> %l5, %s3
  %t6 = trunc <vscale x 4 x i32> %l6 to <vscale x 4 x i8>
  store <vscale x 4 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 4 x i8> %l11 to <vscale x 4 x i32>
  %l13 = mul <vscale x 4 x i32> %s11, %s11
  %l14 = xor <vscale x 4 x i32> %l13, %s11
  %t14 = trunc <vscale x 4 x i32> %l14 to <vscale x 4 x i8>
  store <vscale x 4 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1sb { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1b { z0.d }, p0, [x0]
; CHECK-NEXT: st1b { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 2 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i8> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 1
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i8> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v4i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.s, p0/m, z2.s, z0.s
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.s, p0/m, z3.s, z1.s
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1h { z0.s }, p0, [x0]
; CHECK-NEXT: st1h { z1.s }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 4 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 4 x i16> %l3 to <vscale x 4 x i32>
  %l5 = mul <vscale x 4 x i32> %s3, %s3
  %l6 = xor <vscale x 4 x i32> %l5, %s3
  %t6 = trunc <vscale x 4 x i32> %l6 to <vscale x 4 x i16>
  store <vscale x 4 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 4 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 4 x i16> %l11 to <vscale x 4 x i32>
  %l13 = mul <vscale x 4 x i32> %s11, %s11
  %l14 = xor <vscale x 4 x i32> %l13, %s11
  %t14 = trunc <vscale x 4 x i32> %l14 to <vscale x 4 x i16>
  store <vscale x 4 x i16> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1sh { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x0]
; CHECK-NEXT: st1h { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 2 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i16> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i16> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t14, ptr %l9, align 16
  ret void
}

define void @scalable_v2i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: scalable_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: movprfx z2, z0
; CHECK-NEXT: mul z2.d, p0/m, z2.d, z0.d
; CHECK-NEXT: movprfx z3, z1
; CHECK-NEXT: mul z3.d, p0/m, z3.d, z1.d
; CHECK-NEXT: eor z0.d, z2.d, z0.d
; CHECK-NEXT: eor z1.d, z3.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x0]
; CHECK-NEXT: st1w { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 2 x i32>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i32> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i32>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i32> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t14, ptr %l9, align 16
  ret void
}

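; Negative test: the second access starts only vscale x 8 bytes after the first,
; less than the vscale x 16-byte vectors, so the two accesses may overlap and the
; second load stays after the first store.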
define void @negative_tooshort_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_tooshort_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: cnth x8
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT: ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 3
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  ret void
}

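; Negative test: the offset is only vscale bytes while each access covers
; vscale x 2 bytes, so the accesses may overlap.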
define void @negative_scalable_v2i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1b { z0.d }, p0, [x0]
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x8]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x8]
; CHECK-NEXT: ret
  %l3 = load <vscale x 2 x i8>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i8> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 0
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i8>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i8> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i8>
  store <vscale x 2 x i8> %t14, ptr %l9, align 16
  ret void
}

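; Negative test: the offset is vscale x 2 bytes while each access covers
; vscale x 4 bytes, so the accesses may overlap.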
define void @negative_scalable_v2i16(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cntd x8
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x0]
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x8]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8]
; CHECK-NEXT: ret
  %l3 = load <vscale x 2 x i16>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i16> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 1
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i16>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i16> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i16>
  store <vscale x 2 x i16> %t14, ptr %l9, align 16
  ret void
}

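; Negative test: the offset is vscale x 4 bytes while each access covers
; vscale x 8 bytes, so the accesses may overlap.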
define void @negative_scalable_v2i32(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_scalable_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: cntw x8
; CHECK-NEXT: add x8, x0, x8
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x0]
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x8]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.d, p0/m, z1.d, z0.d
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8]
; CHECK-NEXT: ret
  %l3 = load <vscale x 2 x i32>, ptr %l0, align 16
  %s3 = sext <vscale x 2 x i32> %l3 to <vscale x 2 x i64>
  %l5 = mul <vscale x 2 x i64> %s3, %s3
  %l6 = xor <vscale x 2 x i64> %l5, %s3
  %t6 = trunc <vscale x 2 x i64> %l6 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 2 x i32>, ptr %l9, align 16
  %s11 = sext <vscale x 2 x i32> %l11 to <vscale x 2 x i64>
  %l13 = mul <vscale x 2 x i64> %s11, %s11
  %l14 = xor <vscale x 2 x i64> %l13, %s11
  %t14 = trunc <vscale x 2 x i64> %l14 to <vscale x 2 x i32>
  store <vscale x 2 x i32> %t14, ptr %l9, align 16
  ret void
}

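; Three adjacent full-width blocks: all three loads should be schedulable before
; the stores.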
define void @triple_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: triple_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT: movprfx z3, z0
; CHECK-NEXT: mul z3.b, p0/m, z3.b, z0.b
; CHECK-NEXT: movprfx z4, z1
; CHECK-NEXT: mul z4.b, p0/m, z4.b, z1.b
; CHECK-NEXT: movprfx z5, z2
; CHECK-NEXT: mul z5.b, p0/m, z5.b, z2.b
; CHECK-NEXT: eor z0.d, z3.d, z0.d
; CHECK-NEXT: eor z1.d, z4.d, z1.d
; CHECK-NEXT: eor z2.d, z5.d, z2.d
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: st1b { z1.b }, p0, [x0, #1, mul vl]
; CHECK-NEXT: st1b { z2.b }, p0, [x0, #2, mul vl]
; CHECK-NEXT: ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 4
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  %m9 = getelementptr inbounds i8, ptr %l9, i64 %l8
  %m11 = load <vscale x 16 x i8>, ptr %m9, align 16
  %m13 = mul <vscale x 16 x i8> %m11, %m11
  %m14 = xor <vscale x 16 x i8> %m13, %m11
  store <vscale x 16 x i8> %m14, ptr %m9, align 16
  ret void
}

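; Negative test: the accesses are vscale x 4 bytes apart but vscale x 16 bytes
; wide, so they overlap and the loads are not hoisted above the stores.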
define void @negative_tripletooshort_v16i8(ptr noalias nocapture noundef %l0) {
; CHECK-LABEL: negative_tripletooshort_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: cntw x8
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1b { z0.b }, p0, [x0]
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT: cnth x8
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: mul z1.b, p0/m, z1.b, z0.b
; CHECK-NEXT: eor z0.d, z1.d, z0.d
; CHECK-NEXT: st1b { z0.b }, p0, [x0, x8]
; CHECK-NEXT: ret
  %l3 = load <vscale x 16 x i8>, ptr %l0, align 16
  %l5 = mul <vscale x 16 x i8> %l3, %l3
  %l6 = xor <vscale x 16 x i8> %l5, %l3
  store <vscale x 16 x i8> %l6, ptr %l0, align 16
  %l7 = tail call i64 @llvm.vscale.i64()
  %l8 = shl nuw nsw i64 %l7, 2
  %l9 = getelementptr inbounds i8, ptr %l0, i64 %l8
  %l11 = load <vscale x 16 x i8>, ptr %l9, align 16
  %l13 = mul <vscale x 16 x i8> %l11, %l11
  %l14 = xor <vscale x 16 x i8> %l13, %l11
  store <vscale x 16 x i8> %l14, ptr %l9, align 16
  %m9 = getelementptr inbounds i8, ptr %l9, i64 %l8
  %m11 = load <vscale x 16 x i8>, ptr %m9, align 16
  %m13 = mul <vscale x 16 x i8> %m11, %m11
  %m14 = xor <vscale x 16 x i8> %m13, %m11
  store <vscale x 16 x i8> %m14, ptr %m9, align 16
  ret void
}

declare i64 @llvm.vscale.i64()