; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; ADD
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = add <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = add <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @add_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = add <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @add_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.b, z0.b, z2.b
; VBITS_GE_256-NEXT:    add z1.b, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = add <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @add_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = add <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @add_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    add z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = add <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = add <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = add <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @add_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = add <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @add_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.h, z0.h, z2.h
; VBITS_GE_256-NEXT:    add z1.h, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = add <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @add_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = add <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @add_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    add z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = add <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = add <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = add <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @add_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = add <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @add_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.s, z0.s, z2.s
; VBITS_GE_256-NEXT:    add z1.s, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = add <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @add_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = add <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @add_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: add_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    add z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = add <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add d0, d0, d1
; CHECK-NEXT:    ret
  %res = add <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = add <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @add_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: add_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = add <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @add_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: add_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    add z0.d, z0.d, z2.d
; VBITS_GE_256-NEXT:    add z1.d, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: add_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    add z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = add <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @add_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = add <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: add_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    mov x8, #16 // =0x10
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [x1]
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = add <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; MUL
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = mul <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = mul <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @mul_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = mul <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @mul_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.b, p0/m, z0.b, z2.b
; VBITS_GE_256-NEXT:    mul z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = mul <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @mul_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = mul <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @mul_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    mul z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = mul <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = mul <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = mul <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @mul_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = mul <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @mul_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.h, p0/m, z0.h, z2.h
; VBITS_GE_256-NEXT:    mul z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = mul <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @mul_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = mul <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @mul_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    mul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = mul <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = mul <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = mul <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @mul_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = mul <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @mul_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.s, p0/m, z0.s, z2.s
; VBITS_GE_256-NEXT:    mul z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = mul <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @mul_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = mul <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @mul_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = mul <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
; CHECK-LABEL: mul_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl1
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %res = mul <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
; CHECK-LABEL: mul_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %res = mul <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @mul_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: mul_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = mul <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @mul_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: mul_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    mul z0.d, p0/m, z0.d, z2.d
; VBITS_GE_256-NEXT:    mul z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: mul_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = mul <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @mul_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: mul_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = mul <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @mul_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: mul_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = mul <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; SUB
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %res = sub <8 x i8> %op1, %op2
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %res = sub <16 x i8> %op1, %op2
  ret <16 x i8> %res
}

define void @sub_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %op2 = load <32 x i8>, ptr %b
  %res = sub <32 x i8> %op1, %op2
  store <32 x i8> %res, ptr %a
  ret void
}

define void @sub_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.b, z0.b, z2.b
; VBITS_GE_256-NEXT:    sub z1.b, z1.b, z3.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.b, z0.b, z1.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %op2 = load <64 x i8>, ptr %b
  %res = sub <64 x i8> %op1, %op2
  store <64 x i8> %res, ptr %a
  ret void
}

define void @sub_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %op2 = load <128 x i8>, ptr %b
  %res = sub <128 x i8> %op1, %op2
  store <128 x i8> %res, ptr %a
  ret void
}

define void @sub_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ld1b { z1.b }, p0/z, [x1]
; CHECK-NEXT:    sub z0.b, z0.b, z1.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %op2 = load <256 x i8>, ptr %b
  %res = sub <256 x i8> %op1, %op2
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = sub <4 x i16> %op1, %op2
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = sub <8 x i16> %op1, %op2
  ret <8 x i16> %res
}

define void @sub_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %op2 = load <16 x i16>, ptr %b
  %res = sub <16 x i16> %op1, %op2
  store <16 x i16> %res, ptr %a
  ret void
}

define void @sub_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.h, z0.h, z2.h
; VBITS_GE_256-NEXT:    sub z1.h, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.h, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %op2 = load <32 x i16>, ptr %b
  %res = sub <32 x i16> %op1, %op2
  store <32 x i16> %res, ptr %a
  ret void
}

define void @sub_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %op2 = load <64 x i16>, ptr %b
  %res = sub <64 x i16> %op1, %op2
  store <64 x i16> %res, ptr %a
  ret void
}

define void @sub_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    sub z0.h, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %op2 = load <128 x i16>, ptr %b
  %res = sub <128 x i16> %op1, %op2
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = sub <2 x i32> %op1, %op2
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = sub <4 x i32> %op1, %op2
  ret <4 x i32> %res
}

define void @sub_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %op2 = load <8 x i32>, ptr %b
  %res = sub <8 x i32> %op1, %op2
  store <8 x i32> %res, ptr %a
  ret void
}

define void @sub_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.s, z0.s, z2.s
; VBITS_GE_256-NEXT:    sub z1.s, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.s, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %op2 = load <16 x i32>, ptr %b
  %res = sub <16 x i32> %op1, %op2
  store <16 x i32> %res, ptr %a
  ret void
}

define void @sub_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %op2 = load <32 x i32>, ptr %b
  %res = sub <32 x i32> %op1, %op2
  store <32 x i32> %res, ptr %a
  ret void
}

define void @sub_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    sub z0.s, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %op2 = load <64 x i32>, ptr %b
  %res = sub <64 x i32> %op1, %op2
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub d0, d0, d1
; CHECK-NEXT:    ret
  %res = sub <1 x i64> %op1, %op2
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = sub <2 x i64> %op1, %op2
  ret <2 x i64> %res
}

define void @sub_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: sub_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %op2 = load <4 x i64>, ptr %b
  %res = sub <4 x i64> %op1, %op2
  store <4 x i64> %res, ptr %a
  ret void
}

define void @sub_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: sub_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    sub z0.d, z0.d, z2.d
; VBITS_GE_256-NEXT:    sub z1.d, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: sub_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    sub z0.d, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %op2 = load <8 x i64>, ptr %b
  %res = sub <8 x i64> %op1, %op2
  store <8 x i64> %res, ptr %a
  ret void
}

define void @sub_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: sub_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %op2 = load <16 x i64>, ptr %b
  %res = sub <16 x i64> %op1, %op2
  store <16 x i64> %res, ptr %a
  ret void
}

define void @sub_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: sub_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    sub z0.d, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %op2 = load <32 x i64>, ptr %b
  %res = sub <32 x i64> %op1, %op2
  store <32 x i64> %res, ptr %a
  ret void
}

;
; ABS
;

; Don't use SVE for 64-bit vectors.
define <8 x i8> @abs_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.8b, v0.8b
; CHECK-NEXT:    ret
  %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
  ret <8 x i8> %res
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @abs_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.16b, v0.16b
; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
  ret <16 x i8> %res
}

define void @abs_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i8>, ptr %a
  %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
  store <32 x i8> %res, ptr %a
  ret void
}

define void @abs_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    abs z0.b, p0/m, z0.b
; VBITS_GE_256-NEXT:    abs z1.b, p0/m, z1.b
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: abs_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    abs z0.b, p0/m, z0.b
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <64 x i8>, ptr %a
  %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false)
  store <64 x i8> %res, ptr %a
  ret void
}

define void @abs_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i8>, ptr %a
  %res = call <128 x i8> @llvm.abs.v128i8(<128 x i8> %op1, i1 false)
  store <128 x i8> %res, ptr %a
  ret void
}

define void @abs_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    abs z0.b, p0/m, z0.b
; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <256 x i8>, ptr %a
  %res = call <256 x i8> @llvm.abs.v256i8(<256 x i8> %op1, i1 false)
  store <256 x i8> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @abs_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.4h, v0.4h
; CHECK-NEXT:    ret
  %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
  ret <4 x i16> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @abs_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.8h, v0.8h
; CHECK-NEXT:    ret
  %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
  ret <8 x i16> %res
}

define void @abs_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i16>, ptr %a
  %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
  store <16 x i16> %res, ptr %a
  ret void
}

define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    mov x8, #16 // =0x10
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    abs z1.h, p0/m, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i16>, ptr %a
  %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false)
  store <32 x i16> %res, ptr %a
  ret void
}

define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    mov x8, #32 // =0x20
; CHECK-NEXT:    mov x9, #48 // =0x30
; CHECK-NEXT:    mov x10, #16 // =0x10
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    abs z1.h, p0/m, z1.h
; CHECK-NEXT:    abs z2.h, p0/m, z2.h
; CHECK-NEXT:    abs z3.h, p0/m, z3.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
; CHECK-NEXT:    st1h { z3.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i16>, ptr %a
  %res = call <64 x i16> @llvm.abs.v64i16(<64 x i16> %op1, i1 false)
  store <64 x i16> %res, ptr %a
  ret void
}

define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    mov x8, #96 // =0x60
; CHECK-NEXT:    mov x9, #112 // =0x70
; CHECK-NEXT:    mov x10, #64 // =0x40
; CHECK-NEXT:    mov x11, #80 // =0x50
; CHECK-NEXT:    mov x12, #32 // =0x20
; CHECK-NEXT:    mov x13, #48 // =0x30
; CHECK-NEXT:    mov x14, #16 // =0x10
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
; CHECK-NEXT:    ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
; CHECK-NEXT:    ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
; CHECK-NEXT:    ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
; CHECK-NEXT:    ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
; CHECK-NEXT:    ld1h { z7.h }, p0/z, [x0]
; CHECK-NEXT:    abs z0.h, p0/m, z0.h
; CHECK-NEXT:    abs z1.h, p0/m, z1.h
; CHECK-NEXT:    abs z2.h, p0/m, z2.h
; CHECK-NEXT:    abs z3.h, p0/m, z3.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; CHECK-NEXT:    movprfx z0, z4
; CHECK-NEXT:    abs z0.h, p0/m, z4.h
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x9, lsl #1]
; CHECK-NEXT:    movprfx z1, z5
; CHECK-NEXT:    abs z1.h, p0/m, z5.h
; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x10, lsl #1]
; CHECK-NEXT:    movprfx z2, z6
; CHECK-NEXT:    abs z2.h, p0/m, z6.h
; CHECK-NEXT:    st1h { z3.h }, p0, [x0, x11, lsl #1]
; CHECK-NEXT:    movprfx z3, z7
; CHECK-NEXT:    abs z3.h, p0/m, z7.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, x12, lsl #1]
; CHECK-NEXT:    st1h { z1.h }, p0, [x0, x13, lsl #1]
; CHECK-NEXT:    st1h { z2.h }, p0, [x0, x14, lsl #1]
; CHECK-NEXT:    st1h { z3.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x i16>, ptr %a
  %res = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %op1, i1 false)
  store <128 x i16> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @abs_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.2s, v0.2s
; CHECK-NEXT:    ret
  %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
  ret <2 x i32> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @abs_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.4s, v0.4s
; CHECK-NEXT:    ret
  %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
  ret <4 x i32> %res
}

define void @abs_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x i32>, ptr %a
  %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
  store <8 x i32> %res, ptr %a
  ret void
}

define void @abs_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    abs z0.s, p0/m, z0.s
; VBITS_GE_256-NEXT:    abs z1.s, p0/m, z1.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: abs_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    abs z0.s, p0/m, z0.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x i32>, ptr %a
  %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
  store <16 x i32> %res, ptr %a
  ret void
}

define void @abs_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i32>, ptr %a
  %res = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %op1, i1 false)
  store <32 x i32> %res, ptr %a
  ret void
}

define void @abs_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    abs z0.s, p0/m, z0.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x i32>, ptr %a
  %res = call <64 x i32> @llvm.abs.v64i32(<64 x i32> %op1, i1 false)
  store <64 x i32> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @abs_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs d0, d0
; CHECK-NEXT:    ret
  %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
  ret <1 x i64> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @abs_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    abs v0.2d, v0.2d
; CHECK-NEXT:    ret
  %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
  ret <2 x i64> %res
}

define void @abs_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: abs_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x i64>, ptr %a
  %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
  store <4 x i64> %res, ptr %a
  ret void
}

define void @abs_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: abs_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    abs z0.d, p0/m, z0.d
; VBITS_GE_256-NEXT:    abs z1.d, p0/m, z1.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: abs_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    abs z0.d, p0/m, z0.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x i64>, ptr %a
  %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
  store <8 x i64> %res, ptr %a
  ret void
}

define void @abs_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: abs_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x i64>, ptr %a
  %res = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %op1, i1 false)
  store <16 x i64> %res, ptr %a
  ret void
}

define void @abs_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: abs_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    abs z0.d, p0/m, z0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x i64>, ptr %a
  %res = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %op1, i1 false)
  store <32 x i64> %res, ptr %a
  ret void
}

declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
declare <128 x i8> @llvm.abs.v128i8(<128 x i8>, i1)
declare <256 x i8> @llvm.abs.v256i8(<256 x i8>, i1)
declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
declare <64 x i16> @llvm.abs.v64i16(<64 x i16>, i1)
declare <128 x i16> @llvm.abs.v128i16(<128 x i16>, i1)
declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
declare <32 x i32> @llvm.abs.v32i32(<32 x i32>, i1)
declare <64 x i32> @llvm.abs.v64i32(<64 x i32>, i1)
declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1)
declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)

attributes #0 = { "target-features"="+sve" }