1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
12 ; Don't use SVE for 64-bit vectors.
; Adds two <8 x i8> by-value arguments; every run line expects the NEON form `add v0.8b, v0.8b, v1.8b`.
13 define <8 x i8> @add_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
14 ; CHECK-LABEL: add_v8i8:
16 ; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
18 %res = add <8 x i8> %op1, %op2
22 ; Don't use SVE for 128-bit vectors.
; Adds two <16 x i8> by-value arguments; every run line expects the NEON form `add v0.16b, v0.16b, v1.16b`.
23 define <16 x i8> @add_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
24 ; CHECK-LABEL: add_v16i8:
26 ; CHECK-NEXT: add v0.16b, v0.16b, v1.16b
28 %res = add <16 x i8> %op1, %op2
; Loads <32 x i8> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(2,0);
; every run line expects one unpredicated SVE add between vl32-predicated ld1b/st1b.
32 define void @add_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
33 ; CHECK-LABEL: add_v32i8:
35 ; CHECK-NEXT: ptrue p0.b, vl32
36 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
37 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
38 ; CHECK-NEXT: add z0.b, z0.b, z1.b
39 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
41 %op1 = load <32 x i8>, ptr %a
42 %op2 = load <32 x i8>, ptr %b
43 %res = add <32 x i8> %op1, %op2
44 store <32 x i8> %res, ptr %a
; Loads <64 x i8> from %a and %b, adds them and stores the sum to %a. No vscale_range is given and
; the expectations differ per prefix: VBITS_GE_256 wants two vl32 halves (second half indexed by
; x8 = 32), VBITS_GE_512 wants a single vl64 load/add/store sequence.
48 define void @add_v64i8(ptr %a, ptr %b) #0 {
49 ; VBITS_GE_256-LABEL: add_v64i8:
50 ; VBITS_GE_256: // %bb.0:
51 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
52 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
53 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
54 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
55 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
56 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
57 ; VBITS_GE_256-NEXT: add z0.b, z0.b, z1.b
58 ; VBITS_GE_256-NEXT: add z1.b, z2.b, z3.b
59 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
60 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
61 ; VBITS_GE_256-NEXT: ret
63 ; VBITS_GE_512-LABEL: add_v64i8:
64 ; VBITS_GE_512: // %bb.0:
65 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
66 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
67 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
68 ; VBITS_GE_512-NEXT: add z0.b, z0.b, z1.b
69 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
70 ; VBITS_GE_512-NEXT: ret
71 %op1 = load <64 x i8>, ptr %a
72 %op2 = load <64 x i8>, ptr %b
73 %res = add <64 x i8> %op1, %op2
74 store <64 x i8> %res, ptr %a
; Loads <128 x i8> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(8,0);
; every run line expects one unpredicated SVE add between vl128-predicated ld1b/st1b.
78 define void @add_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
79 ; CHECK-LABEL: add_v128i8:
81 ; CHECK-NEXT: ptrue p0.b, vl128
82 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
83 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
84 ; CHECK-NEXT: add z0.b, z0.b, z1.b
85 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
87 %op1 = load <128 x i8>, ptr %a
88 %op2 = load <128 x i8>, ptr %b
89 %res = add <128 x i8> %op1, %op2
90 store <128 x i8> %res, ptr %a
; Loads <256 x i8> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(16,0);
; every run line expects one unpredicated SVE add between vl256-predicated ld1b/st1b.
94 define void @add_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
95 ; CHECK-LABEL: add_v256i8:
97 ; CHECK-NEXT: ptrue p0.b, vl256
98 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
99 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
100 ; CHECK-NEXT: add z0.b, z0.b, z1.b
101 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
103 %op1 = load <256 x i8>, ptr %a
104 %op2 = load <256 x i8>, ptr %b
105 %res = add <256 x i8> %op1, %op2
106 store <256 x i8> %res, ptr %a
110 ; Don't use SVE for 64-bit vectors.
; Adds two <4 x i16> by-value arguments; every run line expects the NEON form `add v0.4h, v0.4h, v1.4h`.
111 define <4 x i16> @add_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
112 ; CHECK-LABEL: add_v4i16:
114 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
116 %res = add <4 x i16> %op1, %op2
120 ; Don't use SVE for 128-bit vectors.
; Adds two <8 x i16> by-value arguments; every run line expects the NEON form `add v0.8h, v0.8h, v1.8h`.
121 define <8 x i16> @add_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
122 ; CHECK-LABEL: add_v8i16:
124 ; CHECK-NEXT: add v0.8h, v0.8h, v1.8h
126 %res = add <8 x i16> %op1, %op2
; Loads <16 x i16> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(2,0);
; every run line expects one unpredicated SVE add between vl16-predicated ld1h/st1h.
130 define void @add_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
131 ; CHECK-LABEL: add_v16i16:
133 ; CHECK-NEXT: ptrue p0.h, vl16
134 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
135 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
136 ; CHECK-NEXT: add z0.h, z0.h, z1.h
137 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
139 %op1 = load <16 x i16>, ptr %a
140 %op2 = load <16 x i16>, ptr %b
141 %res = add <16 x i16> %op1, %op2
142 store <16 x i16> %res, ptr %a
; Loads <32 x i16> from %a and %b, adds them and stores the sum to %a. No vscale_range is given and
; the expectations differ per prefix: VBITS_GE_256 wants two vl16 halves (second half indexed by
; x8 = 16, scaled by lsl #1), VBITS_GE_512 wants a single vl32 load/add/store sequence.
146 define void @add_v32i16(ptr %a, ptr %b) #0 {
147 ; VBITS_GE_256-LABEL: add_v32i16:
148 ; VBITS_GE_256: // %bb.0:
149 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
150 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
151 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
152 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
153 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
154 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
155 ; VBITS_GE_256-NEXT: add z0.h, z0.h, z1.h
156 ; VBITS_GE_256-NEXT: add z1.h, z2.h, z3.h
157 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
158 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
159 ; VBITS_GE_256-NEXT: ret
161 ; VBITS_GE_512-LABEL: add_v32i16:
162 ; VBITS_GE_512: // %bb.0:
163 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
164 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
165 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
166 ; VBITS_GE_512-NEXT: add z0.h, z0.h, z1.h
167 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
168 ; VBITS_GE_512-NEXT: ret
169 %op1 = load <32 x i16>, ptr %a
170 %op2 = load <32 x i16>, ptr %b
171 %res = add <32 x i16> %op1, %op2
172 store <32 x i16> %res, ptr %a
; Loads <64 x i16> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(8,0);
; every run line expects one unpredicated SVE add between vl64-predicated ld1h/st1h.
176 define void @add_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
177 ; CHECK-LABEL: add_v64i16:
179 ; CHECK-NEXT: ptrue p0.h, vl64
180 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
181 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
182 ; CHECK-NEXT: add z0.h, z0.h, z1.h
183 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
185 %op1 = load <64 x i16>, ptr %a
186 %op2 = load <64 x i16>, ptr %b
187 %res = add <64 x i16> %op1, %op2
188 store <64 x i16> %res, ptr %a
; Loads <128 x i16> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(16,0);
; every run line expects one unpredicated SVE add between vl128-predicated ld1h/st1h.
192 define void @add_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
193 ; CHECK-LABEL: add_v128i16:
195 ; CHECK-NEXT: ptrue p0.h, vl128
196 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
197 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
198 ; CHECK-NEXT: add z0.h, z0.h, z1.h
199 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
201 %op1 = load <128 x i16>, ptr %a
202 %op2 = load <128 x i16>, ptr %b
203 %res = add <128 x i16> %op1, %op2
204 store <128 x i16> %res, ptr %a
208 ; Don't use SVE for 64-bit vectors.
; Adds two <2 x i32> by-value arguments; every run line expects the NEON form `add v0.2s, v0.2s, v1.2s`.
209 define <2 x i32> @add_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
210 ; CHECK-LABEL: add_v2i32:
212 ; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
214 %res = add <2 x i32> %op1, %op2
218 ; Don't use SVE for 128-bit vectors.
; Adds two <4 x i32> by-value arguments; every run line expects the NEON form `add v0.4s, v0.4s, v1.4s`.
219 define <4 x i32> @add_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
220 ; CHECK-LABEL: add_v4i32:
222 ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
224 %res = add <4 x i32> %op1, %op2
; Loads <8 x i32> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(2,0);
; every run line expects one unpredicated SVE add between vl8-predicated ld1w/st1w.
228 define void @add_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
229 ; CHECK-LABEL: add_v8i32:
231 ; CHECK-NEXT: ptrue p0.s, vl8
232 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
233 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
234 ; CHECK-NEXT: add z0.s, z0.s, z1.s
235 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
237 %op1 = load <8 x i32>, ptr %a
238 %op2 = load <8 x i32>, ptr %b
239 %res = add <8 x i32> %op1, %op2
240 store <8 x i32> %res, ptr %a
; Loads <16 x i32> from %a and %b, adds them and stores the sum to %a. No vscale_range is given and
; the expectations differ per prefix: VBITS_GE_256 wants two vl8 halves (second half indexed by
; x8 = 8, scaled by lsl #2), VBITS_GE_512 wants a single vl16 load/add/store sequence.
244 define void @add_v16i32(ptr %a, ptr %b) #0 {
245 ; VBITS_GE_256-LABEL: add_v16i32:
246 ; VBITS_GE_256: // %bb.0:
247 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
248 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
249 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
250 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
251 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
252 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
253 ; VBITS_GE_256-NEXT: add z0.s, z0.s, z1.s
254 ; VBITS_GE_256-NEXT: add z1.s, z2.s, z3.s
255 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
256 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
257 ; VBITS_GE_256-NEXT: ret
259 ; VBITS_GE_512-LABEL: add_v16i32:
260 ; VBITS_GE_512: // %bb.0:
261 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
262 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
263 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
264 ; VBITS_GE_512-NEXT: add z0.s, z0.s, z1.s
265 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
266 ; VBITS_GE_512-NEXT: ret
267 %op1 = load <16 x i32>, ptr %a
268 %op2 = load <16 x i32>, ptr %b
269 %res = add <16 x i32> %op1, %op2
270 store <16 x i32> %res, ptr %a
; Loads <32 x i32> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(8,0);
; every run line expects one unpredicated SVE add between vl32-predicated ld1w/st1w.
274 define void @add_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
275 ; CHECK-LABEL: add_v32i32:
277 ; CHECK-NEXT: ptrue p0.s, vl32
278 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
279 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
280 ; CHECK-NEXT: add z0.s, z0.s, z1.s
281 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
283 %op1 = load <32 x i32>, ptr %a
284 %op2 = load <32 x i32>, ptr %b
285 %res = add <32 x i32> %op1, %op2
286 store <32 x i32> %res, ptr %a
; Loads <64 x i32> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(16,0);
; every run line expects one unpredicated SVE add between vl64-predicated ld1w/st1w.
290 define void @add_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
291 ; CHECK-LABEL: add_v64i32:
293 ; CHECK-NEXT: ptrue p0.s, vl64
294 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
295 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
296 ; CHECK-NEXT: add z0.s, z0.s, z1.s
297 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
299 %op1 = load <64 x i32>, ptr %a
300 %op2 = load <64 x i32>, ptr %b
301 %res = add <64 x i32> %op1, %op2
302 store <64 x i32> %res, ptr %a
306 ; Don't use SVE for 64-bit vectors.
; Adds two <1 x i64> by-value arguments; every run line expects the d-register form `add d0, d0, d1`.
307 define <1 x i64> @add_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
308 ; CHECK-LABEL: add_v1i64:
310 ; CHECK-NEXT: add d0, d0, d1
312 %res = add <1 x i64> %op1, %op2
316 ; Don't use SVE for 128-bit vectors.
; Adds two <2 x i64> by-value arguments; every run line expects the NEON form `add v0.2d, v0.2d, v1.2d`.
317 define <2 x i64> @add_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
318 ; CHECK-LABEL: add_v2i64:
320 ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d
322 %res = add <2 x i64> %op1, %op2
; Loads <4 x i64> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(2,0);
; every run line expects one unpredicated SVE add between vl4-predicated ld1d/st1d.
326 define void @add_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
327 ; CHECK-LABEL: add_v4i64:
329 ; CHECK-NEXT: ptrue p0.d, vl4
330 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
331 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
332 ; CHECK-NEXT: add z0.d, z0.d, z1.d
333 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
335 %op1 = load <4 x i64>, ptr %a
336 %op2 = load <4 x i64>, ptr %b
337 %res = add <4 x i64> %op1, %op2
338 store <4 x i64> %res, ptr %a
; Loads <8 x i64> from %a and %b, adds them and stores the sum to %a. No vscale_range is given and
; the expectations differ per prefix: VBITS_GE_256 wants two vl4 halves (second half indexed by
; x8 = 4, scaled by lsl #3), VBITS_GE_512 wants a single vl8 load/add/store sequence.
342 define void @add_v8i64(ptr %a, ptr %b) #0 {
343 ; VBITS_GE_256-LABEL: add_v8i64:
344 ; VBITS_GE_256: // %bb.0:
345 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
346 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
347 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
348 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
349 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
350 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
351 ; VBITS_GE_256-NEXT: add z0.d, z0.d, z1.d
352 ; VBITS_GE_256-NEXT: add z1.d, z2.d, z3.d
353 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
354 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
355 ; VBITS_GE_256-NEXT: ret
357 ; VBITS_GE_512-LABEL: add_v8i64:
358 ; VBITS_GE_512: // %bb.0:
359 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
360 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
361 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
362 ; VBITS_GE_512-NEXT: add z0.d, z0.d, z1.d
363 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
364 ; VBITS_GE_512-NEXT: ret
365 %op1 = load <8 x i64>, ptr %a
366 %op2 = load <8 x i64>, ptr %b
367 %res = add <8 x i64> %op1, %op2
368 store <8 x i64> %res, ptr %a
; Loads <16 x i64> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(8,0);
; every run line expects one unpredicated SVE add between vl16-predicated ld1d/st1d.
372 define void @add_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
373 ; CHECK-LABEL: add_v16i64:
375 ; CHECK-NEXT: ptrue p0.d, vl16
376 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
377 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
378 ; CHECK-NEXT: add z0.d, z0.d, z1.d
379 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
381 %op1 = load <16 x i64>, ptr %a
382 %op2 = load <16 x i64>, ptr %b
383 %res = add <16 x i64> %op1, %op2
384 store <16 x i64> %res, ptr %a
; Loads <32 x i64> from %a and %b, adds them and stores the sum to %a. Declared vscale_range(8,0),
; and every run line expects the operation split into two vl16 halves (second half indexed by
; x8 = 16, scaled by lsl #3).
; NOTE(review): the other 2048-bit tests in this file (add_v256i8, add_v128i16, add_v64i32,
; mul_v32i64) use vscale_range(16,0) and expect a single operation - confirm the difference
; here is intentional.
388 define void @add_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
389 ; CHECK-LABEL: add_v32i64:
391 ; CHECK-NEXT: ptrue p0.d, vl16
392 ; CHECK-NEXT: mov x8, #16 // =0x10
393 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
394 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
395 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
396 ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x1]
397 ; CHECK-NEXT: add z0.d, z0.d, z1.d
398 ; CHECK-NEXT: add z1.d, z2.d, z3.d
399 ; CHECK-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
400 ; CHECK-NEXT: st1d { z1.d }, p0, [x0]
402 %op1 = load <32 x i64>, ptr %a
403 %op2 = load <32 x i64>, ptr %b
404 %res = add <32 x i64> %op1, %op2
405 store <32 x i64> %res, ptr %a
413 ; Don't use SVE for 64-bit vectors.
; Multiplies two <8 x i8> by-value arguments; every run line expects the NEON form `mul v0.8b, v0.8b, v1.8b`.
414 define <8 x i8> @mul_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
415 ; CHECK-LABEL: mul_v8i8:
417 ; CHECK-NEXT: mul v0.8b, v0.8b, v1.8b
419 %res = mul <8 x i8> %op1, %op2
423 ; Don't use SVE for 128-bit vectors.
; Multiplies two <16 x i8> by-value arguments; every run line expects the NEON form `mul v0.16b, v0.16b, v1.16b`.
424 define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
425 ; CHECK-LABEL: mul_v16i8:
427 ; CHECK-NEXT: mul v0.16b, v0.16b, v1.16b
429 %res = mul <16 x i8> %op1, %op2
; Loads <32 x i8> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(2,0); every run line expects one predicated (p0/m) SVE mul under a vl32 predicate.
433 define void @mul_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
434 ; CHECK-LABEL: mul_v32i8:
436 ; CHECK-NEXT: ptrue p0.b, vl32
437 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
438 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
439 ; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
440 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
442 %op1 = load <32 x i8>, ptr %a
443 %op2 = load <32 x i8>, ptr %b
444 %res = mul <32 x i8> %op1, %op2
445 store <32 x i8> %res, ptr %a
; Loads <64 x i8> from %a and %b, multiplies them and stores the product to %a. No vscale_range is
; given and the expectations differ per prefix: VBITS_GE_256 wants two vl32 halves (second half
; indexed by x8 = 32, with a movprfx before the second mul), VBITS_GE_512 wants a single vl64
; load/mul/store sequence.
449 define void @mul_v64i8(ptr %a, ptr %b) #0 {
450 ; VBITS_GE_256-LABEL: mul_v64i8:
451 ; VBITS_GE_256: // %bb.0:
452 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
453 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
454 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
455 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
456 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
457 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
458 ; VBITS_GE_256-NEXT: mul z0.b, p0/m, z0.b, z1.b
459 ; VBITS_GE_256-NEXT: movprfx z1, z2
460 ; VBITS_GE_256-NEXT: mul z1.b, p0/m, z1.b, z3.b
461 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
462 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
463 ; VBITS_GE_256-NEXT: ret
465 ; VBITS_GE_512-LABEL: mul_v64i8:
466 ; VBITS_GE_512: // %bb.0:
467 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
468 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
469 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
470 ; VBITS_GE_512-NEXT: mul z0.b, p0/m, z0.b, z1.b
471 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
472 ; VBITS_GE_512-NEXT: ret
473 %op1 = load <64 x i8>, ptr %a
474 %op2 = load <64 x i8>, ptr %b
475 %res = mul <64 x i8> %op1, %op2
476 store <64 x i8> %res, ptr %a
; Loads <128 x i8> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(8,0); every run line expects one predicated (p0/m) SVE mul under a vl128 predicate.
480 define void @mul_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
481 ; CHECK-LABEL: mul_v128i8:
483 ; CHECK-NEXT: ptrue p0.b, vl128
484 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
485 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
486 ; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
487 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
489 %op1 = load <128 x i8>, ptr %a
490 %op2 = load <128 x i8>, ptr %b
491 %res = mul <128 x i8> %op1, %op2
492 store <128 x i8> %res, ptr %a
; Loads <256 x i8> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(16,0); every run line expects one predicated (p0/m) SVE mul under a vl256 predicate.
496 define void @mul_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
497 ; CHECK-LABEL: mul_v256i8:
499 ; CHECK-NEXT: ptrue p0.b, vl256
500 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
501 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
502 ; CHECK-NEXT: mul z0.b, p0/m, z0.b, z1.b
503 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
505 %op1 = load <256 x i8>, ptr %a
506 %op2 = load <256 x i8>, ptr %b
507 %res = mul <256 x i8> %op1, %op2
508 store <256 x i8> %res, ptr %a
512 ; Don't use SVE for 64-bit vectors.
; Multiplies two <4 x i16> by-value arguments; every run line expects the NEON form `mul v0.4h, v0.4h, v1.4h`.
513 define <4 x i16> @mul_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
514 ; CHECK-LABEL: mul_v4i16:
516 ; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
518 %res = mul <4 x i16> %op1, %op2
522 ; Don't use SVE for 128-bit vectors.
; Multiplies two <8 x i16> by-value arguments; every run line expects the NEON form `mul v0.8h, v0.8h, v1.8h`.
523 define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
524 ; CHECK-LABEL: mul_v8i16:
526 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
528 %res = mul <8 x i16> %op1, %op2
; Loads <16 x i16> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(2,0); every run line expects one predicated (p0/m) SVE mul under a vl16 predicate.
532 define void @mul_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
533 ; CHECK-LABEL: mul_v16i16:
535 ; CHECK-NEXT: ptrue p0.h, vl16
536 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
537 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
538 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
539 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
541 %op1 = load <16 x i16>, ptr %a
542 %op2 = load <16 x i16>, ptr %b
543 %res = mul <16 x i16> %op1, %op2
544 store <16 x i16> %res, ptr %a
; Loads <32 x i16> from %a and %b, multiplies them and stores the product to %a. No vscale_range is
; given and the expectations differ per prefix: VBITS_GE_256 wants two vl16 halves (second half
; indexed by x8 = 16, scaled by lsl #1, with a movprfx before the second mul), VBITS_GE_512 wants
; a single vl32 load/mul/store sequence.
548 define void @mul_v32i16(ptr %a, ptr %b) #0 {
549 ; VBITS_GE_256-LABEL: mul_v32i16:
550 ; VBITS_GE_256: // %bb.0:
551 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
552 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
553 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
554 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
555 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
556 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
557 ; VBITS_GE_256-NEXT: mul z0.h, p0/m, z0.h, z1.h
558 ; VBITS_GE_256-NEXT: movprfx z1, z2
559 ; VBITS_GE_256-NEXT: mul z1.h, p0/m, z1.h, z3.h
560 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
561 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
562 ; VBITS_GE_256-NEXT: ret
564 ; VBITS_GE_512-LABEL: mul_v32i16:
565 ; VBITS_GE_512: // %bb.0:
566 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
567 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
568 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
569 ; VBITS_GE_512-NEXT: mul z0.h, p0/m, z0.h, z1.h
570 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
571 ; VBITS_GE_512-NEXT: ret
572 %op1 = load <32 x i16>, ptr %a
573 %op2 = load <32 x i16>, ptr %b
574 %res = mul <32 x i16> %op1, %op2
575 store <32 x i16> %res, ptr %a
; Loads <64 x i16> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(8,0); every run line expects one predicated (p0/m) SVE mul under a vl64 predicate.
579 define void @mul_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
580 ; CHECK-LABEL: mul_v64i16:
582 ; CHECK-NEXT: ptrue p0.h, vl64
583 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
584 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
585 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
586 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
588 %op1 = load <64 x i16>, ptr %a
589 %op2 = load <64 x i16>, ptr %b
590 %res = mul <64 x i16> %op1, %op2
591 store <64 x i16> %res, ptr %a
; Loads <128 x i16> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(16,0); every run line expects one predicated (p0/m) SVE mul under a vl128 predicate.
595 define void @mul_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
596 ; CHECK-LABEL: mul_v128i16:
598 ; CHECK-NEXT: ptrue p0.h, vl128
599 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
600 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
601 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h
602 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
604 %op1 = load <128 x i16>, ptr %a
605 %op2 = load <128 x i16>, ptr %b
606 %res = mul <128 x i16> %op1, %op2
607 store <128 x i16> %res, ptr %a
611 ; Don't use SVE for 64-bit vectors.
; Multiplies two <2 x i32> by-value arguments; every run line expects the NEON form `mul v0.2s, v0.2s, v1.2s`.
612 define <2 x i32> @mul_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
613 ; CHECK-LABEL: mul_v2i32:
615 ; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
617 %res = mul <2 x i32> %op1, %op2
621 ; Don't use SVE for 128-bit vectors.
; Multiplies two <4 x i32> by-value arguments; every run line expects the NEON form `mul v0.4s, v0.4s, v1.4s`.
622 define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
623 ; CHECK-LABEL: mul_v4i32:
625 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
627 %res = mul <4 x i32> %op1, %op2
; Loads <8 x i32> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(2,0); every run line expects one predicated (p0/m) SVE mul under a vl8 predicate.
631 define void @mul_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
632 ; CHECK-LABEL: mul_v8i32:
634 ; CHECK-NEXT: ptrue p0.s, vl8
635 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
636 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
637 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
638 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
640 %op1 = load <8 x i32>, ptr %a
641 %op2 = load <8 x i32>, ptr %b
642 %res = mul <8 x i32> %op1, %op2
643 store <8 x i32> %res, ptr %a
; Loads <16 x i32> from %a and %b, multiplies them and stores the product to %a. No vscale_range is
; given and the expectations differ per prefix: VBITS_GE_256 wants two vl8 halves (second half
; indexed by x8 = 8, scaled by lsl #2, with a movprfx before the second mul), VBITS_GE_512 wants
; a single vl16 load/mul/store sequence.
647 define void @mul_v16i32(ptr %a, ptr %b) #0 {
648 ; VBITS_GE_256-LABEL: mul_v16i32:
649 ; VBITS_GE_256: // %bb.0:
650 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
651 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
652 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
653 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
654 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
655 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
656 ; VBITS_GE_256-NEXT: mul z0.s, p0/m, z0.s, z1.s
657 ; VBITS_GE_256-NEXT: movprfx z1, z2
658 ; VBITS_GE_256-NEXT: mul z1.s, p0/m, z1.s, z3.s
659 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
660 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
661 ; VBITS_GE_256-NEXT: ret
663 ; VBITS_GE_512-LABEL: mul_v16i32:
664 ; VBITS_GE_512: // %bb.0:
665 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
666 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
667 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
668 ; VBITS_GE_512-NEXT: mul z0.s, p0/m, z0.s, z1.s
669 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
670 ; VBITS_GE_512-NEXT: ret
671 %op1 = load <16 x i32>, ptr %a
672 %op2 = load <16 x i32>, ptr %b
673 %res = mul <16 x i32> %op1, %op2
674 store <16 x i32> %res, ptr %a
; Loads <32 x i32> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(8,0); every run line expects one predicated (p0/m) SVE mul under a vl32 predicate.
678 define void @mul_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
679 ; CHECK-LABEL: mul_v32i32:
681 ; CHECK-NEXT: ptrue p0.s, vl32
682 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
683 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
684 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
685 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
687 %op1 = load <32 x i32>, ptr %a
688 %op2 = load <32 x i32>, ptr %b
689 %res = mul <32 x i32> %op1, %op2
690 store <32 x i32> %res, ptr %a
; Loads <64 x i32> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(16,0); every run line expects one predicated (p0/m) SVE mul under a vl64 predicate.
694 define void @mul_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
695 ; CHECK-LABEL: mul_v64i32:
697 ; CHECK-NEXT: ptrue p0.s, vl64
698 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
699 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
700 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
701 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
703 %op1 = load <64 x i32>, ptr %a
704 %op2 = load <64 x i32>, ptr %b
705 %res = mul <64 x i32> %op1, %op2
706 store <64 x i32> %res, ptr %a
; Multiplies two <1 x i64> by-value arguments. Unlike the other 64-bit-vector tests in this file,
; the checks expect SVE code: a vl1-predicated `mul z0.d` with d0/d1 reinterpreted as z0/z1.
; NOTE(review): presumably because NEON has no 64-bit element vector multiply - confirm.
710 define <1 x i64> @mul_v1i64(<1 x i64> %op1, <1 x i64> %op2) #0 {
711 ; CHECK-LABEL: mul_v1i64:
713 ; CHECK-NEXT: ptrue p0.d, vl1
714 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
715 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
716 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
717 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
719 %res = mul <1 x i64> %op1, %op2
; Multiplies two <2 x i64> by-value arguments. Unlike the other 128-bit-vector tests in this file,
; the checks expect SVE code: a vl2-predicated `mul z0.d` with q0/q1 reinterpreted as z0/z1.
; NOTE(review): presumably because NEON has no 64-bit element vector multiply - confirm.
723 define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
724 ; CHECK-LABEL: mul_v2i64:
726 ; CHECK-NEXT: ptrue p0.d, vl2
727 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
728 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
729 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
730 ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
732 %res = mul <2 x i64> %op1, %op2
; Loads <4 x i64> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(2,0); every run line expects one predicated (p0/m) SVE mul under a vl4 predicate.
736 define void @mul_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
737 ; CHECK-LABEL: mul_v4i64:
739 ; CHECK-NEXT: ptrue p0.d, vl4
740 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
741 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
742 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
743 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
745 %op1 = load <4 x i64>, ptr %a
746 %op2 = load <4 x i64>, ptr %b
747 %res = mul <4 x i64> %op1, %op2
748 store <4 x i64> %res, ptr %a
; Loads <8 x i64> from %a and %b, multiplies them and stores the product to %a. No vscale_range is
; given and the expectations differ per prefix: VBITS_GE_256 wants two vl4 halves (second half
; indexed by x8 = 4, scaled by lsl #3, with a movprfx before the second mul), VBITS_GE_512 wants
; a single vl8 load/mul/store sequence.
752 define void @mul_v8i64(ptr %a, ptr %b) #0 {
753 ; VBITS_GE_256-LABEL: mul_v8i64:
754 ; VBITS_GE_256: // %bb.0:
755 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
756 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
757 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
758 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
759 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
760 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
761 ; VBITS_GE_256-NEXT: mul z0.d, p0/m, z0.d, z1.d
762 ; VBITS_GE_256-NEXT: movprfx z1, z2
763 ; VBITS_GE_256-NEXT: mul z1.d, p0/m, z1.d, z3.d
764 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
765 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
766 ; VBITS_GE_256-NEXT: ret
768 ; VBITS_GE_512-LABEL: mul_v8i64:
769 ; VBITS_GE_512: // %bb.0:
770 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
771 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
772 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
773 ; VBITS_GE_512-NEXT: mul z0.d, p0/m, z0.d, z1.d
774 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
775 ; VBITS_GE_512-NEXT: ret
776 %op1 = load <8 x i64>, ptr %a
777 %op2 = load <8 x i64>, ptr %b
778 %res = mul <8 x i64> %op1, %op2
779 store <8 x i64> %res, ptr %a
; Loads <16 x i64> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(8,0); every run line expects one predicated (p0/m) SVE mul under a vl16 predicate.
783 define void @mul_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
784 ; CHECK-LABEL: mul_v16i64:
786 ; CHECK-NEXT: ptrue p0.d, vl16
787 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
788 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
789 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
790 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
792 %op1 = load <16 x i64>, ptr %a
793 %op2 = load <16 x i64>, ptr %b
794 %res = mul <16 x i64> %op1, %op2
795 store <16 x i64> %res, ptr %a
; Loads <32 x i64> from %a and %b, multiplies them and stores the product to %a. Declared
; vscale_range(16,0); every run line expects one predicated (p0/m) SVE mul under a vl32 predicate.
799 define void @mul_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
800 ; CHECK-LABEL: mul_v32i64:
802 ; CHECK-NEXT: ptrue p0.d, vl32
803 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
804 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
805 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d
806 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
808 %op1 = load <32 x i64>, ptr %a
809 %op2 = load <32 x i64>, ptr %b
810 %res = mul <32 x i64> %op1, %op2
811 store <32 x i64> %res, ptr %a
819 ; Don't use SVE for 64-bit vectors.
; Subtracts %op2 from %op1 (<8 x i8>, by value); every run line expects the NEON form `sub v0.8b, v0.8b, v1.8b`.
820 define <8 x i8> @sub_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
821 ; CHECK-LABEL: sub_v8i8:
823 ; CHECK-NEXT: sub v0.8b, v0.8b, v1.8b
825 %res = sub <8 x i8> %op1, %op2
829 ; Don't use SVE for 128-bit vectors.
; Subtracts %op2 from %op1 (<16 x i8>, by value); every run line expects the NEON form `sub v0.16b, v0.16b, v1.16b`.
830 define <16 x i8> @sub_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
831 ; CHECK-LABEL: sub_v16i8:
833 ; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b
835 %res = sub <16 x i8> %op1, %op2
; Loads <32 x i8> from %a and %b, computes %a - %b and stores the difference to %a. Declared
; vscale_range(2,0); every run line expects one unpredicated SVE sub between vl32-predicated ld1b/st1b.
839 define void @sub_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
840 ; CHECK-LABEL: sub_v32i8:
842 ; CHECK-NEXT: ptrue p0.b, vl32
843 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
844 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
845 ; CHECK-NEXT: sub z0.b, z0.b, z1.b
846 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
848 %op1 = load <32 x i8>, ptr %a
849 %op2 = load <32 x i8>, ptr %b
850 %res = sub <32 x i8> %op1, %op2
851 store <32 x i8> %res, ptr %a
; Loads <64 x i8> from %a and %b, computes %a - %b and stores the difference to %a. No vscale_range
; is given and the expectations differ per prefix: VBITS_GE_256 wants two vl32 halves (second half
; indexed by x8 = 32), VBITS_GE_512 wants a single vl64 load/sub/store sequence.
855 define void @sub_v64i8(ptr %a, ptr %b) #0 {
856 ; VBITS_GE_256-LABEL: sub_v64i8:
857 ; VBITS_GE_256: // %bb.0:
858 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
859 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
860 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
861 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1, x8]
862 ; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0]
863 ; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
864 ; VBITS_GE_256-NEXT: sub z0.b, z0.b, z1.b
865 ; VBITS_GE_256-NEXT: sub z1.b, z2.b, z3.b
866 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
867 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
868 ; VBITS_GE_256-NEXT: ret
870 ; VBITS_GE_512-LABEL: sub_v64i8:
871 ; VBITS_GE_512: // %bb.0:
872 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
873 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
874 ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
875 ; VBITS_GE_512-NEXT: sub z0.b, z0.b, z1.b
876 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
877 ; VBITS_GE_512-NEXT: ret
878 %op1 = load <64 x i8>, ptr %a
879 %op2 = load <64 x i8>, ptr %b
880 %res = sub <64 x i8> %op1, %op2
881 store <64 x i8> %res, ptr %a
; Loads <128 x i8> from %a and %b, computes %a - %b and stores the difference to %a. Declared
; vscale_range(8,0); every run line expects one unpredicated SVE sub between vl128-predicated ld1b/st1b.
885 define void @sub_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
886 ; CHECK-LABEL: sub_v128i8:
888 ; CHECK-NEXT: ptrue p0.b, vl128
889 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
890 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
891 ; CHECK-NEXT: sub z0.b, z0.b, z1.b
892 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
894 %op1 = load <128 x i8>, ptr %a
895 %op2 = load <128 x i8>, ptr %b
896 %res = sub <128 x i8> %op1, %op2
897 store <128 x i8> %res, ptr %a
; Loads <256 x i8> from %a and %b, computes %a - %b and stores the difference to %a. Declared
; vscale_range(16,0); every run line expects one unpredicated SVE sub between vl256-predicated ld1b/st1b.
901 define void @sub_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
902 ; CHECK-LABEL: sub_v256i8:
904 ; CHECK-NEXT: ptrue p0.b, vl256
905 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
906 ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1]
907 ; CHECK-NEXT: sub z0.b, z0.b, z1.b
908 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
910 %op1 = load <256 x i8>, ptr %a
911 %op2 = load <256 x i8>, ptr %b
912 %res = sub <256 x i8> %op1, %op2
913 store <256 x i8> %res, ptr %a
917 ; Don't use SVE for 64-bit vectors.
; Subtracts %op2 from %op1 (<4 x i16>, by value); every run line expects the NEON form `sub v0.4h, v0.4h, v1.4h`.
918 define <4 x i16> @sub_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
919 ; CHECK-LABEL: sub_v4i16:
921 ; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h
923 %res = sub <4 x i16> %op1, %op2
927 ; Don't use SVE for 128-bit vectors.
; Subtracts %op2 from %op1 (<8 x i16>, by value); every run line expects the NEON form `sub v0.8h, v0.8h, v1.8h`.
928 define <8 x i16> @sub_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
929 ; CHECK-LABEL: sub_v8i16:
931 ; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h
933 %res = sub <8 x i16> %op1, %op2
; Loads <16 x i16> from %a and %b, computes %a - %b and stores the difference to %a. Declared
; vscale_range(2,0); every run line expects one unpredicated SVE sub between vl16-predicated ld1h/st1h.
937 define void @sub_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
938 ; CHECK-LABEL: sub_v16i16:
940 ; CHECK-NEXT: ptrue p0.h, vl16
941 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
942 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
943 ; CHECK-NEXT: sub z0.h, z0.h, z1.h
944 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
946 %op1 = load <16 x i16>, ptr %a
947 %op2 = load <16 x i16>, ptr %b
948 %res = sub <16 x i16> %op1, %op2
949 store <16 x i16> %res, ptr %a
; Loads <32 x i16> from %a and %b, computes %a - %b and stores the difference to %a. No vscale_range
; is given and the expectations differ per prefix: VBITS_GE_256 wants two vl16 halves (second half
; indexed by x8 = 16, scaled by lsl #1), VBITS_GE_512 wants a single vl32 load/sub/store sequence.
953 define void @sub_v32i16(ptr %a, ptr %b) #0 {
954 ; VBITS_GE_256-LABEL: sub_v32i16:
955 ; VBITS_GE_256: // %bb.0:
956 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
957 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
958 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
959 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
960 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
961 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
962 ; VBITS_GE_256-NEXT: sub z0.h, z0.h, z1.h
963 ; VBITS_GE_256-NEXT: sub z1.h, z2.h, z3.h
964 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
965 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
966 ; VBITS_GE_256-NEXT: ret
968 ; VBITS_GE_512-LABEL: sub_v32i16:
969 ; VBITS_GE_512: // %bb.0:
970 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
971 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
972 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
973 ; VBITS_GE_512-NEXT: sub z0.h, z0.h, z1.h
974 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
975 ; VBITS_GE_512-NEXT: ret
976 %op1 = load <32 x i16>, ptr %a
977 %op2 = load <32 x i16>, ptr %b
978 %res = sub <32 x i16> %op1, %op2
979 store <32 x i16> %res, ptr %a
; Loads <64 x i16> from %a and %b, computes %a - %b and stores the difference to %a. Declared
; vscale_range(8,0); every run line expects one unpredicated SVE sub between vl64-predicated ld1h/st1h.
983 define void @sub_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
984 ; CHECK-LABEL: sub_v64i16:
986 ; CHECK-NEXT: ptrue p0.h, vl64
987 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
988 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
989 ; CHECK-NEXT: sub z0.h, z0.h, z1.h
990 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
992 %op1 = load <64 x i16>, ptr %a
993 %op2 = load <64 x i16>, ptr %b
994 %res = sub <64 x i16> %op1, %op2
995 store <64 x i16> %res, ptr %a
; Loads <128 x i16> from %a and %b, computes %a - %b and stores the difference to %a. Declared
; vscale_range(16,0); every run line expects one unpredicated SVE sub between vl128-predicated ld1h/st1h.
999 define void @sub_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1000 ; CHECK-LABEL: sub_v128i16:
1002 ; CHECK-NEXT: ptrue p0.h, vl128
1003 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1004 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1005 ; CHECK-NEXT: sub z0.h, z0.h, z1.h
1006 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1008 %op1 = load <128 x i16>, ptr %a
1009 %op2 = load <128 x i16>, ptr %b
1010 %res = sub <128 x i16> %op1, %op2
1011 store <128 x i16> %res, ptr %a
1015 ; Don't use SVE for 64-bit vectors.
; Subtracts %op2 from %op1 (<2 x i32>, by value); every run line expects the NEON form `sub v0.2s, v0.2s, v1.2s`.
1016 define <2 x i32> @sub_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
1017 ; CHECK-LABEL: sub_v2i32:
1019 ; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s
1021 %res = sub <2 x i32> %op1, %op2
1025 ; Don't use SVE for 128-bit vectors.
; Subtracts %op2 from %op1 (<4 x i32>, by value); every run line expects the NEON form `sub v0.4s, v0.4s, v1.4s`.
1026 define <4 x i32> @sub_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
1027 ; CHECK-LABEL: sub_v4i32:
1029 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s
1031 %res = sub <4 x i32> %op1, %op2
; Loads <8 x i32> from %a and %b, computes %a - %b and stores the difference to %a. Declared
; vscale_range(2,0); every run line expects one unpredicated SVE sub between vl8-predicated ld1w/st1w.
1035 define void @sub_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1036 ; CHECK-LABEL: sub_v8i32:
1038 ; CHECK-NEXT: ptrue p0.s, vl8
1039 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1040 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1041 ; CHECK-NEXT: sub z0.s, z0.s, z1.s
1042 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1044 %op1 = load <8 x i32>, ptr %a
1045 %op2 = load <8 x i32>, ptr %b
1046 %res = sub <8 x i32> %op1, %op2
1047 store <8 x i32> %res, ptr %a
; Loads <16 x i32> from %a and %b, computes %a - %b and stores the difference to %a. No vscale_range
; is given and the expectations differ per prefix: VBITS_GE_256 wants two vl8 halves (second half
; indexed by x8 = 8, scaled by lsl #2), VBITS_GE_512 wants a single vl16 load/sub/store sequence.
1051 define void @sub_v16i32(ptr %a, ptr %b) #0 {
1052 ; VBITS_GE_256-LABEL: sub_v16i32:
1053 ; VBITS_GE_256: // %bb.0:
1054 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1055 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1056 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1057 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
1058 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
1059 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
1060 ; VBITS_GE_256-NEXT: sub z0.s, z0.s, z1.s
1061 ; VBITS_GE_256-NEXT: sub z1.s, z2.s, z3.s
1062 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1063 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1064 ; VBITS_GE_256-NEXT: ret
1066 ; VBITS_GE_512-LABEL: sub_v16i32:
1067 ; VBITS_GE_512: // %bb.0:
1068 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1069 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1070 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
1071 ; VBITS_GE_512-NEXT: sub z0.s, z0.s, z1.s
1072 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1073 ; VBITS_GE_512-NEXT: ret
1074 %op1 = load <16 x i32>, ptr %a
1075 %op2 = load <16 x i32>, ptr %b
1076 %res = sub <16 x i32> %op1, %op2
1077 store <16 x i32> %res, ptr %a
; sub of two <32 x i32> (1024-bit) vectors loaded from %a and %b, stored back
; to %a. vscale_range(8,0) sets the minimum vscale to 8; the checks expect a
; single SVE sub on .s lanes under a vl32 predicate.
1081 define void @sub_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1082 ; CHECK-LABEL: sub_v32i32:
1084 ; CHECK-NEXT: ptrue p0.s, vl32
1085 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1086 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1087 ; CHECK-NEXT: sub z0.s, z0.s, z1.s
1088 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1090 %op1 = load <32 x i32>, ptr %a
1091 %op2 = load <32 x i32>, ptr %b
1092 %res = sub <32 x i32> %op1, %op2
1093 store <32 x i32> %res, ptr %a
; sub of two <64 x i32> (2048-bit) vectors loaded from %a and %b, stored back
; to %a. vscale_range(16,0) sets the minimum vscale to 16; the checks expect a
; single SVE sub on .s lanes under a vl64 predicate.
1097 define void @sub_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1098 ; CHECK-LABEL: sub_v64i32:
1100 ; CHECK-NEXT: ptrue p0.s, vl64
1101 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1102 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1103 ; CHECK-NEXT: sub z0.s, z0.s, z1.s
1104 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1106 %op1 = load <64 x i32>, ptr %a
1107 %op2 = load <64 x i32>, ptr %b
1108 %res = sub <64 x i32> %op1, %op2
1109 store <64 x i32> %res, ptr %a
1113 ; Don't use SVE for 64-bit vectors.
; The single-element <1 x i64> operands are function arguments; the checks
; expect the scalar D-register form `sub d0, d0, d1`.
1114 define <1 x i64> @sub_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(2,0) #0 {
1115 ; CHECK-LABEL: sub_v1i64:
1117 ; CHECK-NEXT: sub d0, d0, d1
1119 %res = sub <1 x i64> %op1, %op2
1123 ; Don't use SVE for 128-bit vectors.
; The <2 x i64> operands are function arguments; the checks expect one NEON
; sub on the .2d arrangement of v0/v1 and no SVE instructions.
1124 define <2 x i64> @sub_v2i64(<2 x i64> %op1, <2 x i64> %op2) vscale_range(2,0) #0 {
1125 ; CHECK-LABEL: sub_v2i64:
1127 ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
1129 %res = sub <2 x i64> %op1, %op2
; sub of two <4 x i64> (256-bit) vectors loaded from %a and %b, stored back to
; %a. With vscale_range(2,0) (minimum vscale 2) the checks expect a single SVE
; sub on .d lanes under a vl4 predicate.
1133 define void @sub_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1134 ; CHECK-LABEL: sub_v4i64:
1136 ; CHECK-NEXT: ptrue p0.d, vl4
1137 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1138 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1139 ; CHECK-NEXT: sub z0.d, z0.d, z1.d
1140 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1142 %op1 = load <4 x i64>, ptr %a
1143 %op2 = load <4 x i64>, ptr %b
1144 %res = sub <4 x i64> %op1, %op2
1145 store <4 x i64> %res, ptr %a
; sub of two <8 x i64> (512-bit) vectors loaded from %a and %b, stored back to
; %a. No vscale_range is given, so the expected code depends on the RUN line's
; -aarch64-sve-vector-bits-min value:
;  - VBITS_GE_256 run: the operation is split into two vl4 halves; the upper
;    half is addressed with x8 = 4 elements scaled by lsl #3.
;  - VBITS_GE_512 run: a single vl8 load/sub/store sequence.
1149 define void @sub_v8i64(ptr %a, ptr %b) #0 {
1150 ; VBITS_GE_256-LABEL: sub_v8i64:
1151 ; VBITS_GE_256: // %bb.0:
1152 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1153 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1154 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1155 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
1156 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
1157 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
1158 ; VBITS_GE_256-NEXT: sub z0.d, z0.d, z1.d
1159 ; VBITS_GE_256-NEXT: sub z1.d, z2.d, z3.d
1160 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1161 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1162 ; VBITS_GE_256-NEXT: ret
1164 ; VBITS_GE_512-LABEL: sub_v8i64:
1165 ; VBITS_GE_512: // %bb.0:
1166 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1167 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1168 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
1169 ; VBITS_GE_512-NEXT: sub z0.d, z0.d, z1.d
1170 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1171 ; VBITS_GE_512-NEXT: ret
1172 %op1 = load <8 x i64>, ptr %a
1173 %op2 = load <8 x i64>, ptr %b
1174 %res = sub <8 x i64> %op1, %op2
1175 store <8 x i64> %res, ptr %a
; sub of two <16 x i64> (1024-bit) vectors loaded from %a and %b, stored back
; to %a. vscale_range(8,0) sets the minimum vscale to 8; the checks expect a
; single SVE sub on .d lanes under a vl16 predicate.
1179 define void @sub_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1180 ; CHECK-LABEL: sub_v16i64:
1182 ; CHECK-NEXT: ptrue p0.d, vl16
1183 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1184 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1185 ; CHECK-NEXT: sub z0.d, z0.d, z1.d
1186 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1188 %op1 = load <16 x i64>, ptr %a
1189 %op2 = load <16 x i64>, ptr %b
1190 %res = sub <16 x i64> %op1, %op2
1191 store <16 x i64> %res, ptr %a
; sub of two <32 x i64> (2048-bit) vectors loaded from %a and %b, stored back
; to %a. vscale_range(16,0) sets the minimum vscale to 16; the checks expect a
; single SVE sub on .d lanes under a vl32 predicate.
1195 define void @sub_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1196 ; CHECK-LABEL: sub_v32i64:
1198 ; CHECK-NEXT: ptrue p0.d, vl32
1199 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1200 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1201 ; CHECK-NEXT: sub z0.d, z0.d, z1.d
1202 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1204 %op1 = load <32 x i64>, ptr %a
1205 %op2 = load <32 x i64>, ptr %b
1206 %res = sub <32 x i64> %op1, %op2
1207 store <32 x i64> %res, ptr %a
1216 ; Don't use SVE for 64-bit vectors.
; llvm.abs on an <8 x i8> argument (second operand i1 false); the checks
; expect one NEON abs on the .8b arrangement of v0.
1217 define <8 x i8> @abs_v8i8(<8 x i8> %op1) vscale_range(2,0) #0 {
1218 ; CHECK-LABEL: abs_v8i8:
1220 ; CHECK-NEXT: abs v0.8b, v0.8b
1222 %res = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %op1, i1 false)
1226 ; Don't use SVE for 128-bit vectors.
; llvm.abs on a <16 x i8> argument (second operand i1 false); the checks
; expect one NEON abs on the .16b arrangement of v0.
1227 define <16 x i8> @abs_v16i8(<16 x i8> %op1) vscale_range(2,0) #0 {
1228 ; CHECK-LABEL: abs_v16i8:
1230 ; CHECK-NEXT: abs v0.16b, v0.16b
1232 %res = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %op1, i1 false)
; llvm.abs on a <32 x i8> (256-bit) vector loaded from %a, result stored back
; to %a. With vscale_range(2,0) the checks expect one merging-predicated SVE
; abs on .b lanes under a vl32 predicate.
1236 define void @abs_v32i8(ptr %a) vscale_range(2,0) #0 {
1237 ; CHECK-LABEL: abs_v32i8:
1239 ; CHECK-NEXT: ptrue p0.b, vl32
1240 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
1241 ; CHECK-NEXT: abs z0.b, p0/m, z0.b
1242 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1244 %op1 = load <32 x i8>, ptr %a
1245 %res = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %op1, i1 false)
1246 store <32 x i8> %res, ptr %a
; llvm.abs on a <64 x i8> (512-bit) vector loaded from %a, result stored back
; to %a. No vscale_range is given, so the expected code depends on the RUN
; line's -aarch64-sve-vector-bits-min value:
;  - VBITS_GE_256 run: split into two vl32 halves; the upper half is addressed
;    with a byte offset of 32 held in x8 (materialised via `mov w8, #32`).
;  - VBITS_GE_512 run: a single vl64 load/abs/store sequence.
1250 define void @abs_v64i8(ptr %a) #0 {
1251 ; VBITS_GE_256-LABEL: abs_v64i8:
1252 ; VBITS_GE_256: // %bb.0:
1253 ; VBITS_GE_256-NEXT: ptrue p0.b, vl32
1254 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
1255 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
1256 ; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
1257 ; VBITS_GE_256-NEXT: abs z0.b, p0/m, z0.b
1258 ; VBITS_GE_256-NEXT: abs z1.b, p0/m, z1.b
1259 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
1260 ; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
1261 ; VBITS_GE_256-NEXT: ret
1263 ; VBITS_GE_512-LABEL: abs_v64i8:
1264 ; VBITS_GE_512: // %bb.0:
1265 ; VBITS_GE_512-NEXT: ptrue p0.b, vl64
1266 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
1267 ; VBITS_GE_512-NEXT: abs z0.b, p0/m, z0.b
1268 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0]
1269 ; VBITS_GE_512-NEXT: ret
1270 %op1 = load <64 x i8>, ptr %a
1271 %res = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %op1, i1 false)
1272 store <64 x i8> %res, ptr %a
; llvm.abs on a <128 x i8> (1024-bit) vector loaded from %a, result stored
; back to %a. vscale_range(8,0) sets the minimum vscale to 8; the checks
; expect one merging-predicated SVE abs on .b lanes under a vl128 predicate.
1276 define void @abs_v128i8(ptr %a) vscale_range(8,0) #0 {
1277 ; CHECK-LABEL: abs_v128i8:
1279 ; CHECK-NEXT: ptrue p0.b, vl128
1280 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
1281 ; CHECK-NEXT: abs z0.b, p0/m, z0.b
1282 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1284 %op1 = load <128 x i8>, ptr %a
1285 %res = call <128 x i8> @llvm.abs.v128i8(<128 x i8> %op1, i1 false)
1286 store <128 x i8> %res, ptr %a
; llvm.abs on a <256 x i8> (2048-bit) vector loaded from %a, result stored
; back to %a. vscale_range(16,0) sets the minimum vscale to 16; the checks
; expect one merging-predicated SVE abs on .b lanes under a vl256 predicate.
1290 define void @abs_v256i8(ptr %a) vscale_range(16,0) #0 {
1291 ; CHECK-LABEL: abs_v256i8:
1293 ; CHECK-NEXT: ptrue p0.b, vl256
1294 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
1295 ; CHECK-NEXT: abs z0.b, p0/m, z0.b
1296 ; CHECK-NEXT: st1b { z0.b }, p0, [x0]
1298 %op1 = load <256 x i8>, ptr %a
1299 %res = call <256 x i8> @llvm.abs.v256i8(<256 x i8> %op1, i1 false)
1300 store <256 x i8> %res, ptr %a
1304 ; Don't use SVE for 64-bit vectors.
; llvm.abs on a <4 x i16> argument (second operand i1 false); the checks
; expect one NEON abs on the .4h arrangement of v0.
1305 define <4 x i16> @abs_v4i16(<4 x i16> %op1) vscale_range(2,0) #0 {
1306 ; CHECK-LABEL: abs_v4i16:
1308 ; CHECK-NEXT: abs v0.4h, v0.4h
1310 %res = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %op1, i1 false)
1314 ; Don't use SVE for 128-bit vectors.
; llvm.abs on an <8 x i16> argument (second operand i1 false); the checks
; expect one NEON abs on the .8h arrangement of v0.
1315 define <8 x i16> @abs_v8i16(<8 x i16> %op1) vscale_range(2,0) #0 {
1316 ; CHECK-LABEL: abs_v8i16:
1318 ; CHECK-NEXT: abs v0.8h, v0.8h
1320 %res = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %op1, i1 false)
; llvm.abs on a <16 x i16> (256-bit) vector loaded from %a, result stored back
; to %a. With vscale_range(2,0) the checks expect one merging-predicated SVE
; abs on .h lanes under a vl16 predicate.
1324 define void @abs_v16i16(ptr %a) vscale_range(2,0) #0 {
1325 ; CHECK-LABEL: abs_v16i16:
1327 ; CHECK-NEXT: ptrue p0.h, vl16
1328 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1329 ; CHECK-NEXT: abs z0.h, p0/m, z0.h
1330 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1332 %op1 = load <16 x i16>, ptr %a
1333 %res = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %op1, i1 false)
1334 store <16 x i16> %res, ptr %a
; llvm.abs on a <32 x i16> (512-bit) vector loaded from %a, result stored back
; to %a. The function is pinned to vscale_range(2,0), and the checks expect
; the operation split into two vl16 pieces: the upper piece is addressed with
; x8 = 16 elements scaled by lsl #1, the lower piece directly from x0.
; NOTE(review): the other 512-bit abs tests in this file (abs_v64i8,
; abs_v16i32, abs_v8i64) omit vscale_range and use per-RUN prefixes instead;
; presumably the fixed (2,0) range here is deliberate to cover legalisation by
; splitting -- confirm before changing.
1338 define void @abs_v32i16(ptr %a) vscale_range(2,0) #0 {
1339 ; CHECK-LABEL: abs_v32i16:
1341 ; CHECK-NEXT: ptrue p0.h, vl16
1342 ; CHECK-NEXT: mov x8, #16 // =0x10
1343 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1344 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
1345 ; CHECK-NEXT: abs z0.h, p0/m, z0.h
1346 ; CHECK-NEXT: abs z1.h, p0/m, z1.h
1347 ; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1348 ; CHECK-NEXT: st1h { z1.h }, p0, [x0]
1350 %op1 = load <32 x i16>, ptr %a
1351 %res = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %op1, i1 false)
1352 store <32 x i16> %res, ptr %a
; llvm.abs on a <64 x i16> (1024-bit) vector loaded from %a, result stored
; back to %a. The function is pinned to vscale_range(2,0), and the checks
; expect the operation split into four vl16 pieces: element offsets 32, 48 and
; 16 are held in x8, x9 and x10 (scaled by lsl #1) and the fourth piece is
; accessed directly at x0.
; NOTE(review): the other 1024-bit tests in this file (e.g. abs_v128i8,
; abs_v32i32) use vscale_range(8,0) and expect a single operation; presumably
; the (2,0) range here is deliberate to cover legalisation by splitting --
; confirm before changing.
1356 define void @abs_v64i16(ptr %a) vscale_range(2,0) #0 {
1357 ; CHECK-LABEL: abs_v64i16:
1359 ; CHECK-NEXT: ptrue p0.h, vl16
1360 ; CHECK-NEXT: mov x8, #32 // =0x20
1361 ; CHECK-NEXT: mov x9, #48 // =0x30
1362 ; CHECK-NEXT: mov x10, #16 // =0x10
1363 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1364 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
1365 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
1366 ; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0]
1367 ; CHECK-NEXT: abs z0.h, p0/m, z0.h
1368 ; CHECK-NEXT: abs z1.h, p0/m, z1.h
1369 ; CHECK-NEXT: abs z2.h, p0/m, z2.h
1370 ; CHECK-NEXT: abs z3.h, p0/m, z3.h
1371 ; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1372 ; CHECK-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
1373 ; CHECK-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
1374 ; CHECK-NEXT: st1h { z3.h }, p0, [x0]
1376 %op1 = load <64 x i16>, ptr %a
1377 %res = call <64 x i16> @llvm.abs.v64i16(<64 x i16> %op1, i1 false)
1378 store <64 x i16> %res, ptr %a
; llvm.abs on a <128 x i16> (2048-bit) vector loaded from %a, result stored
; back to %a. The function is pinned to vscale_range(2,0), and the checks
; expect the operation split into eight vl16 pieces: seven are addressed via
; element offsets in x8..x14 (scaled by lsl #1) and the eighth directly at x0.
; The expected schedule interleaves the first stores with the remaining
; load/abs work, reuses z0-z2 once their first results are stored, and uses
; movprfx where the abs destination differs from its source (z5 -> z1,
; z6 -> z2).
; NOTE(review): the other 2048-bit tests in this file (e.g. abs_v256i8,
; abs_v64i32) use vscale_range(16,0) and expect a single operation; presumably
; the (2,0) range here is deliberate to cover legalisation by splitting --
; confirm before changing.
1382 define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 {
1383 ; CHECK-LABEL: abs_v128i16:
1385 ; CHECK-NEXT: ptrue p0.h, vl16
1386 ; CHECK-NEXT: mov x8, #96 // =0x60
1387 ; CHECK-NEXT: mov x9, #112 // =0x70
1388 ; CHECK-NEXT: mov x10, #64 // =0x40
1389 ; CHECK-NEXT: mov x11, #80 // =0x50
1390 ; CHECK-NEXT: mov x12, #32 // =0x20
1391 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1392 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1]
1393 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1]
1394 ; CHECK-NEXT: mov x13, #48 // =0x30
1395 ; CHECK-NEXT: mov x14, #16 // =0x10
1396 ; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1]
1397 ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1]
1398 ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1]
1399 ; CHECK-NEXT: ld1h { z6.h }, p0/z, [x0, x14, lsl #1]
1400 ; CHECK-NEXT: abs z0.h, p0/m, z0.h
1401 ; CHECK-NEXT: abs z1.h, p0/m, z1.h
1402 ; CHECK-NEXT: abs z2.h, p0/m, z2.h
1403 ; CHECK-NEXT: abs z3.h, p0/m, z3.h
1404 ; CHECK-NEXT: abs z4.h, p0/m, z4.h
1405 ; CHECK-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1406 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1407 ; CHECK-NEXT: st1h { z1.h }, p0, [x0, x9, lsl #1]
1408 ; CHECK-NEXT: movprfx z1, z5
1409 ; CHECK-NEXT: abs z1.h, p0/m, z5.h
1410 ; CHECK-NEXT: st1h { z2.h }, p0, [x0, x10, lsl #1]
1411 ; CHECK-NEXT: movprfx z2, z6
1412 ; CHECK-NEXT: abs z2.h, p0/m, z6.h
1413 ; CHECK-NEXT: abs z0.h, p0/m, z0.h
1414 ; CHECK-NEXT: st1h { z3.h }, p0, [x0, x11, lsl #1]
1415 ; CHECK-NEXT: st1h { z4.h }, p0, [x0, x12, lsl #1]
1416 ; CHECK-NEXT: st1h { z1.h }, p0, [x0, x13, lsl #1]
1417 ; CHECK-NEXT: st1h { z2.h }, p0, [x0, x14, lsl #1]
1418 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1420 %op1 = load <128 x i16>, ptr %a
1421 %res = call <128 x i16> @llvm.abs.v128i16(<128 x i16> %op1, i1 false)
1422 store <128 x i16> %res, ptr %a
1426 ; Don't use SVE for 64-bit vectors.
; llvm.abs on a <2 x i32> argument (second operand i1 false); the checks
; expect one NEON abs on the .2s arrangement of v0.
1427 define <2 x i32> @abs_v2i32(<2 x i32> %op1) vscale_range(2,0) #0 {
1428 ; CHECK-LABEL: abs_v2i32:
1430 ; CHECK-NEXT: abs v0.2s, v0.2s
1432 %res = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %op1, i1 false)
1436 ; Don't use SVE for 128-bit vectors.
; llvm.abs on a <4 x i32> argument (second operand i1 false); the checks
; expect one NEON abs on the .4s arrangement of v0.
1437 define <4 x i32> @abs_v4i32(<4 x i32> %op1) vscale_range(2,0) #0 {
1438 ; CHECK-LABEL: abs_v4i32:
1440 ; CHECK-NEXT: abs v0.4s, v0.4s
1442 %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %op1, i1 false)
; llvm.abs on an <8 x i32> (256-bit) vector loaded from %a, result stored back
; to %a. With vscale_range(2,0) the checks expect one merging-predicated SVE
; abs on .s lanes under a vl8 predicate.
1446 define void @abs_v8i32(ptr %a) vscale_range(2,0) #0 {
1447 ; CHECK-LABEL: abs_v8i32:
1449 ; CHECK-NEXT: ptrue p0.s, vl8
1450 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1451 ; CHECK-NEXT: abs z0.s, p0/m, z0.s
1452 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1454 %op1 = load <8 x i32>, ptr %a
1455 %res = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %op1, i1 false)
1456 store <8 x i32> %res, ptr %a
; llvm.abs on a <16 x i32> (512-bit) vector loaded from %a, result stored back
; to %a. No vscale_range is given, so the expected code depends on the RUN
; line's -aarch64-sve-vector-bits-min value:
;  - VBITS_GE_256 run: split into two vl8 halves; the upper half is addressed
;    with x8 = 8 elements scaled by lsl #2.
;  - VBITS_GE_512 run: a single vl16 load/abs/store sequence.
1460 define void @abs_v16i32(ptr %a) #0 {
1461 ; VBITS_GE_256-LABEL: abs_v16i32:
1462 ; VBITS_GE_256: // %bb.0:
1463 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1464 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1465 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1466 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1467 ; VBITS_GE_256-NEXT: abs z0.s, p0/m, z0.s
1468 ; VBITS_GE_256-NEXT: abs z1.s, p0/m, z1.s
1469 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1470 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1471 ; VBITS_GE_256-NEXT: ret
1473 ; VBITS_GE_512-LABEL: abs_v16i32:
1474 ; VBITS_GE_512: // %bb.0:
1475 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1476 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1477 ; VBITS_GE_512-NEXT: abs z0.s, p0/m, z0.s
1478 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1479 ; VBITS_GE_512-NEXT: ret
1480 %op1 = load <16 x i32>, ptr %a
1481 %res = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %op1, i1 false)
1482 store <16 x i32> %res, ptr %a
; llvm.abs on a <32 x i32> (1024-bit) vector loaded from %a, result stored
; back to %a. vscale_range(8,0) sets the minimum vscale to 8; the checks
; expect one merging-predicated SVE abs on .s lanes under a vl32 predicate.
1486 define void @abs_v32i32(ptr %a) vscale_range(8,0) #0 {
1487 ; CHECK-LABEL: abs_v32i32:
1489 ; CHECK-NEXT: ptrue p0.s, vl32
1490 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1491 ; CHECK-NEXT: abs z0.s, p0/m, z0.s
1492 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1494 %op1 = load <32 x i32>, ptr %a
1495 %res = call <32 x i32> @llvm.abs.v32i32(<32 x i32> %op1, i1 false)
1496 store <32 x i32> %res, ptr %a
; llvm.abs on a <64 x i32> (2048-bit) vector loaded from %a, result stored
; back to %a. vscale_range(16,0) sets the minimum vscale to 16; the checks
; expect one merging-predicated SVE abs on .s lanes under a vl64 predicate.
1500 define void @abs_v64i32(ptr %a) vscale_range(16,0) #0 {
1501 ; CHECK-LABEL: abs_v64i32:
1503 ; CHECK-NEXT: ptrue p0.s, vl64
1504 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1505 ; CHECK-NEXT: abs z0.s, p0/m, z0.s
1506 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1508 %op1 = load <64 x i32>, ptr %a
1509 %res = call <64 x i32> @llvm.abs.v64i32(<64 x i32> %op1, i1 false)
1510 store <64 x i32> %res, ptr %a
1514 ; Don't use SVE for 64-bit vectors.
; llvm.abs on a single-element <1 x i64> argument (second operand i1 false);
; the checks expect the scalar D-register form `abs d0, d0`.
1515 define <1 x i64> @abs_v1i64(<1 x i64> %op1) vscale_range(2,0) #0 {
1516 ; CHECK-LABEL: abs_v1i64:
1518 ; CHECK-NEXT: abs d0, d0
1520 %res = call <1 x i64> @llvm.abs.v1i64(<1 x i64> %op1, i1 false)
1524 ; Don't use SVE for 128-bit vectors.
; llvm.abs on a <2 x i64> argument (second operand i1 false); the checks
; expect one NEON abs on the .2d arrangement of v0.
1525 define <2 x i64> @abs_v2i64(<2 x i64> %op1) vscale_range(2,0) #0 {
1526 ; CHECK-LABEL: abs_v2i64:
1528 ; CHECK-NEXT: abs v0.2d, v0.2d
1530 %res = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %op1, i1 false)
; llvm.abs on a <4 x i64> (256-bit) vector loaded from %a, result stored back
; to %a. With vscale_range(2,0) the checks expect one merging-predicated SVE
; abs on .d lanes under a vl4 predicate.
1534 define void @abs_v4i64(ptr %a) vscale_range(2,0) #0 {
1535 ; CHECK-LABEL: abs_v4i64:
1537 ; CHECK-NEXT: ptrue p0.d, vl4
1538 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1539 ; CHECK-NEXT: abs z0.d, p0/m, z0.d
1540 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1542 %op1 = load <4 x i64>, ptr %a
1543 %res = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %op1, i1 false)
1544 store <4 x i64> %res, ptr %a
; llvm.abs on an <8 x i64> (512-bit) vector loaded from %a, result stored back
; to %a. No vscale_range is given, so the expected code depends on the RUN
; line's -aarch64-sve-vector-bits-min value:
;  - VBITS_GE_256 run: split into two vl4 halves; the upper half is addressed
;    with x8 = 4 elements scaled by lsl #3.
;  - VBITS_GE_512 run: a single vl8 load/abs/store sequence.
1548 define void @abs_v8i64(ptr %a) #0 {
1549 ; VBITS_GE_256-LABEL: abs_v8i64:
1550 ; VBITS_GE_256: // %bb.0:
1551 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1552 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1553 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1554 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1555 ; VBITS_GE_256-NEXT: abs z0.d, p0/m, z0.d
1556 ; VBITS_GE_256-NEXT: abs z1.d, p0/m, z1.d
1557 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1558 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1559 ; VBITS_GE_256-NEXT: ret
1561 ; VBITS_GE_512-LABEL: abs_v8i64:
1562 ; VBITS_GE_512: // %bb.0:
1563 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1564 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1565 ; VBITS_GE_512-NEXT: abs z0.d, p0/m, z0.d
1566 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1567 ; VBITS_GE_512-NEXT: ret
1568 %op1 = load <8 x i64>, ptr %a
1569 %res = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %op1, i1 false)
1570 store <8 x i64> %res, ptr %a
; llvm.abs on a <16 x i64> (1024-bit) vector loaded from %a, result stored
; back to %a. vscale_range(8,0) sets the minimum vscale to 8; the checks
; expect one merging-predicated SVE abs on .d lanes under a vl16 predicate.
1574 define void @abs_v16i64(ptr %a) vscale_range(8,0) #0 {
1575 ; CHECK-LABEL: abs_v16i64:
1577 ; CHECK-NEXT: ptrue p0.d, vl16
1578 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1579 ; CHECK-NEXT: abs z0.d, p0/m, z0.d
1580 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1582 %op1 = load <16 x i64>, ptr %a
1583 %res = call <16 x i64> @llvm.abs.v16i64(<16 x i64> %op1, i1 false)
1584 store <16 x i64> %res, ptr %a
; llvm.abs on a <32 x i64> (2048-bit) vector loaded from %a, result stored
; back to %a. vscale_range(16,0) sets the minimum vscale to 16; the checks
; expect one merging-predicated SVE abs on .d lanes under a vl32 predicate.
1588 define void @abs_v32i64(ptr %a) vscale_range(16,0) #0 {
1589 ; CHECK-LABEL: abs_v32i64:
1591 ; CHECK-NEXT: ptrue p0.d, vl32
1592 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1593 ; CHECK-NEXT: abs z0.d, p0/m, z0.d
1594 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1596 %op1 = load <32 x i64>, ptr %a
1597 %res = call <32 x i64> @llvm.abs.v32i64(<32 x i64> %op1, i1 false)
1598 store <32 x i64> %res, ptr %a
; Declarations of the llvm.abs intrinsic, one per fixed-length vector type
; exercised by the abs_* tests above (i8, i16, i32 and i64 elements, from
; 64-bit up to 2048-bit vectors). The trailing i1 operand is the intrinsic's
; is_int_min_poison flag; every abs_* test above passes false for it.
1602 declare <8 x i8> @llvm.abs.v8i8(<8 x i8>, i1)
1603 declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1)
1604 declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1)
1605 declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1)
1606 declare <128 x i8> @llvm.abs.v128i8(<128 x i8>, i1)
1607 declare <256 x i8> @llvm.abs.v256i8(<256 x i8>, i1)
1608 declare <4 x i16> @llvm.abs.v4i16(<4 x i16>, i1)
1609 declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)
1610 declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1)
1611 declare <32 x i16> @llvm.abs.v32i16(<32 x i16>, i1)
1612 declare <64 x i16> @llvm.abs.v64i16(<64 x i16>, i1)
1613 declare <128 x i16> @llvm.abs.v128i16(<128 x i16>, i1)
1614 declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
1615 declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
1616 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1)
1617 declare <16 x i32> @llvm.abs.v16i32(<16 x i32>, i1)
1618 declare <32 x i32> @llvm.abs.v32i32(<32 x i32>, i1)
1619 declare <64 x i32> @llvm.abs.v64i32(<64 x i32>, i1)
1620 declare <1 x i64> @llvm.abs.v1i64(<1 x i64>, i1)
1621 declare <2 x i64> @llvm.abs.v2i64(<2 x i64>, i1)
1622 declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1)
1623 declare <8 x i64> @llvm.abs.v8i64(<8 x i64>, i1)
1624 declare <16 x i64> @llvm.abs.v16i64(<16 x i64>, i1)
1625 declare <32 x i64> @llvm.abs.v32i64(<32 x i64>, i1)
; Attribute group #0, referenced by the test functions in this file: enables
; the SVE target feature for code generation.
1627 attributes #0 = { "target-features"="+sve" }