1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
12 ; Don't use SVE for 64-bit vectors.
; Don't use SVE for 64-bit vectors.
define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fadd <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = fadd <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fadd_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %res = fadd <16 x half> %op1, %op2
  store <16 x half> %res, ptr %a
  ret void
}

define void @fadd_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fadd_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fadd z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadd_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %res = fadd <32 x half> %op1, %op2
  store <32 x half> %res, ptr %a
  ret void
}

define void @fadd_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %res = fadd <64 x half> %op1, %op2
  store <64 x half> %res, ptr %a
  ret void
}

define void @fadd_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %res = fadd <128 x half> %op1, %op2
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = fadd <2 x float> %op1, %op2
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = fadd <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fadd_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %res = fadd <8 x float> %op1, %op2
  store <8 x float> %res, ptr %a
  ret void
}

define void @fadd_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fadd_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fadd z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadd_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %res = fadd <16 x float> %op1, %op2
  store <16 x float> %res, ptr %a
  ret void
}

define void @fadd_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %res = fadd <32 x float> %op1, %op2
  store <32 x float> %res, ptr %a
  ret void
}

define void @fadd_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %res = fadd <64 x float> %op1, %op2
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd d0, d0, d1
; CHECK-NEXT:    ret
  %res = fadd <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fadd v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = fadd <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fadd_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fadd_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %res = fadd <4 x double> %op1, %op2
  store <4 x double> %res, ptr %a
  ret void
}

define void @fadd_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fadd_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fadd z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fadd_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %res = fadd <8 x double> %op1, %op2
  store <8 x double> %res, ptr %a
  ret void
}

define void @fadd_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fadd_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %res = fadd <16 x double> %op1, %op2
  store <16 x double> %res, ptr %a
  ret void
}

define void @fadd_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fadd_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fadd z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %res = fadd <32 x double> %op1, %op2
  store <32 x double> %res, ptr %a
  ret void
}
313 ; Don't use SVE for 64-bit vectors.
define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fdiv <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = fdiv <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fdiv_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %res = fdiv <16 x half> %op1, %op2
  store <16 x half> %res, ptr %a
  ret void
}

define void @fdiv_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fdiv_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fdiv z1.h, p0/m, z1.h, z2.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fdiv_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %res = fdiv <32 x half> %op1, %op2
  store <32 x half> %res, ptr %a
  ret void
}

define void @fdiv_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %res = fdiv <64 x half> %op1, %op2
  store <64 x half> %res, ptr %a
  ret void
}

define void @fdiv_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %res = fdiv <128 x half> %op1, %op2
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %res = fdiv <2 x float> %op1, %op2
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %res = fdiv <4 x float> %op1, %op2
  ret <4 x float> %res
}

define void @fdiv_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %res = fdiv <8 x float> %op1, %op2
  store <8 x float> %res, ptr %a
  ret void
}

define void @fdiv_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fdiv_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fdiv z1.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fdiv_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %res = fdiv <16 x float> %op1, %op2
  store <16 x float> %res, ptr %a
  ret void
}

define void @fdiv_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %res = fdiv <32 x float> %op1, %op2
  store <32 x float> %res, ptr %a
  ret void
}

define void @fdiv_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %res = fdiv <64 x float> %op1, %op2
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv d0, d0, d1
; CHECK-NEXT:    ret
  %res = fdiv <1 x double> %op1, %op2
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fdiv v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %res = fdiv <2 x double> %op1, %op2
  ret <2 x double> %res
}

define void @fdiv_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fdiv_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %res = fdiv <4 x double> %op1, %op2
  store <4 x double> %res, ptr %a
  ret void
}

define void @fdiv_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fdiv_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fdiv z1.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fdiv_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %res = fdiv <8 x double> %op1, %op2
  store <8 x double> %res, ptr %a
  ret void
}

define void @fdiv_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fdiv_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %res = fdiv <16 x double> %op1, %op2
  store <16 x double> %res, ptr %a
  ret void
}

define void @fdiv_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: fdiv_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fdiv z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %res = fdiv <32 x double> %op1, %op2
  store <32 x double> %res, ptr %a
  ret void
}
611 ; Don't use SVE for 64-bit vectors.
define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.4h, v1.4h, v0.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
  %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.8h, v1.8h, v0.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
  ret <8 x half> %res
}

define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %op3 = load <16 x half>, ptr %c
  %res = call <16 x half> @llvm.fma.v16f16(<16 x half> %op1, <16 x half> %op2, <16 x half> %op3)
  store <16 x half> %res, ptr %a
  ret void
}

define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z4.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1h { z5.h }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.h, p0/m, z3.h, z4.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1h { z2.h }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %op3 = load <32 x half>, ptr %c
  %res = call <32 x half> @llvm.fma.v32f16(<32 x half> %op1, <32 x half> %op2, <32 x half> %op3)
  store <32 x half> %res, ptr %a
  ret void
}

define void @fma_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %op3 = load <64 x half>, ptr %c
  %res = call <64 x half> @llvm.fma.v64f16(<64 x half> %op1, <64 x half> %op2, <64 x half> %op3)
  store <64 x half> %res, ptr %a
  ret void
}

define void @fma_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z2.h }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.h, p0/m, z1.h, z2.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %op3 = load <128 x half>, ptr %c
  %res = call <128 x half> @llvm.fma.v128f16(<128 x half> %op1, <128 x half> %op2, <128 x half> %op3)
  store <128 x half> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.2s, v1.2s, v0.2s
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
  %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.4s, v1.4s, v0.4s
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
  ret <4 x float> %res
}

define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %op3 = load <8 x float>, ptr %c
  %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %op1, <8 x float> %op2, <8 x float> %op3)
  store <8 x float> %res, ptr %a
  ret void
}

define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z4.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1w { z5.s }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.s, p0/m, z3.s, z4.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1w { z2.s }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %op3 = load <16 x float>, ptr %c
  %res = call <16 x float> @llvm.fma.v16f32(<16 x float> %op1, <16 x float> %op2, <16 x float> %op3)
  store <16 x float> %res, ptr %a
  ret void
}

define void @fma_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %op3 = load <32 x float>, ptr %c
  %res = call <32 x float> @llvm.fma.v32f32(<32 x float> %op1, <32 x float> %op2, <32 x float> %op3)
  store <32 x float> %res, ptr %a
  ret void
}

define void @fma_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %op3 = load <64 x float>, ptr %c
  %res = call <64 x float> @llvm.fma.v64f32(<64 x float> %op1, <64 x float> %op2, <64 x float> %op3)
  store <64 x float> %res, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmadd d0, d0, d1, d2
; CHECK-NEXT:    ret
  %res = call <1 x double> @llvm.fma.v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmla v2.2d, v1.2d, v0.2d
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
  ret <2 x double> %res
}

define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: fma_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %op3 = load <4 x double>, ptr %c
  %res = call <4 x double> @llvm.fma.v4f64(<4 x double> %op1, <4 x double> %op2, <4 x double> %op3)
  store <4 x double> %res, ptr %a
  ret void
}

define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
; VBITS_GE_256-LABEL: fma_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z4.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z5.d }, p0/z, [x2]
; VBITS_GE_256-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_256-NEXT:    movprfx z1, z5
; VBITS_GE_256-NEXT:    fmla z1.d, p0/m, z3.d, z4.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fma_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1d { z2.d }, p0/z, [x2]
; VBITS_GE_512-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %op3 = load <8 x double>, ptr %c
  %res = call <8 x double> @llvm.fma.v8f64(<8 x double> %op1, <8 x double> %op2, <8 x double> %op3)
  store <8 x double> %res, ptr %a
  ret void
}

define void @fma_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: fma_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %op3 = load <16 x double>, ptr %c
  %res = call <16 x double> @llvm.fma.v16f64(<16 x double> %op1, <16 x double> %op2, <16 x double> %op3)
  store <16 x double> %res, ptr %a
  ret void
}

define void @fma_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: fma_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [x2]
; CHECK-NEXT:    fmad z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %op3 = load <32 x double>, ptr %c
  %res = call <32 x double> @llvm.fma.v32f64(<32 x double> %op1, <32 x double> %op2, <32 x double> %op3)
  store <32 x double> %res, ptr %a
  ret void
}
947 ; Don't use SVE for 64-bit vectors.
define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %res = fmul <4 x half> %op1, %op2
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmul v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %res = fmul <8 x half> %op1, %op2
  ret <8 x half> %res
}

define void @fmul_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: fmul_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %res = fmul <16 x half> %op1, %op2
  store <16 x half> %res, ptr %a
  ret void
}

define void @fmul_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: fmul_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT:    movprfx z1, z2
; VBITS_GE_256-NEXT:    fmul z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: fmul_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %res = fmul <32 x half> %op1, %op2
  store <32 x half> %res, ptr %a
  ret void
}

define void @fmul_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: fmul_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %res = fmul <64 x half> %op1, %op2
  store <64 x half> %res, ptr %a
  ret void
}
1030 define void @fmul_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1031 ; CHECK-LABEL: fmul_v128f16:
1033 ; CHECK-NEXT: ptrue p0.h, vl128
1034 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1035 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1036 ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
1037 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1039 %op1 = load <128 x half>, ptr %a
1040 %op2 = load <128 x half>, ptr %b
1041 %res = fmul <128 x half> %op1, %op2
1042 store <128 x half> %res, ptr %a
1046 ; Don't use SVE for 64-bit vectors.
1047 define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
1048 ; CHECK-LABEL: fmul_v2f32:
1050 ; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
1052 %res = fmul <2 x float> %op1, %op2
1053 ret <2 x float> %res
1056 ; Don't use SVE for 128-bit vectors.
1057 define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
1058 ; CHECK-LABEL: fmul_v4f32:
1060 ; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
1062 %res = fmul <4 x float> %op1, %op2
1063 ret <4 x float> %res
1066 define void @fmul_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1067 ; CHECK-LABEL: fmul_v8f32:
1069 ; CHECK-NEXT: ptrue p0.s, vl8
1070 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1071 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1072 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1073 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1075 %op1 = load <8 x float>, ptr %a
1076 %op2 = load <8 x float>, ptr %b
1077 %res = fmul <8 x float> %op1, %op2
1078 store <8 x float> %res, ptr %a
1082 define void @fmul_v16f32(ptr %a, ptr %b) #0 {
1083 ; VBITS_GE_256-LABEL: fmul_v16f32:
1084 ; VBITS_GE_256: // %bb.0:
1085 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1086 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1087 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1088 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
1089 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
1090 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
1091 ; VBITS_GE_256-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1092 ; VBITS_GE_256-NEXT: movprfx z1, z2
1093 ; VBITS_GE_256-NEXT: fmul z1.s, p0/m, z1.s, z3.s
1094 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1095 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1096 ; VBITS_GE_256-NEXT: ret
1098 ; VBITS_GE_512-LABEL: fmul_v16f32:
1099 ; VBITS_GE_512: // %bb.0:
1100 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1101 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1102 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
1103 ; VBITS_GE_512-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1104 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1105 ; VBITS_GE_512-NEXT: ret
1106 %op1 = load <16 x float>, ptr %a
1107 %op2 = load <16 x float>, ptr %b
1108 %res = fmul <16 x float> %op1, %op2
1109 store <16 x float> %res, ptr %a
1113 define void @fmul_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1114 ; CHECK-LABEL: fmul_v32f32:
1116 ; CHECK-NEXT: ptrue p0.s, vl32
1117 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1118 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1119 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1120 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1122 %op1 = load <32 x float>, ptr %a
1123 %op2 = load <32 x float>, ptr %b
1124 %res = fmul <32 x float> %op1, %op2
1125 store <32 x float> %res, ptr %a
1129 define void @fmul_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1130 ; CHECK-LABEL: fmul_v64f32:
1132 ; CHECK-NEXT: ptrue p0.s, vl64
1133 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1134 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1135 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1136 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1138 %op1 = load <64 x float>, ptr %a
1139 %op2 = load <64 x float>, ptr %b
1140 %res = fmul <64 x float> %op1, %op2
1141 store <64 x float> %res, ptr %a
1145 ; Don't use SVE for 64-bit vectors.
1146 define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
1147 ; CHECK-LABEL: fmul_v1f64:
1149 ; CHECK-NEXT: fmul d0, d0, d1
1151 %res = fmul <1 x double> %op1, %op2
1152 ret <1 x double> %res
1155 ; Don't use SVE for 128-bit vectors.
1156 define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
1157 ; CHECK-LABEL: fmul_v2f64:
1159 ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
1161 %res = fmul <2 x double> %op1, %op2
1162 ret <2 x double> %res
1165 define void @fmul_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1166 ; CHECK-LABEL: fmul_v4f64:
1168 ; CHECK-NEXT: ptrue p0.d, vl4
1169 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1170 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1171 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1172 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1174 %op1 = load <4 x double>, ptr %a
1175 %op2 = load <4 x double>, ptr %b
1176 %res = fmul <4 x double> %op1, %op2
1177 store <4 x double> %res, ptr %a
1181 define void @fmul_v8f64(ptr %a, ptr %b) #0 {
1182 ; VBITS_GE_256-LABEL: fmul_v8f64:
1183 ; VBITS_GE_256: // %bb.0:
1184 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1185 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1186 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1187 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
1188 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
1189 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
1190 ; VBITS_GE_256-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1191 ; VBITS_GE_256-NEXT: movprfx z1, z2
1192 ; VBITS_GE_256-NEXT: fmul z1.d, p0/m, z1.d, z3.d
1193 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1194 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1195 ; VBITS_GE_256-NEXT: ret
1197 ; VBITS_GE_512-LABEL: fmul_v8f64:
1198 ; VBITS_GE_512: // %bb.0:
1199 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1200 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1201 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
1202 ; VBITS_GE_512-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1203 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1204 ; VBITS_GE_512-NEXT: ret
1205 %op1 = load <8 x double>, ptr %a
1206 %op2 = load <8 x double>, ptr %b
1207 %res = fmul <8 x double> %op1, %op2
1208 store <8 x double> %res, ptr %a
1212 define void @fmul_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1213 ; CHECK-LABEL: fmul_v16f64:
1215 ; CHECK-NEXT: ptrue p0.d, vl16
1216 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1217 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1218 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1219 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1221 %op1 = load <16 x double>, ptr %a
1222 %op2 = load <16 x double>, ptr %b
1223 %res = fmul <16 x double> %op1, %op2
1224 store <16 x double> %res, ptr %a
1228 define void @fmul_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1229 ; CHECK-LABEL: fmul_v32f64:
1231 ; CHECK-NEXT: ptrue p0.d, vl32
1232 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1233 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1234 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1235 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1237 %op1 = load <32 x double>, ptr %a
1238 %op2 = load <32 x double>, ptr %b
1239 %res = fmul <32 x double> %op1, %op2
1240 store <32 x double> %res, ptr %a
;
; FNEG (unary: loads one operand, negates in place)
;
1248 ; Don't use SVE for 64-bit vectors.
1249 define <4 x half> @fneg_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
1250 ; CHECK-LABEL: fneg_v4f16:
1252 ; CHECK-NEXT: fneg v0.4h, v0.4h
1254 %res = fneg <4 x half> %op
1258 ; Don't use SVE for 128-bit vectors.
1259 define <8 x half> @fneg_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
1260 ; CHECK-LABEL: fneg_v8f16:
1262 ; CHECK-NEXT: fneg v0.8h, v0.8h
1264 %res = fneg <8 x half> %op
; Fits exactly in one SVE register at the minimum (256-bit) VL.
; NOTE(review): %b is unused here — presumably kept for signature symmetry with
; the binary-op tests; confirm before relying on it.
1268 define void @fneg_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
1269 ; CHECK-LABEL: fneg_v16f16:
1271 ; CHECK-NEXT: ptrue p0.h, vl16
1272 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1273 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h
1274 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1276 %op = load <16 x half>, ptr %a
1277 %res = fneg <16 x half> %op
1278 store <16 x half> %res, ptr %a
; No vscale_range: two halves at 256-bit VL, single op at >=512-bit.
1282 define void @fneg_v32f16(ptr %a) #0 {
1283 ; VBITS_GE_256-LABEL: fneg_v32f16:
1284 ; VBITS_GE_256: // %bb.0:
1285 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1286 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1287 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1288 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
1289 ; VBITS_GE_256-NEXT: fneg z0.h, p0/m, z0.h
1290 ; VBITS_GE_256-NEXT: fneg z1.h, p0/m, z1.h
1291 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1292 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
1293 ; VBITS_GE_256-NEXT: ret
1295 ; VBITS_GE_512-LABEL: fneg_v32f16:
1296 ; VBITS_GE_512: // %bb.0:
1297 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1298 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1299 ; VBITS_GE_512-NEXT: fneg z0.h, p0/m, z0.h
1300 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
1301 ; VBITS_GE_512-NEXT: ret
1302 %op = load <32 x half>, ptr %a
1303 %res = fneg <32 x half> %op
1304 store <32 x half> %res, ptr %a
; vscale_range(8,0) => VL >= 1024-bit: one full-width op.
1308 define void @fneg_v64f16(ptr %a) vscale_range(8,0) #0 {
1309 ; CHECK-LABEL: fneg_v64f16:
1311 ; CHECK-NEXT: ptrue p0.h, vl64
1312 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1313 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h
1314 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1316 %op = load <64 x half>, ptr %a
1317 %res = fneg <64 x half> %op
1318 store <64 x half> %res, ptr %a
; vscale_range(16,0) => VL >= 2048-bit: one full-width op.
1322 define void @fneg_v128f16(ptr %a) vscale_range(16,0) #0 {
1323 ; CHECK-LABEL: fneg_v128f16:
1325 ; CHECK-NEXT: ptrue p0.h, vl128
1326 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1327 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h
1328 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1330 %op = load <128 x half>, ptr %a
1331 %res = fneg <128 x half> %op
1332 store <128 x half> %res, ptr %a
1336 ; Don't use SVE for 64-bit vectors.
1337 define <2 x float> @fneg_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
1338 ; CHECK-LABEL: fneg_v2f32:
1340 ; CHECK-NEXT: fneg v0.2s, v0.2s
1342 %res = fneg <2 x float> %op
1343 ret <2 x float> %res
1346 ; Don't use SVE for 128-bit vectors.
1347 define <4 x float> @fneg_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
1348 ; CHECK-LABEL: fneg_v4f32:
1350 ; CHECK-NEXT: fneg v0.4s, v0.4s
1352 %res = fneg <4 x float> %op
1353 ret <4 x float> %res
; Fits exactly in one SVE register at the minimum (256-bit) VL.
1356 define void @fneg_v8f32(ptr %a) vscale_range(2,0) #0 {
1357 ; CHECK-LABEL: fneg_v8f32:
1359 ; CHECK-NEXT: ptrue p0.s, vl8
1360 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1361 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s
1362 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1364 %op = load <8 x float>, ptr %a
1365 %res = fneg <8 x float> %op
1366 store <8 x float> %res, ptr %a
; No vscale_range: two halves at 256-bit VL, single op at >=512-bit.
1370 define void @fneg_v16f32(ptr %a) #0 {
1371 ; VBITS_GE_256-LABEL: fneg_v16f32:
1372 ; VBITS_GE_256: // %bb.0:
1373 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1374 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1375 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1376 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1377 ; VBITS_GE_256-NEXT: fneg z0.s, p0/m, z0.s
1378 ; VBITS_GE_256-NEXT: fneg z1.s, p0/m, z1.s
1379 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1380 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1381 ; VBITS_GE_256-NEXT: ret
1383 ; VBITS_GE_512-LABEL: fneg_v16f32:
1384 ; VBITS_GE_512: // %bb.0:
1385 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1386 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1387 ; VBITS_GE_512-NEXT: fneg z0.s, p0/m, z0.s
1388 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1389 ; VBITS_GE_512-NEXT: ret
1390 %op = load <16 x float>, ptr %a
1391 %res = fneg <16 x float> %op
1392 store <16 x float> %res, ptr %a
; vscale_range(8,0) => VL >= 1024-bit: one full-width op.
1396 define void @fneg_v32f32(ptr %a) vscale_range(8,0) #0 {
1397 ; CHECK-LABEL: fneg_v32f32:
1399 ; CHECK-NEXT: ptrue p0.s, vl32
1400 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1401 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s
1402 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1404 %op = load <32 x float>, ptr %a
1405 %res = fneg <32 x float> %op
1406 store <32 x float> %res, ptr %a
; vscale_range(16,0) => VL >= 2048-bit: one full-width op.
1410 define void @fneg_v64f32(ptr %a) vscale_range(16,0) #0 {
1411 ; CHECK-LABEL: fneg_v64f32:
1413 ; CHECK-NEXT: ptrue p0.s, vl64
1414 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1415 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s
1416 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1418 %op = load <64 x float>, ptr %a
1419 %res = fneg <64 x float> %op
1420 store <64 x float> %res, ptr %a
1424 ; Don't use SVE for 64-bit vectors.
1425 define <1 x double> @fneg_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
1426 ; CHECK-LABEL: fneg_v1f64:
1428 ; CHECK-NEXT: fneg d0, d0
1430 %res = fneg <1 x double> %op
1431 ret <1 x double> %res
1434 ; Don't use SVE for 128-bit vectors.
1435 define <2 x double> @fneg_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
1436 ; CHECK-LABEL: fneg_v2f64:
1438 ; CHECK-NEXT: fneg v0.2d, v0.2d
1440 %res = fneg <2 x double> %op
1441 ret <2 x double> %res
; Fits exactly in one SVE register at the minimum (256-bit) VL.
1444 define void @fneg_v4f64(ptr %a) vscale_range(2,0) #0 {
1445 ; CHECK-LABEL: fneg_v4f64:
1447 ; CHECK-NEXT: ptrue p0.d, vl4
1448 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1449 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d
1450 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1452 %op = load <4 x double>, ptr %a
1453 %res = fneg <4 x double> %op
1454 store <4 x double> %res, ptr %a
; No vscale_range: two halves at 256-bit VL, single op at >=512-bit.
1458 define void @fneg_v8f64(ptr %a) #0 {
1459 ; VBITS_GE_256-LABEL: fneg_v8f64:
1460 ; VBITS_GE_256: // %bb.0:
1461 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1462 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1463 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1464 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1465 ; VBITS_GE_256-NEXT: fneg z0.d, p0/m, z0.d
1466 ; VBITS_GE_256-NEXT: fneg z1.d, p0/m, z1.d
1467 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1468 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1469 ; VBITS_GE_256-NEXT: ret
1471 ; VBITS_GE_512-LABEL: fneg_v8f64:
1472 ; VBITS_GE_512: // %bb.0:
1473 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1474 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1475 ; VBITS_GE_512-NEXT: fneg z0.d, p0/m, z0.d
1476 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1477 ; VBITS_GE_512-NEXT: ret
1478 %op = load <8 x double>, ptr %a
1479 %res = fneg <8 x double> %op
1480 store <8 x double> %res, ptr %a
; vscale_range(8,0) => VL >= 1024-bit: one full-width op.
1484 define void @fneg_v16f64(ptr %a) vscale_range(8,0) #0 {
1485 ; CHECK-LABEL: fneg_v16f64:
1487 ; CHECK-NEXT: ptrue p0.d, vl16
1488 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1489 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d
1490 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1492 %op = load <16 x double>, ptr %a
1493 %res = fneg <16 x double> %op
1494 store <16 x double> %res, ptr %a
; vscale_range(16,0) => VL >= 2048-bit: one full-width op.
1498 define void @fneg_v32f64(ptr %a) vscale_range(16,0) #0 {
1499 ; CHECK-LABEL: fneg_v32f64:
1501 ; CHECK-NEXT: ptrue p0.d, vl32
1502 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1503 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d
1504 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1506 %op = load <32 x double>, ptr %a
1507 %res = fneg <32 x double> %op
1508 store <32 x double> %res, ptr %a
;
; FSQRT (via @llvm.sqrt.* intrinsics)
;
1516 ; Don't use SVE for 64-bit vectors.
1517 define <4 x half> @fsqrt_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
1518 ; CHECK-LABEL: fsqrt_v4f16:
1520 ; CHECK-NEXT: fsqrt v0.4h, v0.4h
1522 %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
1526 ; Don't use SVE for 128-bit vectors.
1527 define <8 x half> @fsqrt_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
1528 ; CHECK-LABEL: fsqrt_v8f16:
1530 ; CHECK-NEXT: fsqrt v0.8h, v0.8h
1532 %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
; Fits exactly in one SVE register at the minimum (256-bit) VL.
; NOTE(review): %b is unused — presumably kept for signature symmetry; confirm.
1536 define void @fsqrt_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
1537 ; CHECK-LABEL: fsqrt_v16f16:
1539 ; CHECK-NEXT: ptrue p0.h, vl16
1540 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1541 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
1542 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1544 %op = load <16 x half>, ptr %a
1545 %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
1546 store <16 x half> %res, ptr %a
; No vscale_range: two halves at 256-bit VL, single op at >=512-bit.
1550 define void @fsqrt_v32f16(ptr %a) #0 {
1551 ; VBITS_GE_256-LABEL: fsqrt_v32f16:
1552 ; VBITS_GE_256: // %bb.0:
1553 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1554 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1555 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1556 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
1557 ; VBITS_GE_256-NEXT: fsqrt z0.h, p0/m, z0.h
1558 ; VBITS_GE_256-NEXT: fsqrt z1.h, p0/m, z1.h
1559 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1560 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
1561 ; VBITS_GE_256-NEXT: ret
1563 ; VBITS_GE_512-LABEL: fsqrt_v32f16:
1564 ; VBITS_GE_512: // %bb.0:
1565 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1566 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1567 ; VBITS_GE_512-NEXT: fsqrt z0.h, p0/m, z0.h
1568 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
1569 ; VBITS_GE_512-NEXT: ret
1570 %op = load <32 x half>, ptr %a
1571 %res = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %op)
1572 store <32 x half> %res, ptr %a
; vscale_range(8,0) => VL >= 1024-bit: one full-width op.
1576 define void @fsqrt_v64f16(ptr %a) vscale_range(8,0) #0 {
1577 ; CHECK-LABEL: fsqrt_v64f16:
1579 ; CHECK-NEXT: ptrue p0.h, vl64
1580 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1581 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
1582 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1584 %op = load <64 x half>, ptr %a
1585 %res = call <64 x half> @llvm.sqrt.v64f16(<64 x half> %op)
1586 store <64 x half> %res, ptr %a
; vscale_range(16,0) => VL >= 2048-bit: one full-width op.
1590 define void @fsqrt_v128f16(ptr %a) vscale_range(16,0) #0 {
1591 ; CHECK-LABEL: fsqrt_v128f16:
1593 ; CHECK-NEXT: ptrue p0.h, vl128
1594 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1595 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
1596 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1598 %op = load <128 x half>, ptr %a
1599 %res = call <128 x half> @llvm.sqrt.v128f16(<128 x half> %op)
1600 store <128 x half> %res, ptr %a
1604 ; Don't use SVE for 64-bit vectors.
1605 define <2 x float> @fsqrt_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
1606 ; CHECK-LABEL: fsqrt_v2f32:
1608 ; CHECK-NEXT: fsqrt v0.2s, v0.2s
1610 %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
1611 ret <2 x float> %res
1614 ; Don't use SVE for 128-bit vectors.
1615 define <4 x float> @fsqrt_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
1616 ; CHECK-LABEL: fsqrt_v4f32:
1618 ; CHECK-NEXT: fsqrt v0.4s, v0.4s
1620 %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
1621 ret <4 x float> %res
; Fits exactly in one SVE register at the minimum (256-bit) VL.
1624 define void @fsqrt_v8f32(ptr %a) vscale_range(2,0) #0 {
1625 ; CHECK-LABEL: fsqrt_v8f32:
1627 ; CHECK-NEXT: ptrue p0.s, vl8
1628 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1629 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
1630 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1632 %op = load <8 x float>, ptr %a
1633 %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
1634 store <8 x float> %res, ptr %a
; No vscale_range: two halves at 256-bit VL, single op at >=512-bit.
1638 define void @fsqrt_v16f32(ptr %a) #0 {
1639 ; VBITS_GE_256-LABEL: fsqrt_v16f32:
1640 ; VBITS_GE_256: // %bb.0:
1641 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1642 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1643 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1644 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1645 ; VBITS_GE_256-NEXT: fsqrt z0.s, p0/m, z0.s
1646 ; VBITS_GE_256-NEXT: fsqrt z1.s, p0/m, z1.s
1647 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1648 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1649 ; VBITS_GE_256-NEXT: ret
1651 ; VBITS_GE_512-LABEL: fsqrt_v16f32:
1652 ; VBITS_GE_512: // %bb.0:
1653 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1654 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1655 ; VBITS_GE_512-NEXT: fsqrt z0.s, p0/m, z0.s
1656 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1657 ; VBITS_GE_512-NEXT: ret
1658 %op = load <16 x float>, ptr %a
1659 %res = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %op)
1660 store <16 x float> %res, ptr %a
; vscale_range(8,0) => VL >= 1024-bit: one full-width op.
1664 define void @fsqrt_v32f32(ptr %a) vscale_range(8,0) #0 {
1665 ; CHECK-LABEL: fsqrt_v32f32:
1667 ; CHECK-NEXT: ptrue p0.s, vl32
1668 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1669 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
1670 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1672 %op = load <32 x float>, ptr %a
1673 %res = call <32 x float> @llvm.sqrt.v32f32(<32 x float> %op)
1674 store <32 x float> %res, ptr %a
; vscale_range(16,0) => VL >= 2048-bit: one full-width op.
1678 define void @fsqrt_v64f32(ptr %a) vscale_range(16,0) #0 {
1679 ; CHECK-LABEL: fsqrt_v64f32:
1681 ; CHECK-NEXT: ptrue p0.s, vl64
1682 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1683 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
1684 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1686 %op = load <64 x float>, ptr %a
1687 %res = call <64 x float> @llvm.sqrt.v64f32(<64 x float> %op)
1688 store <64 x float> %res, ptr %a
1692 ; Don't use SVE for 64-bit vectors.
1693 define <1 x double> @fsqrt_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
1694 ; CHECK-LABEL: fsqrt_v1f64:
1696 ; CHECK-NEXT: fsqrt d0, d0
1698 %res = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %op)
1699 ret <1 x double> %res
1702 ; Don't use SVE for 128-bit vectors.
1703 define <2 x double> @fsqrt_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
1704 ; CHECK-LABEL: fsqrt_v2f64:
1706 ; CHECK-NEXT: fsqrt v0.2d, v0.2d
1708 %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
1709 ret <2 x double> %res
; Fits exactly in one SVE register at the minimum (256-bit) VL.
1712 define void @fsqrt_v4f64(ptr %a) vscale_range(2,0) #0 {
1713 ; CHECK-LABEL: fsqrt_v4f64:
1715 ; CHECK-NEXT: ptrue p0.d, vl4
1716 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1717 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
1718 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1720 %op = load <4 x double>, ptr %a
1721 %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
1722 store <4 x double> %res, ptr %a
; No vscale_range: two halves at 256-bit VL, single op at >=512-bit.
1726 define void @fsqrt_v8f64(ptr %a) #0 {
1727 ; VBITS_GE_256-LABEL: fsqrt_v8f64:
1728 ; VBITS_GE_256: // %bb.0:
1729 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1730 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1731 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1732 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1733 ; VBITS_GE_256-NEXT: fsqrt z0.d, p0/m, z0.d
1734 ; VBITS_GE_256-NEXT: fsqrt z1.d, p0/m, z1.d
1735 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1736 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1737 ; VBITS_GE_256-NEXT: ret
1739 ; VBITS_GE_512-LABEL: fsqrt_v8f64:
1740 ; VBITS_GE_512: // %bb.0:
1741 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1742 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1743 ; VBITS_GE_512-NEXT: fsqrt z0.d, p0/m, z0.d
1744 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1745 ; VBITS_GE_512-NEXT: ret
1746 %op = load <8 x double>, ptr %a
1747 %res = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %op)
1748 store <8 x double> %res, ptr %a
; vscale_range(8,0) => VL >= 1024-bit: one full-width op.
1752 define void @fsqrt_v16f64(ptr %a) vscale_range(8,0) #0 {
1753 ; CHECK-LABEL: fsqrt_v16f64:
1755 ; CHECK-NEXT: ptrue p0.d, vl16
1756 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1757 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
1758 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1760 %op = load <16 x double>, ptr %a
1761 %res = call <16 x double> @llvm.sqrt.v16f64(<16 x double> %op)
1762 store <16 x double> %res, ptr %a
; vscale_range(16,0) => VL >= 2048-bit: one full-width op.
1766 define void @fsqrt_v32f64(ptr %a) vscale_range(16,0) #0 {
1767 ; CHECK-LABEL: fsqrt_v32f64:
1769 ; CHECK-NEXT: ptrue p0.d, vl32
1770 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1771 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
1772 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1774 %op = load <32 x double>, ptr %a
1775 %res = call <32 x double> @llvm.sqrt.v32f64(<32 x double> %op)
1776 store <32 x double> %res, ptr %a
1784 ; Don't use SVE for 64-bit vectors.
1785 define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
1786 ; CHECK-LABEL: fsub_v4f16:
1788 ; CHECK-NEXT: fsub v0.4h, v0.4h, v1.4h
1790 %res = fsub <4 x half> %op1, %op2
1794 ; Don't use SVE for 128-bit vectors.
1795 define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
1796 ; CHECK-LABEL: fsub_v8f16:
1798 ; CHECK-NEXT: fsub v0.8h, v0.8h, v1.8h
1800 %res = fsub <8 x half> %op1, %op2
1804 define void @fsub_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
1805 ; CHECK-LABEL: fsub_v16f16:
1807 ; CHECK-NEXT: ptrue p0.h, vl16
1808 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1809 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1810 ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1811 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1813 %op1 = load <16 x half>, ptr %a
1814 %op2 = load <16 x half>, ptr %b
1815 %res = fsub <16 x half> %op1, %op2
1816 store <16 x half> %res, ptr %a
1820 define void @fsub_v32f16(ptr %a, ptr %b) #0 {
1821 ; VBITS_GE_256-LABEL: fsub_v32f16:
1822 ; VBITS_GE_256: // %bb.0:
1823 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1824 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1825 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1826 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
1827 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
1828 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
1829 ; VBITS_GE_256-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1830 ; VBITS_GE_256-NEXT: movprfx z1, z2
1831 ; VBITS_GE_256-NEXT: fsub z1.h, p0/m, z1.h, z3.h
1832 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1833 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
1834 ; VBITS_GE_256-NEXT: ret
1836 ; VBITS_GE_512-LABEL: fsub_v32f16:
1837 ; VBITS_GE_512: // %bb.0:
1838 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1839 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1840 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
1841 ; VBITS_GE_512-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1842 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
1843 ; VBITS_GE_512-NEXT: ret
1844 %op1 = load <32 x half>, ptr %a
1845 %op2 = load <32 x half>, ptr %b
1846 %res = fsub <32 x half> %op1, %op2
1847 store <32 x half> %res, ptr %a
1851 define void @fsub_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1852 ; CHECK-LABEL: fsub_v64f16:
1854 ; CHECK-NEXT: ptrue p0.h, vl64
1855 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1856 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1857 ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1858 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1860 %op1 = load <64 x half>, ptr %a
1861 %op2 = load <64 x half>, ptr %b
1862 %res = fsub <64 x half> %op1, %op2
1863 store <64 x half> %res, ptr %a
1867 define void @fsub_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1868 ; CHECK-LABEL: fsub_v128f16:
1870 ; CHECK-NEXT: ptrue p0.h, vl128
1871 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1872 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1873 ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1874 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1876 %op1 = load <128 x half>, ptr %a
1877 %op2 = load <128 x half>, ptr %b
1878 %res = fsub <128 x half> %op1, %op2
1879 store <128 x half> %res, ptr %a
1883 ; Don't use SVE for 64-bit vectors.
1884 define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
1885 ; CHECK-LABEL: fsub_v2f32:
1887 ; CHECK-NEXT: fsub v0.2s, v0.2s, v1.2s
1889 %res = fsub <2 x float> %op1, %op2
1890 ret <2 x float> %res
1893 ; Don't use SVE for 128-bit vectors.
1894 define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
1895 ; CHECK-LABEL: fsub_v4f32:
1897 ; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s
1899 %res = fsub <4 x float> %op1, %op2
1900 ret <4 x float> %res
1903 define void @fsub_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1904 ; CHECK-LABEL: fsub_v8f32:
1906 ; CHECK-NEXT: ptrue p0.s, vl8
1907 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1908 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1909 ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1910 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1912 %op1 = load <8 x float>, ptr %a
1913 %op2 = load <8 x float>, ptr %b
1914 %res = fsub <8 x float> %op1, %op2
1915 store <8 x float> %res, ptr %a
1919 define void @fsub_v16f32(ptr %a, ptr %b) #0 {
1920 ; VBITS_GE_256-LABEL: fsub_v16f32:
1921 ; VBITS_GE_256: // %bb.0:
1922 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1923 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1924 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1925 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
1926 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
1927 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
1928 ; VBITS_GE_256-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1929 ; VBITS_GE_256-NEXT: movprfx z1, z2
1930 ; VBITS_GE_256-NEXT: fsub z1.s, p0/m, z1.s, z3.s
1931 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1932 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1933 ; VBITS_GE_256-NEXT: ret
1935 ; VBITS_GE_512-LABEL: fsub_v16f32:
1936 ; VBITS_GE_512: // %bb.0:
1937 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1938 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1939 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
1940 ; VBITS_GE_512-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1941 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1942 ; VBITS_GE_512-NEXT: ret
1943 %op1 = load <16 x float>, ptr %a
1944 %op2 = load <16 x float>, ptr %b
1945 %res = fsub <16 x float> %op1, %op2
1946 store <16 x float> %res, ptr %a
1950 define void @fsub_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1951 ; CHECK-LABEL: fsub_v32f32:
1953 ; CHECK-NEXT: ptrue p0.s, vl32
1954 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1955 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1956 ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1957 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1959 %op1 = load <32 x float>, ptr %a
1960 %op2 = load <32 x float>, ptr %b
1961 %res = fsub <32 x float> %op1, %op2
1962 store <32 x float> %res, ptr %a
1966 define void @fsub_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1967 ; CHECK-LABEL: fsub_v64f32:
1969 ; CHECK-NEXT: ptrue p0.s, vl64
1970 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1971 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1972 ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1973 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1975 %op1 = load <64 x float>, ptr %a
1976 %op2 = load <64 x float>, ptr %b
1977 %res = fsub <64 x float> %op1, %op2
1978 store <64 x float> %res, ptr %a
1982 ; Don't use SVE for 64-bit vectors.
1983 define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
1984 ; CHECK-LABEL: fsub_v1f64:
1986 ; CHECK-NEXT: fsub d0, d0, d1
1988 %res = fsub <1 x double> %op1, %op2
1989 ret <1 x double> %res
1992 ; Don't use SVE for 128-bit vectors.
1993 define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
1994 ; CHECK-LABEL: fsub_v2f64:
1996 ; CHECK-NEXT: fsub v0.2d, v0.2d, v1.2d
1998 %res = fsub <2 x double> %op1, %op2
1999 ret <2 x double> %res
2002 define void @fsub_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
2003 ; CHECK-LABEL: fsub_v4f64:
2005 ; CHECK-NEXT: ptrue p0.d, vl4
2006 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2007 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
2008 ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2009 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2011 %op1 = load <4 x double>, ptr %a
2012 %op2 = load <4 x double>, ptr %b
2013 %res = fsub <4 x double> %op1, %op2
2014 store <4 x double> %res, ptr %a
; 512-bit case with no vscale_range: at exactly 256-bit SVE the <8 x double>
; is split across two vl4 halves (note movprfx to preserve z2 as the
; destructive-op destination); at >=512 bits it fits a single vl8 register.
2018 define void @fsub_v8f64(ptr %a, ptr %b) #0 {
2019 ; VBITS_GE_256-LABEL: fsub_v8f64:
2020 ; VBITS_GE_256: // %bb.0:
2021 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
2022 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
2023 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
2024 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
2025 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
2026 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
2027 ; VBITS_GE_256-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2028 ; VBITS_GE_256-NEXT: movprfx z1, z2
2029 ; VBITS_GE_256-NEXT: fsub z1.d, p0/m, z1.d, z3.d
2030 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
2031 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
2032 ; VBITS_GE_256-NEXT: ret
2034 ; VBITS_GE_512-LABEL: fsub_v8f64:
2035 ; VBITS_GE_512: // %bb.0:
2036 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
2037 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
2038 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
2039 ; VBITS_GE_512-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2040 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
2041 ; VBITS_GE_512-NEXT: ret
2042 %op1 = load <8 x double>, ptr %a
2043 %op2 = load <8 x double>, ptr %b
2044 %res = fsub <8 x double> %op1, %op2
2045 store <8 x double> %res, ptr %a
; 1024-bit case: vscale_range(8,0) guarantees the <16 x double> fits one
; vl16 SVE register, so a single predicated fsub suffices.
2049 define void @fsub_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
2050 ; CHECK-LABEL: fsub_v16f64:
2052 ; CHECK-NEXT: ptrue p0.d, vl16
2053 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2054 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
2055 ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2056 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2058 %op1 = load <16 x double>, ptr %a
2059 %op2 = load <16 x double>, ptr %b
2060 %res = fsub <16 x double> %op1, %op2
2061 store <16 x double> %res, ptr %a
; 2048-bit case: vscale_range(16,0) lets the whole <32 x double> fsub lower
; to one predicated SVE operation on a vl32 vector.
2065 define void @fsub_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
2066 ; CHECK-LABEL: fsub_v32f64:
2068 ; CHECK-NEXT: ptrue p0.d, vl32
2069 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2070 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
2071 ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2072 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2074 %op1 = load <32 x double>, ptr %a
2075 %op2 = load <32 x double>, ptr %b
2076 %res = fsub <32 x double> %op1, %op2
2077 store <32 x double> %res, ptr %a
2085 ; Don't use SVE for 64-bit vectors.
; llvm.fabs on a 64-bit vector lowers to the NEON fabs on a d-sized v-reg.
2086 define <4 x half> @fabs_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
2087 ; CHECK-LABEL: fabs_v4f16:
2089 ; CHECK-NEXT: fabs v0.4h, v0.4h
2091 %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
2095 ; Don't use SVE for 128-bit vectors.
; 128-bit fabs stays on NEON: a single unpredicated v-register instruction.
2096 define <8 x half> @fabs_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
2097 ; CHECK-LABEL: fabs_v8f16:
2099 ; CHECK-NEXT: fabs v0.8h, v0.8h
2101 %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
; 256-bit case: with >=256-bit SVE the <16 x half> fabs is one predicated
; SVE fabs on a vl16 vector (unary op, so only one load is needed).
2105 define void @fabs_v16f16(ptr %a) vscale_range(2,0) #0 {
2106 ; CHECK-LABEL: fabs_v16f16:
2108 ; CHECK-NEXT: ptrue p0.h, vl16
2109 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
2110 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h
2111 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
2113 %op = load <16 x half>, ptr %a
2114 %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
2115 store <16 x half> %res, ptr %a
; 512-bit case with no vscale_range: at 256-bit SVE the <32 x half> splits
; into two vl16 halves (x8 holds the element offset of the high half); at
; >=512 bits it is a single vl32 operation.
2119 define void @fabs_v32f16(ptr %a) #0 {
2120 ; VBITS_GE_256-LABEL: fabs_v32f16:
2121 ; VBITS_GE_256: // %bb.0:
2122 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
2123 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
2124 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
2125 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
2126 ; VBITS_GE_256-NEXT: fabs z0.h, p0/m, z0.h
2127 ; VBITS_GE_256-NEXT: fabs z1.h, p0/m, z1.h
2128 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
2129 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
2130 ; VBITS_GE_256-NEXT: ret
2132 ; VBITS_GE_512-LABEL: fabs_v32f16:
2133 ; VBITS_GE_512: // %bb.0:
2134 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
2135 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
2136 ; VBITS_GE_512-NEXT: fabs z0.h, p0/m, z0.h
2137 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
2138 ; VBITS_GE_512-NEXT: ret
2139 %op = load <32 x half>, ptr %a
2140 %res = call <32 x half> @llvm.fabs.v32f16(<32 x half> %op)
2141 store <32 x half> %res, ptr %a
; 1024-bit case: vscale_range(8,0) lets the <64 x half> fabs lower to one
; predicated SVE operation on a vl64 vector.
2145 define void @fabs_v64f16(ptr %a) vscale_range(8,0) #0 {
2146 ; CHECK-LABEL: fabs_v64f16:
2148 ; CHECK-NEXT: ptrue p0.h, vl64
2149 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
2150 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h
2151 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
2153 %op = load <64 x half>, ptr %a
2154 %res = call <64 x half> @llvm.fabs.v64f16(<64 x half> %op)
2155 store <64 x half> %res, ptr %a
; 2048-bit case: vscale_range(16,0) lets the <128 x half> fabs lower to one
; predicated SVE operation on a vl128 vector.
2159 define void @fabs_v128f16(ptr %a) vscale_range(16,0) #0 {
2160 ; CHECK-LABEL: fabs_v128f16:
2162 ; CHECK-NEXT: ptrue p0.h, vl128
2163 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
2164 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h
2165 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
2167 %op = load <128 x half>, ptr %a
2168 %res = call <128 x half> @llvm.fabs.v128f16(<128 x half> %op)
2169 store <128 x half> %res, ptr %a
2173 ; Don't use SVE for 64-bit vectors.
; 64-bit f32 fabs stays on NEON.
2174 define <2 x float> @fabs_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
2175 ; CHECK-LABEL: fabs_v2f32:
2177 ; CHECK-NEXT: fabs v0.2s, v0.2s
2179 %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
2180 ret <2 x float> %res
2183 ; Don't use SVE for 128-bit vectors.
; 128-bit f32 fabs stays on NEON.
2184 define <4 x float> @fabs_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
2185 ; CHECK-LABEL: fabs_v4f32:
2187 ; CHECK-NEXT: fabs v0.4s, v0.4s
2189 %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
2190 ret <4 x float> %res
; 256-bit case: <8 x float> fabs is one predicated SVE fabs on a vl8 vector.
2193 define void @fabs_v8f32(ptr %a) vscale_range(2,0) #0 {
2194 ; CHECK-LABEL: fabs_v8f32:
2196 ; CHECK-NEXT: ptrue p0.s, vl8
2197 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
2198 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s
2199 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
2201 %op = load <8 x float>, ptr %a
2202 %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
2203 store <8 x float> %res, ptr %a
; 512-bit case with no vscale_range: at 256-bit SVE the <16 x float> splits
; into two vl8 halves (x8 = element offset of the high half); at >=512 bits
; it is a single vl16 operation.
2207 define void @fabs_v16f32(ptr %a) #0 {
2208 ; VBITS_GE_256-LABEL: fabs_v16f32:
2209 ; VBITS_GE_256: // %bb.0:
2210 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
2211 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
2212 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
2213 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
2214 ; VBITS_GE_256-NEXT: fabs z0.s, p0/m, z0.s
2215 ; VBITS_GE_256-NEXT: fabs z1.s, p0/m, z1.s
2216 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
2217 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
2218 ; VBITS_GE_256-NEXT: ret
2220 ; VBITS_GE_512-LABEL: fabs_v16f32:
2221 ; VBITS_GE_512: // %bb.0:
2222 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
2223 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
2224 ; VBITS_GE_512-NEXT: fabs z0.s, p0/m, z0.s
2225 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
2226 ; VBITS_GE_512-NEXT: ret
2227 %op = load <16 x float>, ptr %a
2228 %res = call <16 x float> @llvm.fabs.v16f32(<16 x float> %op)
2229 store <16 x float> %res, ptr %a
; 1024-bit case: vscale_range(8,0) lets the <32 x float> fabs lower to one
; predicated SVE operation on a vl32 vector.
2233 define void @fabs_v32f32(ptr %a) vscale_range(8,0) #0 {
2234 ; CHECK-LABEL: fabs_v32f32:
2236 ; CHECK-NEXT: ptrue p0.s, vl32
2237 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
2238 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s
2239 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
2241 %op = load <32 x float>, ptr %a
2242 %res = call <32 x float> @llvm.fabs.v32f32(<32 x float> %op)
2243 store <32 x float> %res, ptr %a
; 2048-bit case: vscale_range(16,0) lets the <64 x float> fabs lower to one
; predicated SVE operation on a vl64 vector.
2247 define void @fabs_v64f32(ptr %a) vscale_range(16,0) #0 {
2248 ; CHECK-LABEL: fabs_v64f32:
2250 ; CHECK-NEXT: ptrue p0.s, vl64
2251 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
2252 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s
2253 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
2255 %op = load <64 x float>, ptr %a
2256 %res = call <64 x float> @llvm.fabs.v64f32(<64 x float> %op)
2257 store <64 x float> %res, ptr %a
2261 ; Don't use SVE for 64-bit vectors.
; <1 x double> is scalar-sized, so it lowers to the scalar fabs on d0.
2262 define <1 x double> @fabs_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
2263 ; CHECK-LABEL: fabs_v1f64:
2265 ; CHECK-NEXT: fabs d0, d0
2267 %res = call <1 x double> @llvm.fabs.v1f64(<1 x double> %op)
2268 ret <1 x double> %res
2271 ; Don't use SVE for 128-bit vectors.
; 128-bit f64 fabs stays on NEON.
2272 define <2 x double> @fabs_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
2273 ; CHECK-LABEL: fabs_v2f64:
2275 ; CHECK-NEXT: fabs v0.2d, v0.2d
2277 %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
2278 ret <2 x double> %res
; 256-bit case: <4 x double> fabs is one predicated SVE fabs on a vl4 vector.
2281 define void @fabs_v4f64(ptr %a) vscale_range(2,0) #0 {
2282 ; CHECK-LABEL: fabs_v4f64:
2284 ; CHECK-NEXT: ptrue p0.d, vl4
2285 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2286 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d
2287 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2289 %op = load <4 x double>, ptr %a
2290 %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
2291 store <4 x double> %res, ptr %a
; 512-bit case with no vscale_range: at 256-bit SVE the <8 x double> splits
; into two vl4 halves (x8 = element offset of the high half); at >=512 bits
; it is a single vl8 operation.
2295 define void @fabs_v8f64(ptr %a) #0 {
2296 ; VBITS_GE_256-LABEL: fabs_v8f64:
2297 ; VBITS_GE_256: // %bb.0:
2298 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
2299 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
2300 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
2301 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
2302 ; VBITS_GE_256-NEXT: fabs z0.d, p0/m, z0.d
2303 ; VBITS_GE_256-NEXT: fabs z1.d, p0/m, z1.d
2304 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
2305 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
2306 ; VBITS_GE_256-NEXT: ret
2308 ; VBITS_GE_512-LABEL: fabs_v8f64:
2309 ; VBITS_GE_512: // %bb.0:
2310 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
2311 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
2312 ; VBITS_GE_512-NEXT: fabs z0.d, p0/m, z0.d
2313 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
2314 ; VBITS_GE_512-NEXT: ret
2315 %op = load <8 x double>, ptr %a
2316 %res = call <8 x double> @llvm.fabs.v8f64(<8 x double> %op)
2317 store <8 x double> %res, ptr %a
; 1024-bit case: vscale_range(8,0) lets the <16 x double> fabs lower to one
; predicated SVE operation on a vl16 vector.
2321 define void @fabs_v16f64(ptr %a) vscale_range(8,0) #0 {
2322 ; CHECK-LABEL: fabs_v16f64:
2324 ; CHECK-NEXT: ptrue p0.d, vl16
2325 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2326 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d
2327 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2329 %op = load <16 x double>, ptr %a
2330 %res = call <16 x double> @llvm.fabs.v16f64(<16 x double> %op)
2331 store <16 x double> %res, ptr %a
; 2048-bit case: vscale_range(16,0) lets the <32 x double> fabs lower to one
; predicated SVE operation on a vl32 vector.
2335 define void @fabs_v32f64(ptr %a) vscale_range(16,0) #0 {
2336 ; CHECK-LABEL: fabs_v32f64:
2338 ; CHECK-NEXT: ptrue p0.d, vl32
2339 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2340 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d
2341 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2343 %op = load <32 x double>, ptr %a
2344 %res = call <32 x double> @llvm.fabs.v32f64(<32 x double> %op)
2345 store <32 x double> %res, ptr %a
; All test functions require SVE; fixed-length lowering is driven by the
; -aarch64-sve-vector-bits-min flag and per-function vscale_range attributes.
2349 attributes #0 = { "target-features"="+sve" }
; Intrinsic declarations used by the fma tests.
2351 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
2352 declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
2353 declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
2354 declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
2355 declare <64 x half> @llvm.fma.v64f16(<64 x half>, <64 x half>, <64 x half>)
2356 declare <128 x half> @llvm.fma.v128f16(<128 x half>, <128 x half>, <128 x half>)
2357 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
2358 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
2359 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
2360 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
2361 declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
2362 declare <64 x float> @llvm.fma.v64f32(<64 x float>, <64 x float>, <64 x float>)
2363 declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
2364 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
2365 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
2366 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
2367 declare <16 x double> @llvm.fma.v16f64(<16 x double>, <16 x double>, <16 x double>)
2368 declare <32 x double> @llvm.fma.v32f64(<32 x double>, <32 x double>, <32 x double>)
; Intrinsic declarations used by the sqrt tests.
2370 declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
2371 declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
2372 declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
2373 declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
2374 declare <64 x half> @llvm.sqrt.v64f16(<64 x half>)
2375 declare <128 x half> @llvm.sqrt.v128f16(<128 x half>)
2376 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
2377 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
2378 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
2379 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
2380 declare <32 x float> @llvm.sqrt.v32f32(<32 x float>)
2381 declare <64 x float> @llvm.sqrt.v64f32(<64 x float>)
2382 declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
2383 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
2384 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
2385 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
2386 declare <16 x double> @llvm.sqrt.v16f64(<16 x double>)
2387 declare <32 x double> @llvm.sqrt.v32f64(<32 x double>)
; Intrinsic declarations used by the fabs tests above.
2389 declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
2390 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
2391 declare <16 x half> @llvm.fabs.v16f16(<16 x half>)
2392 declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
2393 declare <64 x half> @llvm.fabs.v64f16(<64 x half>)
2394 declare <128 x half> @llvm.fabs.v128f16(<128 x half>)
2395 declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
2396 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
2397 declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
2398 declare <16 x float> @llvm.fabs.v16f32(<16 x float>)
2399 declare <32 x float> @llvm.fabs.v32f32(<32 x float>)
2400 declare <64 x float> @llvm.fabs.v64f32(<64 x float>)
2401 declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
2402 declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
2403 declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
2404 declare <8 x double> @llvm.fabs.v8f64(<8 x double>)
2405 declare <16 x double> @llvm.fabs.v16f64(<16 x double>)
2406 declare <32 x double> @llvm.fabs.v32f64(<32 x double>)