1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
12 ; Don't use SVE for 64-bit vectors.
; <4 x half> is 64 bits wide: the check expects a single unpredicated fadd on
; the .4h arrangement rather than a predicated SVE instruction.
13 define <4 x half> @fadd_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
14 ; CHECK-LABEL: fadd_v4f16:
16 ; CHECK-NEXT: fadd v0.4h, v0.4h, v1.4h
18 %res = fadd <4 x half> %op1, %op2
22 ; Don't use SVE for 128-bit vectors.
; <8 x half> is 128 bits wide: the check expects a single unpredicated fadd on
; the .8h arrangement rather than a predicated SVE instruction.
23 define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
24 ; CHECK-LABEL: fadd_v8f16:
26 ; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
28 %res = fadd <8 x half> %op1, %op2
; 256-bit vector (16 x half). vscale_range(2,0) promises SVE registers of at
; least 256 bits, so the checks expect one predicated fadd under
; `ptrue p0.h, vl16`, with the result stored back through %a.
32 define void @fadd_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
33 ; CHECK-LABEL: fadd_v16f16:
35 ; CHECK-NEXT: ptrue p0.h, vl16
36 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
37 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
38 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
39 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
41 %op1 = load <16 x half>, ptr %a
42 %op2 = load <16 x half>, ptr %b
43 %res = fadd <16 x half> %op1, %op2
44 store <16 x half> %res, ptr %a
; 512-bit vector (32 x half) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines:
; VBITS_GE_256 expects two 256-bit halves (upper half at element offset
; x8 = #16), while VBITS_GE_512 expects one full-width operation (vl32).
48 define void @fadd_v32f16(ptr %a, ptr %b) #0 {
49 ; VBITS_GE_256-LABEL: fadd_v32f16:
50 ; VBITS_GE_256: // %bb.0:
51 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
52 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
53 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
54 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
55 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
56 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
57 ; VBITS_GE_256-NEXT: fadd z0.h, p0/m, z0.h, z2.h
58 ; VBITS_GE_256-NEXT: fadd z1.h, p0/m, z1.h, z3.h
59 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
60 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
61 ; VBITS_GE_256-NEXT: ret
63 ; VBITS_GE_512-LABEL: fadd_v32f16:
64 ; VBITS_GE_512: // %bb.0:
65 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
66 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
67 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
68 ; VBITS_GE_512-NEXT: fadd z0.h, p0/m, z0.h, z1.h
69 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
70 ; VBITS_GE_512-NEXT: ret
71 %op1 = load <32 x half>, ptr %a
72 %op2 = load <32 x half>, ptr %b
73 %res = fadd <32 x half> %op1, %op2
74 store <32 x half> %res, ptr %a
; 1024-bit vector (64 x half). vscale_range(8,0) promises SVE registers of at
; least 1024 bits, so every RUN configuration shares one CHECK body: a single
; predicated fadd under `ptrue p0.h, vl64`.
78 define void @fadd_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
79 ; CHECK-LABEL: fadd_v64f16:
81 ; CHECK-NEXT: ptrue p0.h, vl64
82 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
83 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
84 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
85 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
87 %op1 = load <64 x half>, ptr %a
88 %op2 = load <64 x half>, ptr %b
89 %res = fadd <64 x half> %op1, %op2
90 store <64 x half> %res, ptr %a
; 2048-bit vector (128 x half). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect a single predicated fadd under
; `ptrue p0.h, vl128`.
94 define void @fadd_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
95 ; CHECK-LABEL: fadd_v128f16:
97 ; CHECK-NEXT: ptrue p0.h, vl128
98 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
99 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
100 ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h
101 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
103 %op1 = load <128 x half>, ptr %a
104 %op2 = load <128 x half>, ptr %b
105 %res = fadd <128 x half> %op1, %op2
106 store <128 x half> %res, ptr %a
110 ; Don't use SVE for 64-bit vectors.
; <2 x float> is 64 bits wide: the check expects a single unpredicated fadd on
; the .2s arrangement rather than a predicated SVE instruction.
111 define <2 x float> @fadd_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
112 ; CHECK-LABEL: fadd_v2f32:
114 ; CHECK-NEXT: fadd v0.2s, v0.2s, v1.2s
116 %res = fadd <2 x float> %op1, %op2
120 ; Don't use SVE for 128-bit vectors.
; <4 x float> is 128 bits wide: the check expects a single unpredicated fadd on
; the .4s arrangement rather than a predicated SVE instruction.
121 define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
122 ; CHECK-LABEL: fadd_v4f32:
124 ; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
126 %res = fadd <4 x float> %op1, %op2
; 256-bit vector (8 x float). vscale_range(2,0) promises SVE registers of at
; least 256 bits, so the checks expect one predicated fadd under
; `ptrue p0.s, vl8`, with the result stored back through %a.
130 define void @fadd_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
131 ; CHECK-LABEL: fadd_v8f32:
133 ; CHECK-NEXT: ptrue p0.s, vl8
134 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
135 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
136 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
137 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
139 %op1 = load <8 x float>, ptr %a
140 %op2 = load <8 x float>, ptr %b
141 %res = fadd <8 x float> %op1, %op2
142 store <8 x float> %res, ptr %a
; 512-bit vector (16 x float) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines:
; VBITS_GE_256 expects two 256-bit halves (upper half at element offset
; x8 = #8), while VBITS_GE_512 expects one full-width operation (vl16).
146 define void @fadd_v16f32(ptr %a, ptr %b) #0 {
147 ; VBITS_GE_256-LABEL: fadd_v16f32:
148 ; VBITS_GE_256: // %bb.0:
149 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
150 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
151 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
152 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
153 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
154 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
155 ; VBITS_GE_256-NEXT: fadd z0.s, p0/m, z0.s, z2.s
156 ; VBITS_GE_256-NEXT: fadd z1.s, p0/m, z1.s, z3.s
157 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
158 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
159 ; VBITS_GE_256-NEXT: ret
161 ; VBITS_GE_512-LABEL: fadd_v16f32:
162 ; VBITS_GE_512: // %bb.0:
163 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
164 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
165 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
166 ; VBITS_GE_512-NEXT: fadd z0.s, p0/m, z0.s, z1.s
167 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
168 ; VBITS_GE_512-NEXT: ret
169 %op1 = load <16 x float>, ptr %a
170 %op2 = load <16 x float>, ptr %b
171 %res = fadd <16 x float> %op1, %op2
172 store <16 x float> %res, ptr %a
; 1024-bit vector (32 x float). vscale_range(8,0) promises SVE registers of at
; least 1024 bits, so every RUN configuration shares one CHECK body: a single
; predicated fadd under `ptrue p0.s, vl32`.
176 define void @fadd_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
177 ; CHECK-LABEL: fadd_v32f32:
179 ; CHECK-NEXT: ptrue p0.s, vl32
180 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
181 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
182 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
183 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
185 %op1 = load <32 x float>, ptr %a
186 %op2 = load <32 x float>, ptr %b
187 %res = fadd <32 x float> %op1, %op2
188 store <32 x float> %res, ptr %a
; 2048-bit vector (64 x float). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect a single predicated fadd under
; `ptrue p0.s, vl64`.
192 define void @fadd_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
193 ; CHECK-LABEL: fadd_v64f32:
195 ; CHECK-NEXT: ptrue p0.s, vl64
196 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
197 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
198 ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
199 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
201 %op1 = load <64 x float>, ptr %a
202 %op2 = load <64 x float>, ptr %b
203 %res = fadd <64 x float> %op1, %op2
204 store <64 x float> %res, ptr %a
208 ; Don't use SVE for 64-bit vectors.
; <1 x double> has a single lane: the check expects a scalar fadd on the
; d registers rather than any vector instruction.
209 define <1 x double> @fadd_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
210 ; CHECK-LABEL: fadd_v1f64:
212 ; CHECK-NEXT: fadd d0, d0, d1
214 %res = fadd <1 x double> %op1, %op2
215 ret <1 x double> %res
218 ; Don't use SVE for 128-bit vectors.
; <2 x double> is 128 bits wide: the check expects a single unpredicated fadd
; on the .2d arrangement rather than a predicated SVE instruction.
219 define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
220 ; CHECK-LABEL: fadd_v2f64:
222 ; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
224 %res = fadd <2 x double> %op1, %op2
225 ret <2 x double> %res
; 256-bit vector (4 x double). vscale_range(2,0) promises SVE registers of at
; least 256 bits, so the checks expect one predicated fadd under
; `ptrue p0.d, vl4`, with the result stored back through %a.
228 define void @fadd_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
229 ; CHECK-LABEL: fadd_v4f64:
231 ; CHECK-NEXT: ptrue p0.d, vl4
232 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
233 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
234 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
235 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
237 %op1 = load <4 x double>, ptr %a
238 %op2 = load <4 x double>, ptr %b
239 %res = fadd <4 x double> %op1, %op2
240 store <4 x double> %res, ptr %a
; 512-bit vector (8 x double) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines:
; VBITS_GE_256 expects two 256-bit halves (upper half at element offset
; x8 = #4), while VBITS_GE_512 expects one full-width operation (vl8).
244 define void @fadd_v8f64(ptr %a, ptr %b) #0 {
245 ; VBITS_GE_256-LABEL: fadd_v8f64:
246 ; VBITS_GE_256: // %bb.0:
247 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
248 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
249 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
250 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
251 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
252 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
253 ; VBITS_GE_256-NEXT: fadd z0.d, p0/m, z0.d, z2.d
254 ; VBITS_GE_256-NEXT: fadd z1.d, p0/m, z1.d, z3.d
255 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
256 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
257 ; VBITS_GE_256-NEXT: ret
259 ; VBITS_GE_512-LABEL: fadd_v8f64:
260 ; VBITS_GE_512: // %bb.0:
261 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
262 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
263 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
264 ; VBITS_GE_512-NEXT: fadd z0.d, p0/m, z0.d, z1.d
265 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
266 ; VBITS_GE_512-NEXT: ret
267 %op1 = load <8 x double>, ptr %a
268 %op2 = load <8 x double>, ptr %b
269 %res = fadd <8 x double> %op1, %op2
270 store <8 x double> %res, ptr %a
; 1024-bit vector (16 x double). vscale_range(8,0) promises SVE registers of
; at least 1024 bits, so every RUN configuration shares one CHECK body: a
; single predicated fadd under `ptrue p0.d, vl16`.
274 define void @fadd_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
275 ; CHECK-LABEL: fadd_v16f64:
277 ; CHECK-NEXT: ptrue p0.d, vl16
278 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
279 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
280 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
281 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
283 %op1 = load <16 x double>, ptr %a
284 %op2 = load <16 x double>, ptr %b
285 %res = fadd <16 x double> %op1, %op2
286 store <16 x double> %res, ptr %a
; 2048-bit vector (32 x double). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect a single predicated fadd under
; `ptrue p0.d, vl32`.
290 define void @fadd_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
291 ; CHECK-LABEL: fadd_v32f64:
293 ; CHECK-NEXT: ptrue p0.d, vl32
294 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
295 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
296 ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d
297 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
299 %op1 = load <32 x double>, ptr %a
300 %op2 = load <32 x double>, ptr %b
301 %res = fadd <32 x double> %op1, %op2
302 store <32 x double> %res, ptr %a
310 ; Don't use SVE for 64-bit vectors.
; <4 x half> is 64 bits wide: the check expects a single unpredicated fdiv on
; the .4h arrangement rather than a predicated SVE instruction.
311 define <4 x half> @fdiv_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
312 ; CHECK-LABEL: fdiv_v4f16:
314 ; CHECK-NEXT: fdiv v0.4h, v0.4h, v1.4h
316 %res = fdiv <4 x half> %op1, %op2
320 ; Don't use SVE for 128-bit vectors.
; <8 x half> is 128 bits wide: the check expects a single unpredicated fdiv on
; the .8h arrangement rather than a predicated SVE instruction.
321 define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
322 ; CHECK-LABEL: fdiv_v8f16:
324 ; CHECK-NEXT: fdiv v0.8h, v0.8h, v1.8h
326 %res = fdiv <8 x half> %op1, %op2
; 256-bit vector (16 x half). vscale_range(2,0) promises SVE registers of at
; least 256 bits, so the checks expect one predicated fdiv (%a / %b) under
; `ptrue p0.h, vl16`, with the quotient stored back through %a.
330 define void @fdiv_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
331 ; CHECK-LABEL: fdiv_v16f16:
333 ; CHECK-NEXT: ptrue p0.h, vl16
334 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
335 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
336 ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
337 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
339 %op1 = load <16 x half>, ptr %a
340 %op2 = load <16 x half>, ptr %b
341 %res = fdiv <16 x half> %op1, %op2
342 store <16 x half> %res, ptr %a
; 512-bit vector (32 x half) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines.
; VBITS_GE_256 expects two 256-bit halves (upper half at element offset
; x8 = #16); note that the second %b load comes after the first fdiv and
; reuses z2. VBITS_GE_512 expects one full-width operation (vl32).
346 define void @fdiv_v32f16(ptr %a, ptr %b) #0 {
347 ; VBITS_GE_256-LABEL: fdiv_v32f16:
348 ; VBITS_GE_256: // %bb.0:
349 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
350 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
351 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
352 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
353 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
354 ; VBITS_GE_256-NEXT: fdiv z0.h, p0/m, z0.h, z2.h
355 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1]
356 ; VBITS_GE_256-NEXT: fdiv z1.h, p0/m, z1.h, z2.h
357 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
358 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
359 ; VBITS_GE_256-NEXT: ret
361 ; VBITS_GE_512-LABEL: fdiv_v32f16:
362 ; VBITS_GE_512: // %bb.0:
363 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
364 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
365 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
366 ; VBITS_GE_512-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
367 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
368 ; VBITS_GE_512-NEXT: ret
369 %op1 = load <32 x half>, ptr %a
370 %op2 = load <32 x half>, ptr %b
371 %res = fdiv <32 x half> %op1, %op2
372 store <32 x half> %res, ptr %a
; 1024-bit vector (64 x half). vscale_range(8,0) promises SVE registers of at
; least 1024 bits, so every RUN configuration shares one CHECK body: a single
; predicated fdiv under `ptrue p0.h, vl64`.
376 define void @fdiv_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
377 ; CHECK-LABEL: fdiv_v64f16:
379 ; CHECK-NEXT: ptrue p0.h, vl64
380 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
381 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
382 ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
383 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
385 %op1 = load <64 x half>, ptr %a
386 %op2 = load <64 x half>, ptr %b
387 %res = fdiv <64 x half> %op1, %op2
388 store <64 x half> %res, ptr %a
; 2048-bit vector (128 x half). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect a single predicated fdiv under
; `ptrue p0.h, vl128`.
392 define void @fdiv_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
393 ; CHECK-LABEL: fdiv_v128f16:
395 ; CHECK-NEXT: ptrue p0.h, vl128
396 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
397 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
398 ; CHECK-NEXT: fdiv z0.h, p0/m, z0.h, z1.h
399 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
401 %op1 = load <128 x half>, ptr %a
402 %op2 = load <128 x half>, ptr %b
403 %res = fdiv <128 x half> %op1, %op2
404 store <128 x half> %res, ptr %a
408 ; Don't use SVE for 64-bit vectors.
; <2 x float> is 64 bits wide: the check expects a single unpredicated fdiv on
; the .2s arrangement rather than a predicated SVE instruction.
409 define <2 x float> @fdiv_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
410 ; CHECK-LABEL: fdiv_v2f32:
412 ; CHECK-NEXT: fdiv v0.2s, v0.2s, v1.2s
414 %res = fdiv <2 x float> %op1, %op2
418 ; Don't use SVE for 128-bit vectors.
; <4 x float> is 128 bits wide: the check expects a single unpredicated fdiv on
; the .4s arrangement rather than a predicated SVE instruction.
419 define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
420 ; CHECK-LABEL: fdiv_v4f32:
422 ; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
424 %res = fdiv <4 x float> %op1, %op2
; 256-bit vector (8 x float). vscale_range(2,0) promises SVE registers of at
; least 256 bits, so the checks expect one predicated fdiv (%a / %b) under
; `ptrue p0.s, vl8`, with the quotient stored back through %a.
428 define void @fdiv_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
429 ; CHECK-LABEL: fdiv_v8f32:
431 ; CHECK-NEXT: ptrue p0.s, vl8
432 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
433 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
434 ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
435 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
437 %op1 = load <8 x float>, ptr %a
438 %op2 = load <8 x float>, ptr %b
439 %res = fdiv <8 x float> %op1, %op2
440 store <8 x float> %res, ptr %a
; 512-bit vector (16 x float) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines.
; VBITS_GE_256 expects two 256-bit halves (upper half at element offset
; x8 = #8); note that the second %b load comes after the first fdiv and
; reuses z2. VBITS_GE_512 expects one full-width operation (vl16).
444 define void @fdiv_v16f32(ptr %a, ptr %b) #0 {
445 ; VBITS_GE_256-LABEL: fdiv_v16f32:
446 ; VBITS_GE_256: // %bb.0:
447 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
448 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
449 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
450 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
451 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
452 ; VBITS_GE_256-NEXT: fdiv z0.s, p0/m, z0.s, z2.s
453 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1]
454 ; VBITS_GE_256-NEXT: fdiv z1.s, p0/m, z1.s, z2.s
455 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
456 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
457 ; VBITS_GE_256-NEXT: ret
459 ; VBITS_GE_512-LABEL: fdiv_v16f32:
460 ; VBITS_GE_512: // %bb.0:
461 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
462 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
463 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
464 ; VBITS_GE_512-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
465 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
466 ; VBITS_GE_512-NEXT: ret
467 %op1 = load <16 x float>, ptr %a
468 %op2 = load <16 x float>, ptr %b
469 %res = fdiv <16 x float> %op1, %op2
470 store <16 x float> %res, ptr %a
; 1024-bit vector (32 x float). vscale_range(8,0) promises SVE registers of at
; least 1024 bits, so every RUN configuration shares one CHECK body: a single
; predicated fdiv under `ptrue p0.s, vl32`.
474 define void @fdiv_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
475 ; CHECK-LABEL: fdiv_v32f32:
477 ; CHECK-NEXT: ptrue p0.s, vl32
478 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
479 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
480 ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
481 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
483 %op1 = load <32 x float>, ptr %a
484 %op2 = load <32 x float>, ptr %b
485 %res = fdiv <32 x float> %op1, %op2
486 store <32 x float> %res, ptr %a
; 2048-bit vector (64 x float). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect a single predicated fdiv under
; `ptrue p0.s, vl64`.
490 define void @fdiv_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
491 ; CHECK-LABEL: fdiv_v64f32:
493 ; CHECK-NEXT: ptrue p0.s, vl64
494 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
495 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
496 ; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
497 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
499 %op1 = load <64 x float>, ptr %a
500 %op2 = load <64 x float>, ptr %b
501 %res = fdiv <64 x float> %op1, %op2
502 store <64 x float> %res, ptr %a
506 ; Don't use SVE for 64-bit vectors.
; <1 x double> has a single lane: the check expects a scalar fdiv on the
; d registers rather than any vector instruction.
507 define <1 x double> @fdiv_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
508 ; CHECK-LABEL: fdiv_v1f64:
510 ; CHECK-NEXT: fdiv d0, d0, d1
512 %res = fdiv <1 x double> %op1, %op2
513 ret <1 x double> %res
516 ; Don't use SVE for 128-bit vectors.
; <2 x double> is 128 bits wide: the check expects a single unpredicated fdiv
; on the .2d arrangement rather than a predicated SVE instruction.
517 define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
518 ; CHECK-LABEL: fdiv_v2f64:
520 ; CHECK-NEXT: fdiv v0.2d, v0.2d, v1.2d
522 %res = fdiv <2 x double> %op1, %op2
523 ret <2 x double> %res
; 256-bit vector (4 x double). vscale_range(2,0) promises SVE registers of at
; least 256 bits, so the checks expect one predicated fdiv (%a / %b) under
; `ptrue p0.d, vl4`, with the quotient stored back through %a.
526 define void @fdiv_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
527 ; CHECK-LABEL: fdiv_v4f64:
529 ; CHECK-NEXT: ptrue p0.d, vl4
530 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
531 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
532 ; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
533 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
535 %op1 = load <4 x double>, ptr %a
536 %op2 = load <4 x double>, ptr %b
537 %res = fdiv <4 x double> %op1, %op2
538 store <4 x double> %res, ptr %a
; 512-bit vector (8 x double) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines.
; VBITS_GE_256 expects two 256-bit halves (upper half at element offset
; x8 = #4); note that the second %b load comes after the first fdiv and
; reuses z2. VBITS_GE_512 expects one full-width operation (vl8).
542 define void @fdiv_v8f64(ptr %a, ptr %b) #0 {
543 ; VBITS_GE_256-LABEL: fdiv_v8f64:
544 ; VBITS_GE_256: // %bb.0:
545 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
546 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
547 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
548 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
549 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
550 ; VBITS_GE_256-NEXT: fdiv z0.d, p0/m, z0.d, z2.d
551 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1]
552 ; VBITS_GE_256-NEXT: fdiv z1.d, p0/m, z1.d, z2.d
553 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
554 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
555 ; VBITS_GE_256-NEXT: ret
557 ; VBITS_GE_512-LABEL: fdiv_v8f64:
558 ; VBITS_GE_512: // %bb.0:
559 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
560 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
561 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
562 ; VBITS_GE_512-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
563 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
564 ; VBITS_GE_512-NEXT: ret
565 %op1 = load <8 x double>, ptr %a
566 %op2 = load <8 x double>, ptr %b
567 %res = fdiv <8 x double> %op1, %op2
568 store <8 x double> %res, ptr %a
; 1024-bit vector (16 x double). vscale_range(8,0) promises SVE registers of
; at least 1024 bits, so every RUN configuration shares one CHECK body: a
; single predicated fdiv under `ptrue p0.d, vl16`.
572 define void @fdiv_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
573 ; CHECK-LABEL: fdiv_v16f64:
575 ; CHECK-NEXT: ptrue p0.d, vl16
576 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
577 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
578 ; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
579 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
581 %op1 = load <16 x double>, ptr %a
582 %op2 = load <16 x double>, ptr %b
583 %res = fdiv <16 x double> %op1, %op2
584 store <16 x double> %res, ptr %a
; 2048-bit vector (32 x double). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect a single predicated fdiv under
; `ptrue p0.d, vl32`.
588 define void @fdiv_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
589 ; CHECK-LABEL: fdiv_v32f64:
591 ; CHECK-NEXT: ptrue p0.d, vl32
592 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
593 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
594 ; CHECK-NEXT: fdiv z0.d, p0/m, z0.d, z1.d
595 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
597 %op1 = load <32 x double>, ptr %a
598 %op2 = load <32 x double>, ptr %b
599 %res = fdiv <32 x double> %op1, %op2
600 store <32 x double> %res, ptr %a
608 ; Don't use SVE for 64-bit vectors.
; llvm.fma on <4 x half>: the checks expect an unpredicated fmla that
; accumulates into v2 (the register holding %op3), followed by an fmov that
; copies the result into the return register d0.
609 define <4 x half> @fma_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3) vscale_range(2,0) #0 {
610 ; CHECK-LABEL: fma_v4f16:
612 ; CHECK-NEXT: fmla v2.4h, v1.4h, v0.4h
613 ; CHECK-NEXT: fmov d0, d2
615 %res = call <4 x half> @llvm.fma.v4f16(<4 x half> %op1, <4 x half> %op2, <4 x half> %op3)
619 ; Don't use SVE for 128-bit vectors.
; llvm.fma on <8 x half>: the checks expect an unpredicated fmla that
; accumulates into v2 (the register holding %op3), followed by a full-register
; mov that copies the result into the return register v0.
620 define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) vscale_range(2,0) #0 {
621 ; CHECK-LABEL: fma_v8f16:
623 ; CHECK-NEXT: fmla v2.8h, v1.8h, v0.8h
624 ; CHECK-NEXT: mov v0.16b, v2.16b
626 %res = call <8 x half> @llvm.fma.v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3)
; 256-bit llvm.fma (16 x half) over operands loaded from %a, %b and %c.
; vscale_range(2,0) promises SVE registers of at least 256 bits, so the checks
; expect one predicated fmad under `ptrue p0.h, vl16`; its destination z0 is
; the register loaded from %a, which is also where the result is stored.
630 define void @fma_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
631 ; CHECK-LABEL: fma_v16f16:
633 ; CHECK-NEXT: ptrue p0.h, vl16
634 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
635 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
636 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
637 ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
638 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
640 %op1 = load <16 x half>, ptr %a
641 %op2 = load <16 x half>, ptr %b
642 %op3 = load <16 x half>, ptr %c
643 %res = call <16 x half> @llvm.fma.v16f16(<16 x half> %op1, <16 x half> %op2, <16 x half> %op3)
644 store <16 x half> %res, ptr %a
; 512-bit llvm.fma (32 x half) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines:
; VBITS_GE_256 expects two 256-bit halves (six loads, two fmads, upper half at
; element offset x8 = #16), while VBITS_GE_512 expects one full-width fmad.
648 define void @fma_v32f16(ptr %a, ptr %b, ptr %c) #0 {
649 ; VBITS_GE_256-LABEL: fma_v32f16:
650 ; VBITS_GE_256: // %bb.0:
651 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
652 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
653 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
654 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
655 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
656 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
657 ; VBITS_GE_256-NEXT: ld1h { z4.h }, p0/z, [x2, x8, lsl #1]
658 ; VBITS_GE_256-NEXT: ld1h { z5.h }, p0/z, [x2]
659 ; VBITS_GE_256-NEXT: fmad z0.h, p0/m, z2.h, z4.h
660 ; VBITS_GE_256-NEXT: fmad z1.h, p0/m, z3.h, z5.h
661 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
662 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
663 ; VBITS_GE_256-NEXT: ret
665 ; VBITS_GE_512-LABEL: fma_v32f16:
666 ; VBITS_GE_512: // %bb.0:
667 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
668 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
669 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
670 ; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x2]
671 ; VBITS_GE_512-NEXT: fmad z0.h, p0/m, z1.h, z2.h
672 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
673 ; VBITS_GE_512-NEXT: ret
674 %op1 = load <32 x half>, ptr %a
675 %op2 = load <32 x half>, ptr %b
676 %op3 = load <32 x half>, ptr %c
677 %res = call <32 x half> @llvm.fma.v32f16(<32 x half> %op1, <32 x half> %op2, <32 x half> %op3)
678 store <32 x half> %res, ptr %a
; 1024-bit llvm.fma (64 x half). vscale_range(8,0) promises SVE registers of
; at least 1024 bits, so every RUN configuration shares one CHECK body: three
; loads and a single predicated fmad under `ptrue p0.h, vl64`.
682 define void @fma_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
683 ; CHECK-LABEL: fma_v64f16:
685 ; CHECK-NEXT: ptrue p0.h, vl64
686 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
687 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
688 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
689 ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
690 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
692 %op1 = load <64 x half>, ptr %a
693 %op2 = load <64 x half>, ptr %b
694 %op3 = load <64 x half>, ptr %c
695 %res = call <64 x half> @llvm.fma.v64f16(<64 x half> %op1, <64 x half> %op2, <64 x half> %op3)
696 store <64 x half> %res, ptr %a
; 2048-bit llvm.fma (128 x half). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect three loads and a single predicated
; fmad under `ptrue p0.h, vl128`.
700 define void @fma_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
701 ; CHECK-LABEL: fma_v128f16:
703 ; CHECK-NEXT: ptrue p0.h, vl128
704 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
705 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
706 ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x2]
707 ; CHECK-NEXT: fmad z0.h, p0/m, z1.h, z2.h
708 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
710 %op1 = load <128 x half>, ptr %a
711 %op2 = load <128 x half>, ptr %b
712 %op3 = load <128 x half>, ptr %c
713 %res = call <128 x half> @llvm.fma.v128f16(<128 x half> %op1, <128 x half> %op2, <128 x half> %op3)
714 store <128 x half> %res, ptr %a
718 ; Don't use SVE for 64-bit vectors.
; llvm.fma on <2 x float>: the checks expect an unpredicated fmla that
; accumulates into v2 (the register holding %op3), followed by an fmov that
; copies the result into the return register d0.
719 define <2 x float> @fma_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3) vscale_range(2,0) #0 {
720 ; CHECK-LABEL: fma_v2f32:
722 ; CHECK-NEXT: fmla v2.2s, v1.2s, v0.2s
723 ; CHECK-NEXT: fmov d0, d2
725 %res = call <2 x float> @llvm.fma.v2f32(<2 x float> %op1, <2 x float> %op2, <2 x float> %op3)
729 ; Don't use SVE for 128-bit vectors.
; llvm.fma on <4 x float>: the checks expect an unpredicated fmla that
; accumulates into v2 (the register holding %op3), followed by a full-register
; mov that copies the result into the return register v0.
730 define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3) vscale_range(2,0) #0 {
731 ; CHECK-LABEL: fma_v4f32:
733 ; CHECK-NEXT: fmla v2.4s, v1.4s, v0.4s
734 ; CHECK-NEXT: mov v0.16b, v2.16b
736 %res = call <4 x float> @llvm.fma.v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %op3)
; 256-bit llvm.fma (8 x float) over operands loaded from %a, %b and %c.
; vscale_range(2,0) promises SVE registers of at least 256 bits, so the checks
; expect one predicated fmad under `ptrue p0.s, vl8`; its destination z0 is
; the register loaded from %a, which is also where the result is stored.
740 define void @fma_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
741 ; CHECK-LABEL: fma_v8f32:
743 ; CHECK-NEXT: ptrue p0.s, vl8
744 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
745 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
746 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
747 ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
748 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
750 %op1 = load <8 x float>, ptr %a
751 %op2 = load <8 x float>, ptr %b
752 %op3 = load <8 x float>, ptr %c
753 %res = call <8 x float> @llvm.fma.v8f32(<8 x float> %op1, <8 x float> %op2, <8 x float> %op3)
754 store <8 x float> %res, ptr %a
; 512-bit llvm.fma (16 x float) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines:
; VBITS_GE_256 expects two 256-bit halves (six loads, two fmads, upper half at
; element offset x8 = #8), while VBITS_GE_512 expects one full-width fmad.
758 define void @fma_v16f32(ptr %a, ptr %b, ptr %c) #0 {
759 ; VBITS_GE_256-LABEL: fma_v16f32:
760 ; VBITS_GE_256: // %bb.0:
761 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
762 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
763 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
764 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
765 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
766 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
767 ; VBITS_GE_256-NEXT: ld1w { z4.s }, p0/z, [x2, x8, lsl #2]
768 ; VBITS_GE_256-NEXT: ld1w { z5.s }, p0/z, [x2]
769 ; VBITS_GE_256-NEXT: fmad z0.s, p0/m, z2.s, z4.s
770 ; VBITS_GE_256-NEXT: fmad z1.s, p0/m, z3.s, z5.s
771 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
772 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
773 ; VBITS_GE_256-NEXT: ret
775 ; VBITS_GE_512-LABEL: fma_v16f32:
776 ; VBITS_GE_512: // %bb.0:
777 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
778 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
779 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
780 ; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x2]
781 ; VBITS_GE_512-NEXT: fmad z0.s, p0/m, z1.s, z2.s
782 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
783 ; VBITS_GE_512-NEXT: ret
784 %op1 = load <16 x float>, ptr %a
785 %op2 = load <16 x float>, ptr %b
786 %op3 = load <16 x float>, ptr %c
787 %res = call <16 x float> @llvm.fma.v16f32(<16 x float> %op1, <16 x float> %op2, <16 x float> %op3)
788 store <16 x float> %res, ptr %a
; 1024-bit llvm.fma (32 x float). vscale_range(8,0) promises SVE registers of
; at least 1024 bits, so every RUN configuration shares one CHECK body: three
; loads and a single predicated fmad under `ptrue p0.s, vl32`.
792 define void @fma_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
793 ; CHECK-LABEL: fma_v32f32:
795 ; CHECK-NEXT: ptrue p0.s, vl32
796 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
797 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
798 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
799 ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
800 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
802 %op1 = load <32 x float>, ptr %a
803 %op2 = load <32 x float>, ptr %b
804 %op3 = load <32 x float>, ptr %c
805 %res = call <32 x float> @llvm.fma.v32f32(<32 x float> %op1, <32 x float> %op2, <32 x float> %op3)
806 store <32 x float> %res, ptr %a
; 2048-bit llvm.fma (64 x float). vscale_range(16,0) promises SVE registers of
; at least 2048 bits, so the checks expect three loads and a single predicated
; fmad under `ptrue p0.s, vl64`.
810 define void @fma_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
811 ; CHECK-LABEL: fma_v64f32:
813 ; CHECK-NEXT: ptrue p0.s, vl64
814 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
815 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
816 ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x2]
817 ; CHECK-NEXT: fmad z0.s, p0/m, z1.s, z2.s
818 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
820 %op1 = load <64 x float>, ptr %a
821 %op2 = load <64 x float>, ptr %b
822 %op3 = load <64 x float>, ptr %c
823 %res = call <64 x float> @llvm.fma.v64f32(<64 x float> %op1, <64 x float> %op2, <64 x float> %op3)
824 store <64 x float> %res, ptr %a
828 ; Don't use SVE for 64-bit vectors.
; llvm.fma on <1 x double>: the check expects a single scalar fmadd on the
; d registers, writing d0 directly with no follow-up register move.
829 define <1 x double> @fma_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3) vscale_range(2,0) #0 {
830 ; CHECK-LABEL: fma_v1f64:
832 ; CHECK-NEXT: fmadd d0, d0, d1, d2
834 %res = call <1 x double> @llvm.fma.v1f64(<1 x double> %op1, <1 x double> %op2, <1 x double> %op3)
835 ret <1 x double> %res
838 ; Don't use SVE for 128-bit vectors.
; llvm.fma on <2 x double>: the checks expect an unpredicated fmla that
; accumulates into v2 (the register holding %op3), followed by a full-register
; mov that copies the result into the return register v0.
839 define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3) vscale_range(2,0) #0 {
840 ; CHECK-LABEL: fma_v2f64:
842 ; CHECK-NEXT: fmla v2.2d, v1.2d, v0.2d
843 ; CHECK-NEXT: mov v0.16b, v2.16b
845 %res = call <2 x double> @llvm.fma.v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double> %op3)
846 ret <2 x double> %res
; 256-bit llvm.fma (4 x double) over operands loaded from %a, %b and %c.
; vscale_range(2,0) promises SVE registers of at least 256 bits, so the checks
; expect one predicated fmad under `ptrue p0.d, vl4`; its destination z0 is
; the register loaded from %a, which is also where the result is stored.
849 define void @fma_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 {
850 ; CHECK-LABEL: fma_v4f64:
852 ; CHECK-NEXT: ptrue p0.d, vl4
853 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
854 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
855 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
856 ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
857 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
859 %op1 = load <4 x double>, ptr %a
860 %op2 = load <4 x double>, ptr %b
861 %op3 = load <4 x double>, ptr %c
862 %res = call <4 x double> @llvm.fma.v4f64(<4 x double> %op1, <4 x double> %op2, <4 x double> %op3)
863 store <4 x double> %res, ptr %a
; 512-bit llvm.fma (8 x double) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines:
; VBITS_GE_256 expects two 256-bit halves (six loads, two fmads, upper half at
; element offset x8 = #4), while VBITS_GE_512 expects one full-width fmad.
867 define void @fma_v8f64(ptr %a, ptr %b, ptr %c) #0 {
868 ; VBITS_GE_256-LABEL: fma_v8f64:
869 ; VBITS_GE_256: // %bb.0:
870 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
871 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
872 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
873 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
874 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
875 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
876 ; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x2, x8, lsl #3]
877 ; VBITS_GE_256-NEXT: ld1d { z5.d }, p0/z, [x2]
878 ; VBITS_GE_256-NEXT: fmad z0.d, p0/m, z2.d, z4.d
879 ; VBITS_GE_256-NEXT: fmad z1.d, p0/m, z3.d, z5.d
880 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
881 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
882 ; VBITS_GE_256-NEXT: ret
884 ; VBITS_GE_512-LABEL: fma_v8f64:
885 ; VBITS_GE_512: // %bb.0:
886 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
887 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
888 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
889 ; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x2]
890 ; VBITS_GE_512-NEXT: fmad z0.d, p0/m, z1.d, z2.d
891 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
892 ; VBITS_GE_512-NEXT: ret
893 %op1 = load <8 x double>, ptr %a
894 %op2 = load <8 x double>, ptr %b
895 %op3 = load <8 x double>, ptr %c
896 %res = call <8 x double> @llvm.fma.v8f64(<8 x double> %op1, <8 x double> %op2, <8 x double> %op3)
897 store <8 x double> %res, ptr %a
; 1024-bit llvm.fma (16 x double). vscale_range(8,0) promises SVE registers of
; at least 1024 bits, so every RUN configuration shares one CHECK body: three
; loads and a single predicated fmad under `ptrue p0.d, vl16`.
901 define void @fma_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 {
902 ; CHECK-LABEL: fma_v16f64:
904 ; CHECK-NEXT: ptrue p0.d, vl16
905 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
906 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
907 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
908 ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
909 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
911 %op1 = load <16 x double>, ptr %a
912 %op2 = load <16 x double>, ptr %b
913 %op3 = load <16 x double>, ptr %c
914 %res = call <16 x double> @llvm.fma.v16f64(<16 x double> %op1, <16 x double> %op2, <16 x double> %op3)
915 store <16 x double> %res, ptr %a
; 2048-bit llvm.fma (32 x double). vscale_range(16,0) promises SVE registers
; of at least 2048 bits, so the checks expect three loads and a single
; predicated fmad under `ptrue p0.d, vl32`.
919 define void @fma_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 {
920 ; CHECK-LABEL: fma_v32f64:
922 ; CHECK-NEXT: ptrue p0.d, vl32
923 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
924 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
925 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2]
926 ; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d
927 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
929 %op1 = load <32 x double>, ptr %a
930 %op2 = load <32 x double>, ptr %b
931 %op3 = load <32 x double>, ptr %c
932 %res = call <32 x double> @llvm.fma.v32f64(<32 x double> %op1, <32 x double> %op2, <32 x double> %op3)
933 store <32 x double> %res, ptr %a
941 ; Don't use SVE for 64-bit vectors.
; <4 x half> is 64 bits wide: the check expects a single unpredicated fmul on
; the .4h arrangement rather than a predicated SVE instruction.
942 define <4 x half> @fmul_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
943 ; CHECK-LABEL: fmul_v4f16:
945 ; CHECK-NEXT: fmul v0.4h, v0.4h, v1.4h
947 %res = fmul <4 x half> %op1, %op2
951 ; Don't use SVE for 128-bit vectors.
; <8 x half> is 128 bits wide: the check expects a single unpredicated fmul on
; the .8h arrangement rather than a predicated SVE instruction.
952 define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
953 ; CHECK-LABEL: fmul_v8f16:
955 ; CHECK-NEXT: fmul v0.8h, v0.8h, v1.8h
957 %res = fmul <8 x half> %op1, %op2
; 256-bit vector (16 x half). vscale_range(2,0) promises SVE registers of at
; least 256 bits, so the checks expect one predicated fmul under
; `ptrue p0.h, vl16`, with the product stored back through %a.
961 define void @fmul_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
962 ; CHECK-LABEL: fmul_v16f16:
964 ; CHECK-NEXT: ptrue p0.h, vl16
965 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
966 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
967 ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
968 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
970 %op1 = load <16 x half>, ptr %a
971 %op2 = load <16 x half>, ptr %b
972 %res = fmul <16 x half> %op1, %op2
973 store <16 x half> %res, ptr %a
; 512-bit vector (32 x half) with no vscale_range attribute, so the lowering
; depends on the -aarch64-sve-vector-bits-min value from the RUN lines:
; VBITS_GE_256 expects two 256-bit halves (upper half at element offset
; x8 = #16), while VBITS_GE_512 expects one full-width operation (vl32).
977 define void @fmul_v32f16(ptr %a, ptr %b) #0 {
978 ; VBITS_GE_256-LABEL: fmul_v32f16:
979 ; VBITS_GE_256: // %bb.0:
980 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
981 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
982 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
983 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
984 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
985 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
986 ; VBITS_GE_256-NEXT: fmul z0.h, p0/m, z0.h, z2.h
987 ; VBITS_GE_256-NEXT: fmul z1.h, p0/m, z1.h, z3.h
988 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
989 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
990 ; VBITS_GE_256-NEXT: ret
992 ; VBITS_GE_512-LABEL: fmul_v32f16:
993 ; VBITS_GE_512: // %bb.0:
994 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
995 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
996 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
997 ; VBITS_GE_512-NEXT: fmul z0.h, p0/m, z0.h, z1.h
998 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
999 ; VBITS_GE_512-NEXT: ret
1000 %op1 = load <32 x half>, ptr %a
1001 %op2 = load <32 x half>, ptr %b
1002 %res = fmul <32 x half> %op1, %op2
1003 store <32 x half> %res, ptr %a
; 64 x half = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl64, stored back to %a.
1007 define void @fmul_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1008 ; CHECK-LABEL: fmul_v64f16:
1010 ; CHECK-NEXT: ptrue p0.h, vl64
1011 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1012 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1013 ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
1014 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1016 %op1 = load <64 x half>, ptr %a
1017 %op2 = load <64 x half>, ptr %b
1018 %res = fmul <64 x half> %op1, %op2
1019 store <64 x half> %res, ptr %a
; 128 x half = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl128, stored back to %a.
1023 define void @fmul_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1024 ; CHECK-LABEL: fmul_v128f16:
1026 ; CHECK-NEXT: ptrue p0.h, vl128
1027 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1028 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1029 ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
1030 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1032 %op1 = load <128 x half>, ptr %a
1033 %op2 = load <128 x half>, ptr %b
1034 %res = fmul <128 x half> %op1, %op2
1035 store <128 x half> %res, ptr %a
1039 ; Don't use SVE for 64-bit vectors.
; <2 x float> is a 64-bit vector: the expected code is one NEON fmul on v0.2s/v1.2s.
1040 define <2 x float> @fmul_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
1041 ; CHECK-LABEL: fmul_v2f32:
1043 ; CHECK-NEXT: fmul v0.2s, v0.2s, v1.2s
1045 %res = fmul <2 x float> %op1, %op2
1046 ret <2 x float> %res
1049 ; Don't use SVE for 128-bit vectors.
; <4 x float> is a 128-bit vector: the expected code is one NEON fmul on v0.4s/v1.4s.
1050 define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
1051 ; CHECK-LABEL: fmul_v4f32:
1053 ; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
1055 %res = fmul <4 x float> %op1, %op2
1056 ret <4 x float> %res
; 8 x float = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl8, stored back to %a.
1059 define void @fmul_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1060 ; CHECK-LABEL: fmul_v8f32:
1062 ; CHECK-NEXT: ptrue p0.s, vl8
1063 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1064 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1065 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1066 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1068 %op1 = load <8 x float>, ptr %a
1069 %op2 = load <8 x float>, ptr %b
1070 %res = fmul <8 x float> %op1, %op2
1071 store <8 x float> %res, ptr %a
; 16 x float = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl8 halves (upper half at element offset x8 = 8);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl16 operation.
1075 define void @fmul_v16f32(ptr %a, ptr %b) #0 {
1076 ; VBITS_GE_256-LABEL: fmul_v16f32:
1077 ; VBITS_GE_256: // %bb.0:
1078 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1079 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1080 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1081 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1082 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
1083 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
1084 ; VBITS_GE_256-NEXT: fmul z0.s, p0/m, z0.s, z2.s
1085 ; VBITS_GE_256-NEXT: fmul z1.s, p0/m, z1.s, z3.s
1086 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1087 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1088 ; VBITS_GE_256-NEXT: ret
1090 ; VBITS_GE_512-LABEL: fmul_v16f32:
1091 ; VBITS_GE_512: // %bb.0:
1092 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1093 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1094 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
1095 ; VBITS_GE_512-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1096 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1097 ; VBITS_GE_512-NEXT: ret
1098 %op1 = load <16 x float>, ptr %a
1099 %op2 = load <16 x float>, ptr %b
1100 %res = fmul <16 x float> %op1, %op2
1101 store <16 x float> %res, ptr %a
; 32 x float = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl32, stored back to %a.
1105 define void @fmul_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1106 ; CHECK-LABEL: fmul_v32f32:
1108 ; CHECK-NEXT: ptrue p0.s, vl32
1109 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1110 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1111 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1112 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1114 %op1 = load <32 x float>, ptr %a
1115 %op2 = load <32 x float>, ptr %b
1116 %res = fmul <32 x float> %op1, %op2
1117 store <32 x float> %res, ptr %a
; 64 x float = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl64, stored back to %a.
1121 define void @fmul_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1122 ; CHECK-LABEL: fmul_v64f32:
1124 ; CHECK-NEXT: ptrue p0.s, vl64
1125 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1126 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1127 ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
1128 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1130 %op1 = load <64 x float>, ptr %a
1131 %op2 = load <64 x float>, ptr %b
1132 %res = fmul <64 x float> %op1, %op2
1133 store <64 x float> %res, ptr %a
1137 ; Don't use SVE for 64-bit vectors.
; <1 x double> is a 64-bit vector: the expected code is the scalar form fmul d0, d0, d1.
1138 define <1 x double> @fmul_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
1139 ; CHECK-LABEL: fmul_v1f64:
1141 ; CHECK-NEXT: fmul d0, d0, d1
1143 %res = fmul <1 x double> %op1, %op2
1144 ret <1 x double> %res
1147 ; Don't use SVE for 128-bit vectors.
; <2 x double> is a 128-bit vector: the expected code is one NEON fmul on v0.2d/v1.2d.
1148 define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
1149 ; CHECK-LABEL: fmul_v2f64:
1151 ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
1153 %res = fmul <2 x double> %op1, %op2
1154 ret <2 x double> %res
; 4 x double = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl4, stored back to %a.
1157 define void @fmul_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1158 ; CHECK-LABEL: fmul_v4f64:
1160 ; CHECK-NEXT: ptrue p0.d, vl4
1161 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1162 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1163 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1164 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1166 %op1 = load <4 x double>, ptr %a
1167 %op2 = load <4 x double>, ptr %b
1168 %res = fmul <4 x double> %op1, %op2
1169 store <4 x double> %res, ptr %a
; 8 x double = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl4 halves (upper half at element offset x8 = 4);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl8 operation.
1173 define void @fmul_v8f64(ptr %a, ptr %b) #0 {
1174 ; VBITS_GE_256-LABEL: fmul_v8f64:
1175 ; VBITS_GE_256: // %bb.0:
1176 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1177 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1178 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1179 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1180 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
1181 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
1182 ; VBITS_GE_256-NEXT: fmul z0.d, p0/m, z0.d, z2.d
1183 ; VBITS_GE_256-NEXT: fmul z1.d, p0/m, z1.d, z3.d
1184 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1185 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1186 ; VBITS_GE_256-NEXT: ret
1188 ; VBITS_GE_512-LABEL: fmul_v8f64:
1189 ; VBITS_GE_512: // %bb.0:
1190 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1191 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1192 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
1193 ; VBITS_GE_512-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1194 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1195 ; VBITS_GE_512-NEXT: ret
1196 %op1 = load <8 x double>, ptr %a
1197 %op2 = load <8 x double>, ptr %b
1198 %res = fmul <8 x double> %op1, %op2
1199 store <8 x double> %res, ptr %a
; 16 x double = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl16, stored back to %a.
1203 define void @fmul_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
1204 ; CHECK-LABEL: fmul_v16f64:
1206 ; CHECK-NEXT: ptrue p0.d, vl16
1207 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1208 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1209 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1210 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1212 %op1 = load <16 x double>, ptr %a
1213 %op2 = load <16 x double>, ptr %b
1214 %res = fmul <16 x double> %op1, %op2
1215 store <16 x double> %res, ptr %a
; 32 x double = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fmul under ptrue vl32, stored back to %a.
1219 define void @fmul_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
1220 ; CHECK-LABEL: fmul_v32f64:
1222 ; CHECK-NEXT: ptrue p0.d, vl32
1223 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1224 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1225 ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
1226 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1228 %op1 = load <32 x double>, ptr %a
1229 %op2 = load <32 x double>, ptr %b
1230 %res = fmul <32 x double> %op1, %op2
1231 store <32 x double> %res, ptr %a
1239 ; Don't use SVE for 64-bit vectors.
; <4 x half> is a 64-bit vector: the expected code is one NEON fneg on v0.4h.
1240 define <4 x half> @fneg_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
1241 ; CHECK-LABEL: fneg_v4f16:
1243 ; CHECK-NEXT: fneg v0.4h, v0.4h
1245 %res = fneg <4 x half> %op
1249 ; Don't use SVE for 128-bit vectors.
; <8 x half> is a 128-bit vector: the expected code is one NEON fneg on v0.8h.
1250 define <8 x half> @fneg_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
1251 ; CHECK-LABEL: fneg_v8f16:
1253 ; CHECK-NEXT: fneg v0.8h, v0.8h
1255 %res = fneg <8 x half> %op
; 16 x half = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl16, stored back to %a.
; NOTE(review): %b is not referenced by the visible body, and the larger fneg tests take only
; %a - confirm whether the extra parameter is intentional.
1259 define void @fneg_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
1260 ; CHECK-LABEL: fneg_v16f16:
1262 ; CHECK-NEXT: ptrue p0.h, vl16
1263 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1264 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h
1265 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1267 %op = load <16 x half>, ptr %a
1268 %res = fneg <16 x half> %op
1269 store <16 x half> %res, ptr %a
; 32 x half = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl16 halves (upper half at element offset x8 = 16);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl32 operation.
1273 define void @fneg_v32f16(ptr %a) #0 {
1274 ; VBITS_GE_256-LABEL: fneg_v32f16:
1275 ; VBITS_GE_256: // %bb.0:
1276 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1277 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1278 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1279 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
1280 ; VBITS_GE_256-NEXT: fneg z0.h, p0/m, z0.h
1281 ; VBITS_GE_256-NEXT: fneg z1.h, p0/m, z1.h
1282 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1283 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
1284 ; VBITS_GE_256-NEXT: ret
1286 ; VBITS_GE_512-LABEL: fneg_v32f16:
1287 ; VBITS_GE_512: // %bb.0:
1288 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1289 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1290 ; VBITS_GE_512-NEXT: fneg z0.h, p0/m, z0.h
1291 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
1292 ; VBITS_GE_512-NEXT: ret
1293 %op = load <32 x half>, ptr %a
1294 %res = fneg <32 x half> %op
1295 store <32 x half> %res, ptr %a
; 64 x half = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl64, stored back to %a.
1299 define void @fneg_v64f16(ptr %a) vscale_range(8,0) #0 {
1300 ; CHECK-LABEL: fneg_v64f16:
1302 ; CHECK-NEXT: ptrue p0.h, vl64
1303 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1304 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h
1305 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1307 %op = load <64 x half>, ptr %a
1308 %res = fneg <64 x half> %op
1309 store <64 x half> %res, ptr %a
; 128 x half = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl128, stored back to %a.
1313 define void @fneg_v128f16(ptr %a) vscale_range(16,0) #0 {
1314 ; CHECK-LABEL: fneg_v128f16:
1316 ; CHECK-NEXT: ptrue p0.h, vl128
1317 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1318 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h
1319 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1321 %op = load <128 x half>, ptr %a
1322 %res = fneg <128 x half> %op
1323 store <128 x half> %res, ptr %a
1327 ; Don't use SVE for 64-bit vectors.
; <2 x float> is a 64-bit vector: the expected code is one NEON fneg on v0.2s.
1328 define <2 x float> @fneg_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
1329 ; CHECK-LABEL: fneg_v2f32:
1331 ; CHECK-NEXT: fneg v0.2s, v0.2s
1333 %res = fneg <2 x float> %op
1334 ret <2 x float> %res
1337 ; Don't use SVE for 128-bit vectors.
; <4 x float> is a 128-bit vector: the expected code is one NEON fneg on v0.4s.
1338 define <4 x float> @fneg_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
1339 ; CHECK-LABEL: fneg_v4f32:
1341 ; CHECK-NEXT: fneg v0.4s, v0.4s
1343 %res = fneg <4 x float> %op
1344 ret <4 x float> %res
; 8 x float = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl8, stored back to %a.
1347 define void @fneg_v8f32(ptr %a) vscale_range(2,0) #0 {
1348 ; CHECK-LABEL: fneg_v8f32:
1350 ; CHECK-NEXT: ptrue p0.s, vl8
1351 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1352 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s
1353 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1355 %op = load <8 x float>, ptr %a
1356 %res = fneg <8 x float> %op
1357 store <8 x float> %res, ptr %a
; 16 x float = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl8 halves (upper half at element offset x8 = 8);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl16 operation.
1361 define void @fneg_v16f32(ptr %a) #0 {
1362 ; VBITS_GE_256-LABEL: fneg_v16f32:
1363 ; VBITS_GE_256: // %bb.0:
1364 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1365 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1366 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1367 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1368 ; VBITS_GE_256-NEXT: fneg z0.s, p0/m, z0.s
1369 ; VBITS_GE_256-NEXT: fneg z1.s, p0/m, z1.s
1370 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1371 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1372 ; VBITS_GE_256-NEXT: ret
1374 ; VBITS_GE_512-LABEL: fneg_v16f32:
1375 ; VBITS_GE_512: // %bb.0:
1376 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1377 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1378 ; VBITS_GE_512-NEXT: fneg z0.s, p0/m, z0.s
1379 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1380 ; VBITS_GE_512-NEXT: ret
1381 %op = load <16 x float>, ptr %a
1382 %res = fneg <16 x float> %op
1383 store <16 x float> %res, ptr %a
; 32 x float = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl32, stored back to %a.
1387 define void @fneg_v32f32(ptr %a) vscale_range(8,0) #0 {
1388 ; CHECK-LABEL: fneg_v32f32:
1390 ; CHECK-NEXT: ptrue p0.s, vl32
1391 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1392 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s
1393 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1395 %op = load <32 x float>, ptr %a
1396 %res = fneg <32 x float> %op
1397 store <32 x float> %res, ptr %a
; 64 x float = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl64, stored back to %a.
1401 define void @fneg_v64f32(ptr %a) vscale_range(16,0) #0 {
1402 ; CHECK-LABEL: fneg_v64f32:
1404 ; CHECK-NEXT: ptrue p0.s, vl64
1405 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1406 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s
1407 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1409 %op = load <64 x float>, ptr %a
1410 %res = fneg <64 x float> %op
1411 store <64 x float> %res, ptr %a
1415 ; Don't use SVE for 64-bit vectors.
; <1 x double> is a 64-bit vector: the expected code is the scalar form fneg d0, d0.
1416 define <1 x double> @fneg_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
1417 ; CHECK-LABEL: fneg_v1f64:
1419 ; CHECK-NEXT: fneg d0, d0
1421 %res = fneg <1 x double> %op
1422 ret <1 x double> %res
1425 ; Don't use SVE for 128-bit vectors.
; <2 x double> is a 128-bit vector: the expected code is one NEON fneg on v0.2d.
1426 define <2 x double> @fneg_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
1427 ; CHECK-LABEL: fneg_v2f64:
1429 ; CHECK-NEXT: fneg v0.2d, v0.2d
1431 %res = fneg <2 x double> %op
1432 ret <2 x double> %res
; 4 x double = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl4, stored back to %a.
1435 define void @fneg_v4f64(ptr %a) vscale_range(2,0) #0 {
1436 ; CHECK-LABEL: fneg_v4f64:
1438 ; CHECK-NEXT: ptrue p0.d, vl4
1439 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1440 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d
1441 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1443 %op = load <4 x double>, ptr %a
1444 %res = fneg <4 x double> %op
1445 store <4 x double> %res, ptr %a
; 8 x double = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl4 halves (upper half at element offset x8 = 4);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl8 operation.
1449 define void @fneg_v8f64(ptr %a) #0 {
1450 ; VBITS_GE_256-LABEL: fneg_v8f64:
1451 ; VBITS_GE_256: // %bb.0:
1452 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1453 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1454 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1455 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1456 ; VBITS_GE_256-NEXT: fneg z0.d, p0/m, z0.d
1457 ; VBITS_GE_256-NEXT: fneg z1.d, p0/m, z1.d
1458 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1459 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1460 ; VBITS_GE_256-NEXT: ret
1462 ; VBITS_GE_512-LABEL: fneg_v8f64:
1463 ; VBITS_GE_512: // %bb.0:
1464 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1465 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1466 ; VBITS_GE_512-NEXT: fneg z0.d, p0/m, z0.d
1467 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1468 ; VBITS_GE_512-NEXT: ret
1469 %op = load <8 x double>, ptr %a
1470 %res = fneg <8 x double> %op
1471 store <8 x double> %res, ptr %a
; 16 x double = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl16, stored back to %a.
1475 define void @fneg_v16f64(ptr %a) vscale_range(8,0) #0 {
1476 ; CHECK-LABEL: fneg_v16f64:
1478 ; CHECK-NEXT: ptrue p0.d, vl16
1479 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1480 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d
1481 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1483 %op = load <16 x double>, ptr %a
1484 %res = fneg <16 x double> %op
1485 store <16 x double> %res, ptr %a
; 32 x double = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fneg under ptrue vl32, stored back to %a.
1489 define void @fneg_v32f64(ptr %a) vscale_range(16,0) #0 {
1490 ; CHECK-LABEL: fneg_v32f64:
1492 ; CHECK-NEXT: ptrue p0.d, vl32
1493 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1494 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d
1495 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1497 %op = load <32 x double>, ptr %a
1498 %res = fneg <32 x double> %op
1499 store <32 x double> %res, ptr %a
1507 ; Don't use SVE for 64-bit vectors.
; <4 x half> is a 64-bit vector: @llvm.sqrt.v4f16 is expected to become one NEON fsqrt on v0.4h.
1508 define <4 x half> @fsqrt_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
1509 ; CHECK-LABEL: fsqrt_v4f16:
1511 ; CHECK-NEXT: fsqrt v0.4h, v0.4h
1513 %res = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %op)
1517 ; Don't use SVE for 128-bit vectors.
; <8 x half> is a 128-bit vector: @llvm.sqrt.v8f16 is expected to become one NEON fsqrt on v0.8h.
1518 define <8 x half> @fsqrt_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
1519 ; CHECK-LABEL: fsqrt_v8f16:
1521 ; CHECK-NEXT: fsqrt v0.8h, v0.8h
1523 %res = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %op)
; 16 x half = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl16, stored back to %a.
; NOTE(review): %b is not referenced by the visible body, and the larger fsqrt tests take only
; %a - confirm whether the extra parameter is intentional.
1527 define void @fsqrt_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
1528 ; CHECK-LABEL: fsqrt_v16f16:
1530 ; CHECK-NEXT: ptrue p0.h, vl16
1531 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1532 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
1533 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1535 %op = load <16 x half>, ptr %a
1536 %res = call <16 x half> @llvm.sqrt.v16f16(<16 x half> %op)
1537 store <16 x half> %res, ptr %a
; 32 x half = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl16 halves (upper half at element offset x8 = 16);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl32 operation.
; The VBITS_GE_256 checks pin the order, with the second load between the two fsqrt ops.
1541 define void @fsqrt_v32f16(ptr %a) #0 {
1542 ; VBITS_GE_256-LABEL: fsqrt_v32f16:
1543 ; VBITS_GE_256: // %bb.0:
1544 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1545 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1546 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1547 ; VBITS_GE_256-NEXT: fsqrt z0.h, p0/m, z0.h
1548 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
1549 ; VBITS_GE_256-NEXT: fsqrt z1.h, p0/m, z1.h
1550 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1551 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
1552 ; VBITS_GE_256-NEXT: ret
1554 ; VBITS_GE_512-LABEL: fsqrt_v32f16:
1555 ; VBITS_GE_512: // %bb.0:
1556 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1557 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1558 ; VBITS_GE_512-NEXT: fsqrt z0.h, p0/m, z0.h
1559 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
1560 ; VBITS_GE_512-NEXT: ret
1561 %op = load <32 x half>, ptr %a
1562 %res = call <32 x half> @llvm.sqrt.v32f16(<32 x half> %op)
1563 store <32 x half> %res, ptr %a
; 64 x half = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl64, stored back to %a.
1567 define void @fsqrt_v64f16(ptr %a) vscale_range(8,0) #0 {
1568 ; CHECK-LABEL: fsqrt_v64f16:
1570 ; CHECK-NEXT: ptrue p0.h, vl64
1571 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1572 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
1573 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1575 %op = load <64 x half>, ptr %a
1576 %res = call <64 x half> @llvm.sqrt.v64f16(<64 x half> %op)
1577 store <64 x half> %res, ptr %a
; 128 x half = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl128, stored back to %a.
1581 define void @fsqrt_v128f16(ptr %a) vscale_range(16,0) #0 {
1582 ; CHECK-LABEL: fsqrt_v128f16:
1584 ; CHECK-NEXT: ptrue p0.h, vl128
1585 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1586 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h
1587 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1589 %op = load <128 x half>, ptr %a
1590 %res = call <128 x half> @llvm.sqrt.v128f16(<128 x half> %op)
1591 store <128 x half> %res, ptr %a
1595 ; Don't use SVE for 64-bit vectors.
; <2 x float> is a 64-bit vector: @llvm.sqrt.v2f32 is expected to become one NEON fsqrt on v0.2s.
1596 define <2 x float> @fsqrt_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
1597 ; CHECK-LABEL: fsqrt_v2f32:
1599 ; CHECK-NEXT: fsqrt v0.2s, v0.2s
1601 %res = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %op)
1602 ret <2 x float> %res
1605 ; Don't use SVE for 128-bit vectors.
; <4 x float> is a 128-bit vector: @llvm.sqrt.v4f32 is expected to become one NEON fsqrt on v0.4s.
1606 define <4 x float> @fsqrt_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
1607 ; CHECK-LABEL: fsqrt_v4f32:
1609 ; CHECK-NEXT: fsqrt v0.4s, v0.4s
1611 %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %op)
1612 ret <4 x float> %res
; 8 x float = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl8, stored back to %a.
1615 define void @fsqrt_v8f32(ptr %a) vscale_range(2,0) #0 {
1616 ; CHECK-LABEL: fsqrt_v8f32:
1618 ; CHECK-NEXT: ptrue p0.s, vl8
1619 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1620 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
1621 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1623 %op = load <8 x float>, ptr %a
1624 %res = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %op)
1625 store <8 x float> %res, ptr %a
; 16 x float = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl8 halves (upper half at element offset x8 = 8);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl16 operation.
; The VBITS_GE_256 checks pin the order, with the second load between the two fsqrt ops.
1629 define void @fsqrt_v16f32(ptr %a) #0 {
1630 ; VBITS_GE_256-LABEL: fsqrt_v16f32:
1631 ; VBITS_GE_256: // %bb.0:
1632 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1633 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1634 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1635 ; VBITS_GE_256-NEXT: fsqrt z0.s, p0/m, z0.s
1636 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1637 ; VBITS_GE_256-NEXT: fsqrt z1.s, p0/m, z1.s
1638 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1639 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1640 ; VBITS_GE_256-NEXT: ret
1642 ; VBITS_GE_512-LABEL: fsqrt_v16f32:
1643 ; VBITS_GE_512: // %bb.0:
1644 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1645 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1646 ; VBITS_GE_512-NEXT: fsqrt z0.s, p0/m, z0.s
1647 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1648 ; VBITS_GE_512-NEXT: ret
1649 %op = load <16 x float>, ptr %a
1650 %res = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %op)
1651 store <16 x float> %res, ptr %a
; 32 x float = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl32, stored back to %a.
1655 define void @fsqrt_v32f32(ptr %a) vscale_range(8,0) #0 {
1656 ; CHECK-LABEL: fsqrt_v32f32:
1658 ; CHECK-NEXT: ptrue p0.s, vl32
1659 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1660 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
1661 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1663 %op = load <32 x float>, ptr %a
1664 %res = call <32 x float> @llvm.sqrt.v32f32(<32 x float> %op)
1665 store <32 x float> %res, ptr %a
; 64 x float = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl64, stored back to %a.
1669 define void @fsqrt_v64f32(ptr %a) vscale_range(16,0) #0 {
1670 ; CHECK-LABEL: fsqrt_v64f32:
1672 ; CHECK-NEXT: ptrue p0.s, vl64
1673 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1674 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s
1675 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1677 %op = load <64 x float>, ptr %a
1678 %res = call <64 x float> @llvm.sqrt.v64f32(<64 x float> %op)
1679 store <64 x float> %res, ptr %a
1683 ; Don't use SVE for 64-bit vectors.
; <1 x double> is a 64-bit vector: @llvm.sqrt.v1f64 is expected to become the scalar fsqrt d0, d0.
1684 define <1 x double> @fsqrt_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
1685 ; CHECK-LABEL: fsqrt_v1f64:
1687 ; CHECK-NEXT: fsqrt d0, d0
1689 %res = call <1 x double> @llvm.sqrt.v1f64(<1 x double> %op)
1690 ret <1 x double> %res
1693 ; Don't use SVE for 128-bit vectors.
; <2 x double> is a 128-bit vector: @llvm.sqrt.v2f64 is expected to become one NEON fsqrt on v0.2d.
1694 define <2 x double> @fsqrt_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
1695 ; CHECK-LABEL: fsqrt_v2f64:
1697 ; CHECK-NEXT: fsqrt v0.2d, v0.2d
1699 %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %op)
1700 ret <2 x double> %res
; 4 x double = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl4, stored back to %a.
1703 define void @fsqrt_v4f64(ptr %a) vscale_range(2,0) #0 {
1704 ; CHECK-LABEL: fsqrt_v4f64:
1706 ; CHECK-NEXT: ptrue p0.d, vl4
1707 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1708 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
1709 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1711 %op = load <4 x double>, ptr %a
1712 %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %op)
1713 store <4 x double> %res, ptr %a
; 8 x double = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl4 halves (upper half at element offset x8 = 4);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl8 operation.
; The VBITS_GE_256 checks pin the order, with the second load between the two fsqrt ops.
1717 define void @fsqrt_v8f64(ptr %a) #0 {
1718 ; VBITS_GE_256-LABEL: fsqrt_v8f64:
1719 ; VBITS_GE_256: // %bb.0:
1720 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1721 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1722 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1723 ; VBITS_GE_256-NEXT: fsqrt z0.d, p0/m, z0.d
1724 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1725 ; VBITS_GE_256-NEXT: fsqrt z1.d, p0/m, z1.d
1726 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
1727 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
1728 ; VBITS_GE_256-NEXT: ret
1730 ; VBITS_GE_512-LABEL: fsqrt_v8f64:
1731 ; VBITS_GE_512: // %bb.0:
1732 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1733 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1734 ; VBITS_GE_512-NEXT: fsqrt z0.d, p0/m, z0.d
1735 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
1736 ; VBITS_GE_512-NEXT: ret
1737 %op = load <8 x double>, ptr %a
1738 %res = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %op)
1739 store <8 x double> %res, ptr %a
; 16 x double = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl16, stored back to %a.
1743 define void @fsqrt_v16f64(ptr %a) vscale_range(8,0) #0 {
1744 ; CHECK-LABEL: fsqrt_v16f64:
1746 ; CHECK-NEXT: ptrue p0.d, vl16
1747 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1748 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
1749 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1751 %op = load <16 x double>, ptr %a
1752 %res = call <16 x double> @llvm.sqrt.v16f64(<16 x double> %op)
1753 store <16 x double> %res, ptr %a
; 32 x double = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fsqrt under ptrue vl32, stored back to %a.
1757 define void @fsqrt_v32f64(ptr %a) vscale_range(16,0) #0 {
1758 ; CHECK-LABEL: fsqrt_v32f64:
1760 ; CHECK-NEXT: ptrue p0.d, vl32
1761 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1762 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d
1763 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
1765 %op = load <32 x double>, ptr %a
1766 %res = call <32 x double> @llvm.sqrt.v32f64(<32 x double> %op)
1767 store <32 x double> %res, ptr %a
1775 ; Don't use SVE for 64-bit vectors.
; <4 x half> is a 64-bit vector: the expected code is one NEON fsub on v0.4h/v1.4h.
1776 define <4 x half> @fsub_v4f16(<4 x half> %op1, <4 x half> %op2) vscale_range(2,0) #0 {
1777 ; CHECK-LABEL: fsub_v4f16:
1779 ; CHECK-NEXT: fsub v0.4h, v0.4h, v1.4h
1781 %res = fsub <4 x half> %op1, %op2
1785 ; Don't use SVE for 128-bit vectors.
; <8 x half> is a 128-bit vector: the expected code is one NEON fsub on v0.8h/v1.8h.
1786 define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) vscale_range(2,0) #0 {
1787 ; CHECK-LABEL: fsub_v8f16:
1789 ; CHECK-NEXT: fsub v0.8h, v0.8h, v1.8h
1791 %res = fsub <8 x half> %op1, %op2
; 16 x half = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fsub under ptrue vl16, stored back to %a.
1795 define void @fsub_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
1796 ; CHECK-LABEL: fsub_v16f16:
1798 ; CHECK-NEXT: ptrue p0.h, vl16
1799 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1800 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1801 ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1802 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1804 %op1 = load <16 x half>, ptr %a
1805 %op2 = load <16 x half>, ptr %b
1806 %res = fsub <16 x half> %op1, %op2
1807 store <16 x half> %res, ptr %a
; 32 x half = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl16 halves (upper half at element offset x8 = 16);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl32 operation.
1811 define void @fsub_v32f16(ptr %a, ptr %b) #0 {
1812 ; VBITS_GE_256-LABEL: fsub_v32f16:
1813 ; VBITS_GE_256: // %bb.0:
1814 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1815 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1816 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1817 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
1818 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
1819 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
1820 ; VBITS_GE_256-NEXT: fsub z0.h, p0/m, z0.h, z2.h
1821 ; VBITS_GE_256-NEXT: fsub z1.h, p0/m, z1.h, z3.h
1822 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
1823 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
1824 ; VBITS_GE_256-NEXT: ret
1826 ; VBITS_GE_512-LABEL: fsub_v32f16:
1827 ; VBITS_GE_512: // %bb.0:
1828 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1829 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1830 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
1831 ; VBITS_GE_512-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1832 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
1833 ; VBITS_GE_512-NEXT: ret
1834 %op1 = load <32 x half>, ptr %a
1835 %op2 = load <32 x half>, ptr %b
1836 %res = fsub <32 x half> %op1, %op2
1837 store <32 x half> %res, ptr %a
; 64 x half = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fsub under ptrue vl64, stored back to %a.
1841 define void @fsub_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
1842 ; CHECK-LABEL: fsub_v64f16:
1844 ; CHECK-NEXT: ptrue p0.h, vl64
1845 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1846 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1847 ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1848 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1850 %op1 = load <64 x half>, ptr %a
1851 %op2 = load <64 x half>, ptr %b
1852 %res = fsub <64 x half> %op1, %op2
1853 store <64 x half> %res, ptr %a
; 128 x half = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fsub under ptrue vl128, stored back to %a.
1857 define void @fsub_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
1858 ; CHECK-LABEL: fsub_v128f16:
1860 ; CHECK-NEXT: ptrue p0.h, vl128
1861 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1862 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
1863 ; CHECK-NEXT: fsub z0.h, p0/m, z0.h, z1.h
1864 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
1866 %op1 = load <128 x half>, ptr %a
1867 %op2 = load <128 x half>, ptr %b
1868 %res = fsub <128 x half> %op1, %op2
1869 store <128 x half> %res, ptr %a
1873 ; Don't use SVE for 64-bit vectors.
; <2 x float> is a 64-bit vector: the expected code is one NEON fsub on v0.2s/v1.2s.
1874 define <2 x float> @fsub_v2f32(<2 x float> %op1, <2 x float> %op2) vscale_range(2,0) #0 {
1875 ; CHECK-LABEL: fsub_v2f32:
1877 ; CHECK-NEXT: fsub v0.2s, v0.2s, v1.2s
1879 %res = fsub <2 x float> %op1, %op2
1880 ret <2 x float> %res
1883 ; Don't use SVE for 128-bit vectors.
; <4 x float> is a 128-bit vector: the expected code is one NEON fsub on v0.4s/v1.4s.
1884 define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) vscale_range(2,0) #0 {
1885 ; CHECK-LABEL: fsub_v4f32:
1887 ; CHECK-NEXT: fsub v0.4s, v0.4s, v1.4s
1889 %res = fsub <4 x float> %op1, %op2
1890 ret <4 x float> %res
; 8 x float = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fsub under ptrue vl8, stored back to %a.
1893 define void @fsub_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
1894 ; CHECK-LABEL: fsub_v8f32:
1896 ; CHECK-NEXT: ptrue p0.s, vl8
1897 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1898 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1899 ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1900 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1902 %op1 = load <8 x float>, ptr %a
1903 %op2 = load <8 x float>, ptr %b
1904 %res = fsub <8 x float> %op1, %op2
1905 store <8 x float> %res, ptr %a
; 16 x float = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl8 halves (upper half at element offset x8 = 8);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl16 operation.
1909 define void @fsub_v16f32(ptr %a, ptr %b) #0 {
1910 ; VBITS_GE_256-LABEL: fsub_v16f32:
1911 ; VBITS_GE_256: // %bb.0:
1912 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1913 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1914 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1915 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1916 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
1917 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
1918 ; VBITS_GE_256-NEXT: fsub z0.s, p0/m, z0.s, z2.s
1919 ; VBITS_GE_256-NEXT: fsub z1.s, p0/m, z1.s, z3.s
1920 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
1921 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
1922 ; VBITS_GE_256-NEXT: ret
1924 ; VBITS_GE_512-LABEL: fsub_v16f32:
1925 ; VBITS_GE_512: // %bb.0:
1926 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1927 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1928 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
1929 ; VBITS_GE_512-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1930 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
1931 ; VBITS_GE_512-NEXT: ret
1932 %op1 = load <16 x float>, ptr %a
1933 %op2 = load <16 x float>, ptr %b
1934 %res = fsub <16 x float> %op1, %op2
1935 store <16 x float> %res, ptr %a
; 32 x float = 1024 bits. vscale_range(8,0) sets a minimum vscale of 8 (at least 1024-bit SVE
; registers); the checks expect one predicated fsub under ptrue vl32, stored back to %a.
1939 define void @fsub_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
1940 ; CHECK-LABEL: fsub_v32f32:
1942 ; CHECK-NEXT: ptrue p0.s, vl32
1943 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1944 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1945 ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1946 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1948 %op1 = load <32 x float>, ptr %a
1949 %op2 = load <32 x float>, ptr %b
1950 %res = fsub <32 x float> %op1, %op2
1951 store <32 x float> %res, ptr %a
; 64 x float = 2048 bits. vscale_range(16,0) sets a minimum vscale of 16 (at least 2048-bit SVE
; registers); the checks expect one predicated fsub under ptrue vl64, stored back to %a.
1955 define void @fsub_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
1956 ; CHECK-LABEL: fsub_v64f32:
1958 ; CHECK-NEXT: ptrue p0.s, vl64
1959 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1960 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
1961 ; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
1962 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
1964 %op1 = load <64 x float>, ptr %a
1965 %op2 = load <64 x float>, ptr %b
1966 %res = fsub <64 x float> %op1, %op2
1967 store <64 x float> %res, ptr %a
1971 ; Don't use SVE for 64-bit vectors.
; <1 x double> is a 64-bit vector: the expected code is the scalar form fsub d0, d0, d1.
1972 define <1 x double> @fsub_v1f64(<1 x double> %op1, <1 x double> %op2) vscale_range(2,0) #0 {
1973 ; CHECK-LABEL: fsub_v1f64:
1975 ; CHECK-NEXT: fsub d0, d0, d1
1977 %res = fsub <1 x double> %op1, %op2
1978 ret <1 x double> %res
1981 ; Don't use SVE for 128-bit vectors.
; <2 x double> is a 128-bit vector: the expected code is one NEON fsub on v0.2d/v1.2d.
1982 define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) vscale_range(2,0) #0 {
1983 ; CHECK-LABEL: fsub_v2f64:
1985 ; CHECK-NEXT: fsub v0.2d, v0.2d, v1.2d
1987 %res = fsub <2 x double> %op1, %op2
1988 ret <2 x double> %res
; 4 x double = 256 bits. vscale_range(2,0) sets a minimum vscale of 2 (at least 256-bit SVE
; registers); the checks expect one predicated fsub under ptrue vl4, stored back to %a.
1991 define void @fsub_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
1992 ; CHECK-LABEL: fsub_v4f64:
1994 ; CHECK-NEXT: ptrue p0.d, vl4
1995 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1996 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
1997 ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
1998 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2000 %op1 = load <4 x double>, ptr %a
2001 %op2 = load <4 x double>, ptr %b
2002 %res = fsub <4 x double> %op1, %op2
2003 store <4 x double> %res, ptr %a
; 8 x double = 512 bits with no vscale_range, so the expected code depends on the RUN line:
; VBITS_GE_256 splits the work into two vl4 halves (upper half at element offset x8 = 4);
; VBITS_GE_512 (the 512- and 2048-bit RUN lines) does it in one vl8 operation.
2007 define void @fsub_v8f64(ptr %a, ptr %b) #0 {
2008 ; VBITS_GE_256-LABEL: fsub_v8f64:
2009 ; VBITS_GE_256: // %bb.0:
2010 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
2011 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
2012 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
2013 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
2014 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
2015 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
2016 ; VBITS_GE_256-NEXT: fsub z0.d, p0/m, z0.d, z2.d
2017 ; VBITS_GE_256-NEXT: fsub z1.d, p0/m, z1.d, z3.d
2018 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
2019 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
2020 ; VBITS_GE_256-NEXT: ret
2022 ; VBITS_GE_512-LABEL: fsub_v8f64:
2023 ; VBITS_GE_512: // %bb.0:
2024 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
2025 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
2026 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
2027 ; VBITS_GE_512-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2028 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
2029 ; VBITS_GE_512-NEXT: ret
2030 %op1 = load <8 x double>, ptr %a
2031 %op2 = load <8 x double>, ptr %b
2032 %res = fsub <8 x double> %op1, %op2
2033 store <8 x double> %res, ptr %a
; Loads <16 x double> from %a and from %b, subtracts the %b value from the %a
; value, and stores the result back through %a. The function is marked
; vscale_range(8,0); the visible checks expect a single predicated SVE fsub
; on .d elements under a `ptrue p0.d, vl16` predicate.
2037 define void @fsub_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
2038 ; CHECK-LABEL: fsub_v16f64:
2040 ; CHECK-NEXT: ptrue p0.d, vl16
2041 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2042 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
2043 ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2044 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2046 %op1 = load <16 x double>, ptr %a
2047 %op2 = load <16 x double>, ptr %b
2048 %res = fsub <16 x double> %op1, %op2
2049 store <16 x double> %res, ptr %a
; Loads <32 x double> from %a and from %b, subtracts the %b value from the %a
; value, and stores the result back through %a. The function is marked
; vscale_range(16,0); the visible checks expect a single predicated SVE fsub
; on .d elements under a `ptrue p0.d, vl32` predicate.
2053 define void @fsub_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
2054 ; CHECK-LABEL: fsub_v32f64:
2056 ; CHECK-NEXT: ptrue p0.d, vl32
2057 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2058 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
2059 ; CHECK-NEXT: fsub z0.d, p0/m, z0.d, z1.d
2060 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2062 %op1 = load <32 x double>, ptr %a
2063 %op2 = load <32 x double>, ptr %b
2064 %res = fsub <32 x double> %op1, %op2
2065 store <32 x double> %res, ptr %a
2073 ; Don't use SVE for 64-bit vectors.
; Applies the llvm.fabs.v4f16 intrinsic to a <4 x half> argument passed
; directly. The visible check expects the non-SVE vector form
; `fabs v0.4h, v0.4h`.
2074 define <4 x half> @fabs_v4f16(<4 x half> %op) vscale_range(2,0) #0 {
2075 ; CHECK-LABEL: fabs_v4f16:
2077 ; CHECK-NEXT: fabs v0.4h, v0.4h
2079 %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op)
2083 ; Don't use SVE for 128-bit vectors.
; Applies the llvm.fabs.v8f16 intrinsic to an <8 x half> argument passed
; directly. The visible check expects the non-SVE vector form
; `fabs v0.8h, v0.8h`.
2084 define <8 x half> @fabs_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
2085 ; CHECK-LABEL: fabs_v8f16:
2087 ; CHECK-NEXT: fabs v0.8h, v0.8h
2089 %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op)
; Loads <16 x half> from %a, applies llvm.fabs.v16f16, and stores the result
; back through %a. The function is marked vscale_range(2,0); the visible
; checks expect a single predicated SVE fabs on .h elements under a
; `ptrue p0.h, vl16` predicate.
2093 define void @fabs_v16f16(ptr %a) vscale_range(2,0) #0 {
2094 ; CHECK-LABEL: fabs_v16f16:
2096 ; CHECK-NEXT: ptrue p0.h, vl16
2097 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
2098 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h
2099 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
2101 %op = load <16 x half>, ptr %a
2102 %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op)
2103 store <16 x half> %res, ptr %a
; Loads <32 x half> from %a, applies llvm.fabs.v32f16, and stores the result
; back through %a. This function carries no vscale_range attribute, so its
; expectations are split by check prefix (selected by the
; -aarch64-sve-vector-bits-min RUN lines at the top of the file).
;  - VBITS_GE_256 prefix - the operation is expected as two vl16 halves. The
;    upper half is addressed with x8 = 16 elements scaled by `lsl #1`, the
;    lower half with the plain base register.
;  - VBITS_GE_512 prefix - one vl32 predicate, one load, one fabs and one
;    store are expected.
2107 define void @fabs_v32f16(ptr %a) #0 {
2108 ; VBITS_GE_256-LABEL: fabs_v32f16:
2109 ; VBITS_GE_256: // %bb.0:
2110 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
2111 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
2112 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
2113 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
2114 ; VBITS_GE_256-NEXT: fabs z0.h, p0/m, z0.h
2115 ; VBITS_GE_256-NEXT: fabs z1.h, p0/m, z1.h
2116 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
2117 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
2118 ; VBITS_GE_256-NEXT: ret
2120 ; VBITS_GE_512-LABEL: fabs_v32f16:
2121 ; VBITS_GE_512: // %bb.0:
2122 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
2123 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
2124 ; VBITS_GE_512-NEXT: fabs z0.h, p0/m, z0.h
2125 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
2126 ; VBITS_GE_512-NEXT: ret
2127 %op = load <32 x half>, ptr %a
2128 %res = call <32 x half> @llvm.fabs.v32f16(<32 x half> %op)
2129 store <32 x half> %res, ptr %a
; Loads <64 x half> from %a, applies llvm.fabs.v64f16, and stores the result
; back through %a. The function is marked vscale_range(8,0); the visible
; checks expect a single predicated SVE fabs on .h elements under a
; `ptrue p0.h, vl64` predicate.
2133 define void @fabs_v64f16(ptr %a) vscale_range(8,0) #0 {
2134 ; CHECK-LABEL: fabs_v64f16:
2136 ; CHECK-NEXT: ptrue p0.h, vl64
2137 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
2138 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h
2139 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
2141 %op = load <64 x half>, ptr %a
2142 %res = call <64 x half> @llvm.fabs.v64f16(<64 x half> %op)
2143 store <64 x half> %res, ptr %a
; Loads <128 x half> from %a, applies llvm.fabs.v128f16, and stores the
; result back through %a. The function is marked vscale_range(16,0); the
; visible checks expect a single predicated SVE fabs on .h elements under a
; `ptrue p0.h, vl128` predicate.
2147 define void @fabs_v128f16(ptr %a) vscale_range(16,0) #0 {
2148 ; CHECK-LABEL: fabs_v128f16:
2150 ; CHECK-NEXT: ptrue p0.h, vl128
2151 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
2152 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h
2153 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
2155 %op = load <128 x half>, ptr %a
2156 %res = call <128 x half> @llvm.fabs.v128f16(<128 x half> %op)
2157 store <128 x half> %res, ptr %a
2161 ; Don't use SVE for 64-bit vectors.
; Applies the llvm.fabs.v2f32 intrinsic to a <2 x float> argument and returns
; the result directly. The visible check expects the non-SVE vector form
; `fabs v0.2s, v0.2s`.
2162 define <2 x float> @fabs_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
2163 ; CHECK-LABEL: fabs_v2f32:
2165 ; CHECK-NEXT: fabs v0.2s, v0.2s
2167 %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op)
2168 ret <2 x float> %res
2171 ; Don't use SVE for 128-bit vectors.
; Applies the llvm.fabs.v4f32 intrinsic to a <4 x float> argument and returns
; the result directly. The visible check expects the non-SVE vector form
; `fabs v0.4s, v0.4s`.
2172 define <4 x float> @fabs_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
2173 ; CHECK-LABEL: fabs_v4f32:
2175 ; CHECK-NEXT: fabs v0.4s, v0.4s
2177 %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op)
2178 ret <4 x float> %res
; Loads <8 x float> from %a, applies llvm.fabs.v8f32, and stores the result
; back through %a. The function is marked vscale_range(2,0); the visible
; checks expect a single predicated SVE fabs on .s elements under a
; `ptrue p0.s, vl8` predicate.
2181 define void @fabs_v8f32(ptr %a) vscale_range(2,0) #0 {
2182 ; CHECK-LABEL: fabs_v8f32:
2184 ; CHECK-NEXT: ptrue p0.s, vl8
2185 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
2186 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s
2187 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
2189 %op = load <8 x float>, ptr %a
2190 %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op)
2191 store <8 x float> %res, ptr %a
; Loads <16 x float> from %a, applies llvm.fabs.v16f32, and stores the result
; back through %a. This function carries no vscale_range attribute, so its
; expectations are split by check prefix (selected by the
; -aarch64-sve-vector-bits-min RUN lines at the top of the file).
;  - VBITS_GE_256 prefix - the operation is expected as two vl8 halves. The
;    upper half is addressed with x8 = 8 elements scaled by `lsl #2`, the
;    lower half with the plain base register.
;  - VBITS_GE_512 prefix - one vl16 predicate, one load, one fabs and one
;    store are expected.
2195 define void @fabs_v16f32(ptr %a) #0 {
2196 ; VBITS_GE_256-LABEL: fabs_v16f32:
2197 ; VBITS_GE_256: // %bb.0:
2198 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
2199 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
2200 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
2201 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
2202 ; VBITS_GE_256-NEXT: fabs z0.s, p0/m, z0.s
2203 ; VBITS_GE_256-NEXT: fabs z1.s, p0/m, z1.s
2204 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
2205 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
2206 ; VBITS_GE_256-NEXT: ret
2208 ; VBITS_GE_512-LABEL: fabs_v16f32:
2209 ; VBITS_GE_512: // %bb.0:
2210 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
2211 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
2212 ; VBITS_GE_512-NEXT: fabs z0.s, p0/m, z0.s
2213 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
2214 ; VBITS_GE_512-NEXT: ret
2215 %op = load <16 x float>, ptr %a
2216 %res = call <16 x float> @llvm.fabs.v16f32(<16 x float> %op)
2217 store <16 x float> %res, ptr %a
; Loads <32 x float> from %a, applies llvm.fabs.v32f32, and stores the result
; back through %a. The function is marked vscale_range(8,0); the visible
; checks expect a single predicated SVE fabs on .s elements under a
; `ptrue p0.s, vl32` predicate.
2221 define void @fabs_v32f32(ptr %a) vscale_range(8,0) #0 {
2222 ; CHECK-LABEL: fabs_v32f32:
2224 ; CHECK-NEXT: ptrue p0.s, vl32
2225 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
2226 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s
2227 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
2229 %op = load <32 x float>, ptr %a
2230 %res = call <32 x float> @llvm.fabs.v32f32(<32 x float> %op)
2231 store <32 x float> %res, ptr %a
; Loads <64 x float> from %a, applies llvm.fabs.v64f32, and stores the result
; back through %a. The function is marked vscale_range(16,0); the visible
; checks expect a single predicated SVE fabs on .s elements under a
; `ptrue p0.s, vl64` predicate.
2235 define void @fabs_v64f32(ptr %a) vscale_range(16,0) #0 {
2236 ; CHECK-LABEL: fabs_v64f32:
2238 ; CHECK-NEXT: ptrue p0.s, vl64
2239 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
2240 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s
2241 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
2243 %op = load <64 x float>, ptr %a
2244 %res = call <64 x float> @llvm.fabs.v64f32(<64 x float> %op)
2245 store <64 x float> %res, ptr %a
2249 ; Don't use SVE for 64-bit vectors.
; Applies the llvm.fabs.v1f64 intrinsic to a <1 x double> argument and
; returns the result directly. The visible check expects the scalar form
; `fabs d0, d0`.
2250 define <1 x double> @fabs_v1f64(<1 x double> %op) vscale_range(2,0) #0 {
2251 ; CHECK-LABEL: fabs_v1f64:
2253 ; CHECK-NEXT: fabs d0, d0
2255 %res = call <1 x double> @llvm.fabs.v1f64(<1 x double> %op)
2256 ret <1 x double> %res
2259 ; Don't use SVE for 128-bit vectors.
; Applies the llvm.fabs.v2f64 intrinsic to a <2 x double> argument and
; returns the result directly. The visible check expects the non-SVE vector
; form `fabs v0.2d, v0.2d`.
2260 define <2 x double> @fabs_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
2261 ; CHECK-LABEL: fabs_v2f64:
2263 ; CHECK-NEXT: fabs v0.2d, v0.2d
2265 %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op)
2266 ret <2 x double> %res
; Loads <4 x double> from %a, applies llvm.fabs.v4f64, and stores the result
; back through %a. The function is marked vscale_range(2,0); the visible
; checks expect a single predicated SVE fabs on .d elements under a
; `ptrue p0.d, vl4` predicate.
2269 define void @fabs_v4f64(ptr %a) vscale_range(2,0) #0 {
2270 ; CHECK-LABEL: fabs_v4f64:
2272 ; CHECK-NEXT: ptrue p0.d, vl4
2273 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2274 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d
2275 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2277 %op = load <4 x double>, ptr %a
2278 %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op)
2279 store <4 x double> %res, ptr %a
; Loads <8 x double> from %a, applies llvm.fabs.v8f64, and stores the result
; back through %a. This function carries no vscale_range attribute, so its
; expectations are split by check prefix (selected by the
; -aarch64-sve-vector-bits-min RUN lines at the top of the file).
;  - VBITS_GE_256 prefix - the operation is expected as two vl4 halves. The
;    upper half is addressed with x8 = 4 elements scaled by `lsl #3`, the
;    lower half with the plain base register.
;  - VBITS_GE_512 prefix - one vl8 predicate, one load, one fabs and one
;    store are expected.
2283 define void @fabs_v8f64(ptr %a) #0 {
2284 ; VBITS_GE_256-LABEL: fabs_v8f64:
2285 ; VBITS_GE_256: // %bb.0:
2286 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
2287 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
2288 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
2289 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
2290 ; VBITS_GE_256-NEXT: fabs z0.d, p0/m, z0.d
2291 ; VBITS_GE_256-NEXT: fabs z1.d, p0/m, z1.d
2292 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
2293 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
2294 ; VBITS_GE_256-NEXT: ret
2296 ; VBITS_GE_512-LABEL: fabs_v8f64:
2297 ; VBITS_GE_512: // %bb.0:
2298 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
2299 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
2300 ; VBITS_GE_512-NEXT: fabs z0.d, p0/m, z0.d
2301 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
2302 ; VBITS_GE_512-NEXT: ret
2303 %op = load <8 x double>, ptr %a
2304 %res = call <8 x double> @llvm.fabs.v8f64(<8 x double> %op)
2305 store <8 x double> %res, ptr %a
; Loads <16 x double> from %a, applies llvm.fabs.v16f64, and stores the
; result back through %a. The function is marked vscale_range(8,0); the
; visible checks expect a single predicated SVE fabs on .d elements under a
; `ptrue p0.d, vl16` predicate.
2309 define void @fabs_v16f64(ptr %a) vscale_range(8,0) #0 {
2310 ; CHECK-LABEL: fabs_v16f64:
2312 ; CHECK-NEXT: ptrue p0.d, vl16
2313 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2314 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d
2315 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2317 %op = load <16 x double>, ptr %a
2318 %res = call <16 x double> @llvm.fabs.v16f64(<16 x double> %op)
2319 store <16 x double> %res, ptr %a
; Loads <32 x double> from %a, applies llvm.fabs.v32f64, and stores the
; result back through %a. The function is marked vscale_range(16,0); the
; visible checks expect a single predicated SVE fabs on .d elements under a
; `ptrue p0.d, vl32` predicate.
2323 define void @fabs_v32f64(ptr %a) vscale_range(16,0) #0 {
2324 ; CHECK-LABEL: fabs_v32f64:
2326 ; CHECK-NEXT: ptrue p0.d, vl32
2327 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
2328 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d
2329 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
2331 %op = load <32 x double>, ptr %a
2332 %res = call <32 x double> @llvm.fabs.v32f64(<32 x double> %op)
2333 store <32 x double> %res, ptr %a
; Attribute group referenced as #0 by the test functions in this file; it
; sets the "target-features" string attribute to "+sve".
2337 attributes #0 = { "target-features"="+sve" }
; Declarations of the three-operand llvm.fma intrinsic, one per fixed-length
; vector type - 4 to 128 x half, 2 to 64 x float, and 1 to 32 x double.
; NOTE(review) - no caller is visible in this part of the file; presumably
; the fma tests earlier in the file use these. Confirm before removing any.
2339 declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>)
2340 declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
2341 declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>)
2342 declare <32 x half> @llvm.fma.v32f16(<32 x half>, <32 x half>, <32 x half>)
2343 declare <64 x half> @llvm.fma.v64f16(<64 x half>, <64 x half>, <64 x half>)
2344 declare <128 x half> @llvm.fma.v128f16(<128 x half>, <128 x half>, <128 x half>)
2345 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
2346 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
2347 declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
2348 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
2349 declare <32 x float> @llvm.fma.v32f32(<32 x float>, <32 x float>, <32 x float>)
2350 declare <64 x float> @llvm.fma.v64f32(<64 x float>, <64 x float>, <64 x float>)
2351 declare <1 x double> @llvm.fma.v1f64(<1 x double>, <1 x double>, <1 x double>)
2352 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
2353 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
2354 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
2355 declare <16 x double> @llvm.fma.v16f64(<16 x double>, <16 x double>, <16 x double>)
2356 declare <32 x double> @llvm.fma.v32f64(<32 x double>, <32 x double>, <32 x double>)
; Declarations of the single-operand llvm.sqrt intrinsic, one per
; fixed-length vector type - 4 to 128 x half, 2 to 64 x float, and 1 to
; 32 x double.
; NOTE(review) - no caller is visible in this part of the file; presumably
; the square-root tests earlier in the file use these. Confirm before
; removing any.
2358 declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
2359 declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
2360 declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
2361 declare <32 x half> @llvm.sqrt.v32f16(<32 x half>)
2362 declare <64 x half> @llvm.sqrt.v64f16(<64 x half>)
2363 declare <128 x half> @llvm.sqrt.v128f16(<128 x half>)
2364 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
2365 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
2366 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
2367 declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
2368 declare <32 x float> @llvm.sqrt.v32f32(<32 x float>)
2369 declare <64 x float> @llvm.sqrt.v64f32(<64 x float>)
2370 declare <1 x double> @llvm.sqrt.v1f64(<1 x double>)
2371 declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
2372 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
2373 declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
2374 declare <16 x double> @llvm.sqrt.v16f64(<16 x double>)
2375 declare <32 x double> @llvm.sqrt.v32f64(<32 x double>)
; Declarations of the single-operand llvm.fabs intrinsic, one per
; fixed-length vector type - 4 to 128 x half, 2 to 64 x float, and 1 to
; 32 x double. Each is called by the fabs_* test function of the matching
; vector type in this file.
2377 declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
2378 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
2379 declare <16 x half> @llvm.fabs.v16f16(<16 x half>)
2380 declare <32 x half> @llvm.fabs.v32f16(<32 x half>)
2381 declare <64 x half> @llvm.fabs.v64f16(<64 x half>)
2382 declare <128 x half> @llvm.fabs.v128f16(<128 x half>)
2383 declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
2384 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
2385 declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
2386 declare <16 x float> @llvm.fabs.v16f32(<16 x float>)
2387 declare <32 x float> @llvm.fabs.v32f32(<32 x float>)
2388 declare <64 x float> @llvm.fabs.v64f32(<64 x float>)
2389 declare <1 x double> @llvm.fabs.v1f64(<1 x double>)
2390 declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
2391 declare <4 x double> @llvm.fabs.v4f64(<4 x double>)
2392 declare <8 x double> @llvm.fabs.v8f64(<8 x double>)
2393 declare <16 x double> @llvm.fabs.v16f64(<16 x double>)
2394 declare <32 x double> @llvm.fabs.v32f64(<32 x double>)