1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
6 target triple = "aarch64-unknown-linux-gnu"
12 ; No single instruction NEON support. Use SVE.
; Ordered reduction (the call carries no fast-math flags): %start is the accumulator
; and the four half lanes are folded in by one FADDA under a vl4 predicate.
13 define half @fadda_v4f16(half %start, <4 x half> %a) vscale_range(1,0) #0 {
14 ; CHECK-LABEL: fadda_v4f16:
16 ; CHECK-NEXT: ptrue p0.h, vl4
17 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
18 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
19 ; CHECK-NEXT: fadda h0, p0, h0, z1.h
20 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
22 %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
26 ; No single instruction NEON support. Use SVE.
; Ordered fadd reduction of <8 x half> seeded with %start: the 128-bit argument register
; is treated as z1 and consumed by one FADDA under a vl8 predicate.
27 define half @fadda_v8f16(half %start, <8 x half> %a) vscale_range(1,0) #0 {
28 ; CHECK-LABEL: fadda_v8f16:
30 ; CHECK-NEXT: ptrue p0.h, vl8
31 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
32 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
33 ; CHECK-NEXT: fadda h0, p0, h0, z1.h
34 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
36 %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers, so the loaded <16 x half>
; fits in one register and is reduced in order by a single FADDA under a vl16 predicate.
40 define half @fadda_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
41 ; CHECK-LABEL: fadda_v16f16:
43 ; CHECK-NEXT: ptrue p0.h, vl16
44 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
45 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
46 ; CHECK-NEXT: fadda h0, p0, h0, z1.h
47 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
49 %op = load <16 x half>, ptr %a
50 %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the 512-bit vector is split
; and two FADDAs are chained, low half ([x0]) first so the element order is preserved;
; from 512 bits up a single FADDA under a vl32 predicate is expected.
54 define half @fadda_v32f16(half %start, ptr %a) #0 {
55 ; VBITS_GE_256-LABEL: fadda_v32f16:
56 ; VBITS_GE_256: // %bb.0:
57 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
58 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
59 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0
60 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
61 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
62 ; VBITS_GE_256-NEXT: fadda h0, p0, h0, z2.h
63 ; VBITS_GE_256-NEXT: fadda h0, p0, h0, z1.h
64 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
65 ; VBITS_GE_256-NEXT: ret
67 ; VBITS_GE_512-LABEL: fadda_v32f16:
68 ; VBITS_GE_512: // %bb.0:
69 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
70 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0
71 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0]
72 ; VBITS_GE_512-NEXT: fadda h0, p0, h0, z1.h
73 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
74 ; VBITS_GE_512-NEXT: ret
75 %op = load <32 x half>, ptr %a
76 %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: one LD1H and one ordered
; FADDA under a vl64 predicate cover all 64 lanes.
80 define half @fadda_v64f16(half %start, ptr %a) vscale_range(8,0) #0 {
81 ; CHECK-LABEL: fadda_v64f16:
83 ; CHECK-NEXT: ptrue p0.h, vl64
84 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
85 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
86 ; CHECK-NEXT: fadda h0, p0, h0, z1.h
87 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
89 %op = load <64 x half>, ptr %a
90 %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: one LD1H and one ordered FADDA
; under a vl128 predicate cover all 128 lanes.
94 define half @fadda_v128f16(half %start, ptr %a) vscale_range(16,0) #0 {
95 ; CHECK-LABEL: fadda_v128f16:
97 ; CHECK-NEXT: ptrue p0.h, vl128
98 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0
99 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
100 ; CHECK-NEXT: fadda h0, p0, h0, z1.h
101 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
103 %op = load <128 x half>, ptr %a
104 %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
108 ; No single instruction NEON support. Use SVE.
; Ordered fadd reduction of <2 x float> seeded with %start: one FADDA under a vl2
; predicate, with the 64-bit argument register treated as z1.
109 define float @fadda_v2f32(float %start, <2 x float> %a) vscale_range(1,0) #0 {
110 ; CHECK-LABEL: fadda_v2f32:
112 ; CHECK-NEXT: ptrue p0.s, vl2
113 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
114 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
115 ; CHECK-NEXT: fadda s0, p0, s0, z1.s
116 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
118 %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
122 ; No single instruction NEON support. Use SVE.
; Ordered fadd reduction of <4 x float> seeded with %start: one FADDA under a vl4
; predicate, with the 128-bit argument register treated as z1.
123 define float @fadda_v4f32(float %start, <4 x float> %a) vscale_range(1,0) #0 {
124 ; CHECK-LABEL: fadda_v4f32:
126 ; CHECK-NEXT: ptrue p0.s, vl4
127 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
128 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
129 ; CHECK-NEXT: fadda s0, p0, s0, z1.s
130 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
132 %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers, so the loaded <8 x float>
; fits in one register and is reduced in order by a single FADDA under a vl8 predicate.
136 define float @fadda_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
137 ; CHECK-LABEL: fadda_v8f32:
139 ; CHECK-NEXT: ptrue p0.s, vl8
140 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
141 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
142 ; CHECK-NEXT: fadda s0, p0, s0, z1.s
143 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
145 %op = load <8 x float>, ptr %a
146 %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the 512-bit vector is split
; and two FADDAs are chained, low half ([x0]) first so the element order is preserved;
; from 512 bits up a single FADDA under a vl16 predicate is expected.
150 define float @fadda_v16f32(float %start, ptr %a) #0 {
151 ; VBITS_GE_256-LABEL: fadda_v16f32:
152 ; VBITS_GE_256: // %bb.0:
153 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
154 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
155 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0
156 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
157 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
158 ; VBITS_GE_256-NEXT: fadda s0, p0, s0, z2.s
159 ; VBITS_GE_256-NEXT: fadda s0, p0, s0, z1.s
160 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
161 ; VBITS_GE_256-NEXT: ret
163 ; VBITS_GE_512-LABEL: fadda_v16f32:
164 ; VBITS_GE_512: // %bb.0:
165 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
166 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0
167 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
168 ; VBITS_GE_512-NEXT: fadda s0, p0, s0, z1.s
169 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
170 ; VBITS_GE_512-NEXT: ret
171 %op = load <16 x float>, ptr %a
172 %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: one LD1W and one ordered
; FADDA under a vl32 predicate cover all 32 lanes.
176 define float @fadda_v32f32(float %start, ptr %a) vscale_range(8,0) #0 {
177 ; CHECK-LABEL: fadda_v32f32:
179 ; CHECK-NEXT: ptrue p0.s, vl32
180 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
181 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
182 ; CHECK-NEXT: fadda s0, p0, s0, z1.s
183 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
185 %op = load <32 x float>, ptr %a
186 %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: one LD1W and one ordered FADDA
; under a vl64 predicate cover all 64 lanes.
190 define float @fadda_v64f32(float %start, ptr %a) vscale_range(16,0) #0 {
191 ; CHECK-LABEL: fadda_v64f32:
193 ; CHECK-NEXT: ptrue p0.s, vl64
194 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0
195 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
196 ; CHECK-NEXT: fadda s0, p0, s0, z1.s
197 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
199 %op = load <64 x float>, ptr %a
200 %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
204 ; Don't use SVE for 1 element vectors.
; An ordered reduction over a single lane is just %start + lane, so the expected output
; is one scalar FADD rather than an SVE FADDA.
205 define double @fadda_v1f64(double %start, <1 x double> %a) vscale_range(1,0) #0 {
206 ; CHECK-LABEL: fadda_v1f64:
208 ; CHECK-NEXT: fadd d0, d0, d1
210 %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
214 ; No single instruction NEON support. Use SVE.
; Ordered fadd reduction of <2 x double> seeded with %start: one FADDA under a vl2
; predicate, with the 128-bit argument register treated as z1.
215 define double @fadda_v2f64(double %start, <2 x double> %a) vscale_range(1,0) #0 {
216 ; CHECK-LABEL: fadda_v2f64:
218 ; CHECK-NEXT: ptrue p0.d, vl2
219 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
220 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
221 ; CHECK-NEXT: fadda d0, p0, d0, z1.d
222 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
224 %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers, so the loaded <4 x double>
; fits in one register and is reduced in order by a single FADDA under a vl4 predicate.
228 define double @fadda_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
229 ; CHECK-LABEL: fadda_v4f64:
231 ; CHECK-NEXT: ptrue p0.d, vl4
232 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
233 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
234 ; CHECK-NEXT: fadda d0, p0, d0, z1.d
235 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
237 %op = load <4 x double>, ptr %a
238 %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the 512-bit vector is split
; and two FADDAs are chained, low half ([x0]) first so the element order is preserved;
; from 512 bits up a single FADDA under a vl8 predicate is expected.
242 define double @fadda_v8f64(double %start, ptr %a) #0 {
243 ; VBITS_GE_256-LABEL: fadda_v8f64:
244 ; VBITS_GE_256: // %bb.0:
245 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
246 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
247 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0
248 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
249 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
250 ; VBITS_GE_256-NEXT: fadda d0, p0, d0, z2.d
251 ; VBITS_GE_256-NEXT: fadda d0, p0, d0, z1.d
252 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
253 ; VBITS_GE_256-NEXT: ret
255 ; VBITS_GE_512-LABEL: fadda_v8f64:
256 ; VBITS_GE_512: // %bb.0:
257 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
258 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0
259 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0]
260 ; VBITS_GE_512-NEXT: fadda d0, p0, d0, z1.d
261 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
262 ; VBITS_GE_512-NEXT: ret
263 %op = load <8 x double>, ptr %a
264 %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: one LD1D and one ordered
; FADDA under a vl16 predicate cover all 16 lanes.
268 define double @fadda_v16f64(double %start, ptr %a) vscale_range(8,0) #0 {
269 ; CHECK-LABEL: fadda_v16f64:
271 ; CHECK-NEXT: ptrue p0.d, vl16
272 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
273 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
274 ; CHECK-NEXT: fadda d0, p0, d0, z1.d
275 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
277 %op = load <16 x double>, ptr %a
278 %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: one LD1D and one ordered FADDA
; under a vl32 predicate cover all 32 lanes.
282 define double @fadda_v32f64(double %start, ptr %a) vscale_range(16,0) #0 {
283 ; CHECK-LABEL: fadda_v32f64:
285 ; CHECK-NEXT: ptrue p0.d, vl32
286 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
287 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
288 ; CHECK-NEXT: fadda d0, p0, d0, z1.d
289 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
291 %op = load <32 x double>, ptr %a
292 %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
300 ; No single instruction NEON support for 4 element vectors.
; The 'fast' flag on the call permits reassociation: an unordered FADDV (vl4) reduces the
; vector, then a scalar FADD adds %start.
301 define half @faddv_v4f16(half %start, <4 x half> %a) vscale_range(2,0) #0 {
302 ; CHECK-LABEL: faddv_v4f16:
304 ; CHECK-NEXT: ptrue p0.h, vl4
305 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
306 ; CHECK-NEXT: faddv h1, p0, z1.h
307 ; CHECK-NEXT: fadd h0, h0, h1
309 %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
313 ; No single instruction NEON support for 8 element vectors.
; The 'fast' flag on the call permits reassociation: an unordered FADDV (vl8) reduces the
; vector, then a scalar FADD adds %start.
314 define half @faddv_v8f16(half %start, <8 x half> %a) vscale_range(2,0) #0 {
315 ; CHECK-LABEL: faddv_v8f16:
317 ; CHECK-NEXT: ptrue p0.h, vl8
318 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
319 ; CHECK-NEXT: faddv h1, p0, z1.h
320 ; CHECK-NEXT: fadd h0, h0, h1
322 %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
; Fast (reassociable) reduction of a loaded <16 x half>. vscale_range(2,0) guarantees
; 256-bit registers, so one FADDV (vl16) plus a scalar FADD of %start is expected.
326 define half @faddv_v16f16(half %start, ptr %a) vscale_range(2,0) #0 {
327 ; CHECK-LABEL: faddv_v16f16:
329 ; CHECK-NEXT: ptrue p0.h, vl16
330 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
331 ; CHECK-NEXT: faddv h1, p0, z1.h
332 ; CHECK-NEXT: fadd h0, h0, h1
334 %op = load <16 x half>, ptr %a
335 %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are first
; combined by a predicated vector FADD (allowed by 'fast') and then reduced by one FADDV;
; from 512 bits up a single FADDV (vl32) is used. %start is added last by a scalar FADD.
339 define half @faddv_v32f16(half %start, ptr %a) #0 {
340 ; VBITS_GE_256-LABEL: faddv_v32f16:
341 ; VBITS_GE_256: // %bb.0:
342 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
343 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
344 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1]
345 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
346 ; VBITS_GE_256-NEXT: fadd z1.h, p0/m, z1.h, z2.h
347 ; VBITS_GE_256-NEXT: faddv h1, p0, z1.h
348 ; VBITS_GE_256-NEXT: fadd h0, h0, h1
349 ; VBITS_GE_256-NEXT: ret
351 ; VBITS_GE_512-LABEL: faddv_v32f16:
352 ; VBITS_GE_512: // %bb.0:
353 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
354 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0]
355 ; VBITS_GE_512-NEXT: faddv h1, p0, z1.h
356 ; VBITS_GE_512-NEXT: fadd h0, h0, h1
357 ; VBITS_GE_512-NEXT: ret
358 %op = load <32 x half>, ptr %a
359 %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
; Fast reduction of a loaded <64 x half>. vscale_range(8,0) guarantees at least 1024-bit
; registers, so one FADDV (vl64) plus a scalar FADD of %start is expected.
363 define half @faddv_v64f16(half %start, ptr %a) vscale_range(8,0) #0 {
364 ; CHECK-LABEL: faddv_v64f16:
366 ; CHECK-NEXT: ptrue p0.h, vl64
367 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
368 ; CHECK-NEXT: faddv h1, p0, z1.h
369 ; CHECK-NEXT: fadd h0, h0, h1
371 %op = load <64 x half>, ptr %a
372 %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
; Fast reduction of a loaded <128 x half>. vscale_range(16,0) guarantees 2048-bit
; registers, so one FADDV (vl128) plus a scalar FADD of %start is expected.
376 define half @faddv_v128f16(half %start, ptr %a) vscale_range(16,0) #0 {
377 ; CHECK-LABEL: faddv_v128f16:
379 ; CHECK-NEXT: ptrue p0.h, vl128
380 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
381 ; CHECK-NEXT: faddv h1, p0, z1.h
382 ; CHECK-NEXT: fadd h0, h0, h1
384 %op = load <128 x half>, ptr %a
385 %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
389 ; Don't use SVE for 2 element vectors.
; NEON pairwise FADDP sums the two lanes; a scalar FADD then adds %start.
390 define float @faddv_v2f32(float %start, <2 x float> %a) vscale_range(2,0) #0 {
391 ; CHECK-LABEL: faddv_v2f32:
393 ; CHECK-NEXT: faddp s1, v1.2s
394 ; CHECK-NEXT: fadd s0, s0, s1
396 %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
400 ; No single instruction NEON support for 4 element vectors.
; The 'fast' flag on the call permits reassociation: an unordered FADDV (vl4) reduces the
; vector, then a scalar FADD adds %start.
401 define float @faddv_v4f32(float %start, <4 x float> %a) vscale_range(2,0) #0 {
402 ; CHECK-LABEL: faddv_v4f32:
404 ; CHECK-NEXT: ptrue p0.s, vl4
405 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
406 ; CHECK-NEXT: faddv s1, p0, z1.s
407 ; CHECK-NEXT: fadd s0, s0, s1
409 %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
; Fast (reassociable) reduction of a loaded <8 x float>. vscale_range(2,0) guarantees
; 256-bit registers, so one FADDV (vl8) plus a scalar FADD of %start is expected.
413 define float @faddv_v8f32(float %start, ptr %a) vscale_range(2,0) #0 {
414 ; CHECK-LABEL: faddv_v8f32:
416 ; CHECK-NEXT: ptrue p0.s, vl8
417 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
418 ; CHECK-NEXT: faddv s1, p0, z1.s
419 ; CHECK-NEXT: fadd s0, s0, s1
421 %op = load <8 x float>, ptr %a
422 %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are first
; combined by a predicated vector FADD (allowed by 'fast') and then reduced by one FADDV;
; from 512 bits up a single FADDV (vl16) is used. %start is added last by a scalar FADD.
426 define float @faddv_v16f32(float %start, ptr %a) #0 {
427 ; VBITS_GE_256-LABEL: faddv_v16f32:
428 ; VBITS_GE_256: // %bb.0:
429 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
430 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
431 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2]
432 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
433 ; VBITS_GE_256-NEXT: fadd z1.s, p0/m, z1.s, z2.s
434 ; VBITS_GE_256-NEXT: faddv s1, p0, z1.s
435 ; VBITS_GE_256-NEXT: fadd s0, s0, s1
436 ; VBITS_GE_256-NEXT: ret
438 ; VBITS_GE_512-LABEL: faddv_v16f32:
439 ; VBITS_GE_512: // %bb.0:
440 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
441 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0]
442 ; VBITS_GE_512-NEXT: faddv s1, p0, z1.s
443 ; VBITS_GE_512-NEXT: fadd s0, s0, s1
444 ; VBITS_GE_512-NEXT: ret
445 %op = load <16 x float>, ptr %a
446 %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
; Fast reduction of a loaded <32 x float>. vscale_range(8,0) guarantees at least 1024-bit
; registers, so one FADDV (vl32) plus a scalar FADD of %start is expected.
450 define float @faddv_v32f32(float %start, ptr %a) vscale_range(8,0) #0 {
451 ; CHECK-LABEL: faddv_v32f32:
453 ; CHECK-NEXT: ptrue p0.s, vl32
454 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
455 ; CHECK-NEXT: faddv s1, p0, z1.s
456 ; CHECK-NEXT: fadd s0, s0, s1
458 %op = load <32 x float>, ptr %a
459 %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
; Fast reduction of a loaded <64 x float>. vscale_range(16,0) guarantees 2048-bit
; registers, so one FADDV (vl64) plus a scalar FADD of %start is expected.
463 define float @faddv_v64f32(float %start, ptr %a) vscale_range(16,0) #0 {
464 ; CHECK-LABEL: faddv_v64f32:
466 ; CHECK-NEXT: ptrue p0.s, vl64
467 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
468 ; CHECK-NEXT: faddv s1, p0, z1.s
469 ; CHECK-NEXT: fadd s0, s0, s1
471 %op = load <64 x float>, ptr %a
472 %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
476 ; Don't use SVE for 1 element vectors.
; With a single lane the reduction is one scalar FADD of %start and that lane.
477 define double @faddv_v1f64(double %start, <1 x double> %a) vscale_range(2,0) #0 {
478 ; CHECK-LABEL: faddv_v1f64:
480 ; CHECK-NEXT: fadd d0, d0, d1
482 %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
486 ; Don't use SVE for 2 element vectors.
; NEON pairwise FADDP sums the two lanes; a scalar FADD then adds %start.
487 define double @faddv_v2f64(double %start, <2 x double> %a) vscale_range(2,0) #0 {
488 ; CHECK-LABEL: faddv_v2f64:
490 ; CHECK-NEXT: faddp d1, v1.2d
491 ; CHECK-NEXT: fadd d0, d0, d1
493 %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
; Fast (reassociable) reduction of a loaded <4 x double>. vscale_range(2,0) guarantees
; 256-bit registers, so one FADDV (vl4) plus a scalar FADD of %start is expected.
497 define double @faddv_v4f64(double %start, ptr %a) vscale_range(2,0) #0 {
498 ; CHECK-LABEL: faddv_v4f64:
500 ; CHECK-NEXT: ptrue p0.d, vl4
501 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
502 ; CHECK-NEXT: faddv d1, p0, z1.d
503 ; CHECK-NEXT: fadd d0, d0, d1
505 %op = load <4 x double>, ptr %a
506 %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are first
; combined by a predicated vector FADD (allowed by 'fast') and then reduced by one FADDV;
; from 512 bits up a single FADDV (vl8) is used. %start is added last by a scalar FADD.
510 define double @faddv_v8f64(double %start, ptr %a) #0 {
511 ; VBITS_GE_256-LABEL: faddv_v8f64:
512 ; VBITS_GE_256: // %bb.0:
513 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
514 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
515 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3]
516 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
517 ; VBITS_GE_256-NEXT: fadd z1.d, p0/m, z1.d, z2.d
518 ; VBITS_GE_256-NEXT: faddv d1, p0, z1.d
519 ; VBITS_GE_256-NEXT: fadd d0, d0, d1
520 ; VBITS_GE_256-NEXT: ret
522 ; VBITS_GE_512-LABEL: faddv_v8f64:
523 ; VBITS_GE_512: // %bb.0:
524 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
525 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0]
526 ; VBITS_GE_512-NEXT: faddv d1, p0, z1.d
527 ; VBITS_GE_512-NEXT: fadd d0, d0, d1
528 ; VBITS_GE_512-NEXT: ret
529 %op = load <8 x double>, ptr %a
530 %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
; Fast reduction of a loaded <16 x double>. vscale_range(8,0) guarantees at least 1024-bit
; registers, so one FADDV (vl16) plus a scalar FADD of %start is expected.
534 define double @faddv_v16f64(double %start, ptr %a) vscale_range(8,0) #0 {
535 ; CHECK-LABEL: faddv_v16f64:
537 ; CHECK-NEXT: ptrue p0.d, vl16
538 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
539 ; CHECK-NEXT: faddv d1, p0, z1.d
540 ; CHECK-NEXT: fadd d0, d0, d1
542 %op = load <16 x double>, ptr %a
543 %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
; Fast reduction of a loaded <32 x double>. vscale_range(16,0) guarantees 2048-bit
; registers, so one FADDV (vl32) plus a scalar FADD of %start is expected.
547 define double @faddv_v32f64(double %start, ptr %a) vscale_range(16,0) #0 {
548 ; CHECK-LABEL: faddv_v32f64:
550 ; CHECK-NEXT: ptrue p0.d, vl32
551 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
552 ; CHECK-NEXT: faddv d1, p0, z1.d
553 ; CHECK-NEXT: fadd d0, d0, d1
555 %op = load <32 x double>, ptr %a
556 %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
564 ; Don't use SVE for 64-bit f16 vectors.
; The expected output is the NEON across-lanes FMAXNMV on v0.4h, not an SVE reduction.
565 define half @fmaxv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
566 ; CHECK-LABEL: fmaxv_v4f16:
568 ; CHECK-NEXT: fmaxnmv h0, v0.4h
570 %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
574 ; Don't use SVE for 128-bit f16 vectors.
; The expected output is the NEON across-lanes FMAXNMV on v0.8h, not an SVE reduction.
575 define half @fmaxv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
576 ; CHECK-LABEL: fmaxv_v8f16:
578 ; CHECK-NEXT: fmaxnmv h0, v0.8h
580 %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers: the loaded <16 x half> is
; reduced by one SVE FMAXNMV under a vl16 predicate.
584 define half @fmaxv_v16f16(ptr %a) vscale_range(2,0) #0 {
585 ; CHECK-LABEL: fmaxv_v16f16:
587 ; CHECK-NEXT: ptrue p0.h, vl16
588 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
589 ; CHECK-NEXT: fmaxnmv h0, p0, z0.h
590 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
592 %op = load <16 x half>, ptr %a
593 %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are combined
; lane-wise by a predicated FMAXNM and the result reduced by one FMAXNMV; from 512 bits up
; a single FMAXNMV under a vl32 predicate is expected.
597 define half @fmaxv_v32f16(ptr %a) #0 {
598 ; VBITS_GE_256-LABEL: fmaxv_v32f16:
599 ; VBITS_GE_256: // %bb.0:
600 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
601 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
602 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
603 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
604 ; VBITS_GE_256-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h
605 ; VBITS_GE_256-NEXT: fmaxnmv h0, p0, z0.h
606 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
607 ; VBITS_GE_256-NEXT: ret
609 ; VBITS_GE_512-LABEL: fmaxv_v32f16:
610 ; VBITS_GE_512: // %bb.0:
611 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
612 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
613 ; VBITS_GE_512-NEXT: fmaxnmv h0, p0, z0.h
614 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
615 ; VBITS_GE_512-NEXT: ret
616 %op = load <32 x half>, ptr %a
617 %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: the loaded <64 x half> is
; reduced by one SVE FMAXNMV under a vl64 predicate.
621 define half @fmaxv_v64f16(ptr %a) vscale_range(8,0) #0 {
622 ; CHECK-LABEL: fmaxv_v64f16:
624 ; CHECK-NEXT: ptrue p0.h, vl64
625 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
626 ; CHECK-NEXT: fmaxnmv h0, p0, z0.h
627 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
629 %op = load <64 x half>, ptr %a
630 %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: the loaded <128 x half> is
; reduced by one SVE FMAXNMV under a vl128 predicate.
634 define half @fmaxv_v128f16(ptr %a) vscale_range(16,0) #0 {
635 ; CHECK-LABEL: fmaxv_v128f16:
637 ; CHECK-NEXT: ptrue p0.h, vl128
638 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
639 ; CHECK-NEXT: fmaxnmv h0, p0, z0.h
640 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
642 %op = load <128 x half>, ptr %a
643 %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
647 ; Don't use SVE for 64-bit f32 vectors.
; With only two lanes the NEON pairwise FMAXNMP produces the result directly.
648 define float @fmaxv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
649 ; CHECK-LABEL: fmaxv_v2f32:
651 ; CHECK-NEXT: fmaxnmp s0, v0.2s
653 %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
657 ; Don't use SVE for 128-bit f32 vectors.
; The NEON across-lanes FMAXNMV on v0.4s is expected.
658 define float @fmaxv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
659 ; CHECK-LABEL: fmaxv_v4f32:
661 ; CHECK-NEXT: fmaxnmv s0, v0.4s
663 %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers: the loaded <8 x float> is
; reduced by one SVE FMAXNMV under a vl8 predicate.
667 define float @fmaxv_v8f32(ptr %a) vscale_range(2,0) #0 {
668 ; CHECK-LABEL: fmaxv_v8f32:
670 ; CHECK-NEXT: ptrue p0.s, vl8
671 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
672 ; CHECK-NEXT: fmaxnmv s0, p0, z0.s
673 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
675 %op = load <8 x float>, ptr %a
676 %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are combined
; lane-wise by a predicated FMAXNM and the result reduced by one FMAXNMV; from 512 bits up
; a single FMAXNMV under a vl16 predicate is expected.
680 define float @fmaxv_v16f32(ptr %a) #0 {
681 ; VBITS_GE_256-LABEL: fmaxv_v16f32:
682 ; VBITS_GE_256: // %bb.0:
683 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
684 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
685 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
686 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
687 ; VBITS_GE_256-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s
688 ; VBITS_GE_256-NEXT: fmaxnmv s0, p0, z0.s
689 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
690 ; VBITS_GE_256-NEXT: ret
692 ; VBITS_GE_512-LABEL: fmaxv_v16f32:
693 ; VBITS_GE_512: // %bb.0:
694 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
695 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
696 ; VBITS_GE_512-NEXT: fmaxnmv s0, p0, z0.s
697 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
698 ; VBITS_GE_512-NEXT: ret
699 %op = load <16 x float>, ptr %a
700 %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: the loaded <32 x float> is
; reduced by one SVE FMAXNMV under a vl32 predicate.
704 define float @fmaxv_v32f32(ptr %a) vscale_range(8,0) #0 {
705 ; CHECK-LABEL: fmaxv_v32f32:
707 ; CHECK-NEXT: ptrue p0.s, vl32
708 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
709 ; CHECK-NEXT: fmaxnmv s0, p0, z0.s
710 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
712 %op = load <32 x float>, ptr %a
713 %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: the loaded <64 x float> is
; reduced by one SVE FMAXNMV under a vl64 predicate.
717 define float @fmaxv_v64f32(ptr %a) vscale_range(16,0) #0 {
718 ; CHECK-LABEL: fmaxv_v64f32:
720 ; CHECK-NEXT: ptrue p0.s, vl64
721 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
722 ; CHECK-NEXT: fmaxnmv s0, p0, z0.s
723 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
725 %op = load <64 x float>, ptr %a
726 %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
730 ; Nothing to do for single element vectors.
; The only lane is already the maximum, so no max instruction is expected here.
731 define double @fmaxv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
732 ; CHECK-LABEL: fmaxv_v1f64:
735 %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
739 ; Don't use SVE for 128-bit f64 vectors.
; With only two lanes the NEON pairwise FMAXNMP produces the result directly.
740 define double @fmaxv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
741 ; CHECK-LABEL: fmaxv_v2f64:
743 ; CHECK-NEXT: fmaxnmp d0, v0.2d
745 %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers: the loaded <4 x double> is
; reduced by one SVE FMAXNMV under a vl4 predicate.
749 define double @fmaxv_v4f64(ptr %a) vscale_range(2,0) #0 {
750 ; CHECK-LABEL: fmaxv_v4f64:
752 ; CHECK-NEXT: ptrue p0.d, vl4
753 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
754 ; CHECK-NEXT: fmaxnmv d0, p0, z0.d
755 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
757 %op = load <4 x double>, ptr %a
758 %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are combined
; lane-wise by a predicated FMAXNM and the result reduced by one FMAXNMV; from 512 bits up
; a single FMAXNMV under a vl8 predicate is expected.
762 define double @fmaxv_v8f64(ptr %a) #0 {
763 ; VBITS_GE_256-LABEL: fmaxv_v8f64:
764 ; VBITS_GE_256: // %bb.0:
765 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
766 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
767 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
768 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
769 ; VBITS_GE_256-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d
770 ; VBITS_GE_256-NEXT: fmaxnmv d0, p0, z0.d
771 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
772 ; VBITS_GE_256-NEXT: ret
774 ; VBITS_GE_512-LABEL: fmaxv_v8f64:
775 ; VBITS_GE_512: // %bb.0:
776 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
777 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
778 ; VBITS_GE_512-NEXT: fmaxnmv d0, p0, z0.d
779 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
780 ; VBITS_GE_512-NEXT: ret
781 %op = load <8 x double>, ptr %a
782 %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: the loaded <16 x double>
; is reduced by one SVE FMAXNMV under a vl16 predicate.
786 define double @fmaxv_v16f64(ptr %a) vscale_range(8,0) #0 {
787 ; CHECK-LABEL: fmaxv_v16f64:
789 ; CHECK-NEXT: ptrue p0.d, vl16
790 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
791 ; CHECK-NEXT: fmaxnmv d0, p0, z0.d
792 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
794 %op = load <16 x double>, ptr %a
795 %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: the loaded <32 x double> is
; reduced by one SVE FMAXNMV under a vl32 predicate.
799 define double @fmaxv_v32f64(ptr %a) vscale_range(16,0) #0 {
800 ; CHECK-LABEL: fmaxv_v32f64:
802 ; CHECK-NEXT: ptrue p0.d, vl32
803 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
804 ; CHECK-NEXT: fmaxnmv d0, p0, z0.d
805 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
807 %op = load <32 x double>, ptr %a
808 %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
816 ; Don't use SVE for 64-bit f16 vectors.
; The expected output is the NEON across-lanes FMINNMV on v0.4h, not an SVE reduction.
817 define half @fminv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
818 ; CHECK-LABEL: fminv_v4f16:
820 ; CHECK-NEXT: fminnmv h0, v0.4h
822 %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
826 ; Don't use SVE for 128-bit f16 vectors.
; The expected output is the NEON across-lanes FMINNMV on v0.8h, not an SVE reduction.
827 define half @fminv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
828 ; CHECK-LABEL: fminv_v8f16:
830 ; CHECK-NEXT: fminnmv h0, v0.8h
832 %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers: the loaded <16 x half> is
; reduced by one SVE FMINNMV under a vl16 predicate.
836 define half @fminv_v16f16(ptr %a) vscale_range(2,0) #0 {
837 ; CHECK-LABEL: fminv_v16f16:
839 ; CHECK-NEXT: ptrue p0.h, vl16
840 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
841 ; CHECK-NEXT: fminnmv h0, p0, z0.h
842 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
844 %op = load <16 x half>, ptr %a
845 %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are combined
; lane-wise by a predicated FMINNM and the result reduced by one FMINNMV; from 512 bits up
; a single FMINNMV under a vl32 predicate is expected.
849 define half @fminv_v32f16(ptr %a) #0 {
850 ; VBITS_GE_256-LABEL: fminv_v32f16:
851 ; VBITS_GE_256: // %bb.0:
852 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
853 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
854 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
855 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
856 ; VBITS_GE_256-NEXT: fminnm z0.h, p0/m, z0.h, z1.h
857 ; VBITS_GE_256-NEXT: fminnmv h0, p0, z0.h
858 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
859 ; VBITS_GE_256-NEXT: ret
861 ; VBITS_GE_512-LABEL: fminv_v32f16:
862 ; VBITS_GE_512: // %bb.0:
863 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
864 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
865 ; VBITS_GE_512-NEXT: fminnmv h0, p0, z0.h
866 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
867 ; VBITS_GE_512-NEXT: ret
868 %op = load <32 x half>, ptr %a
869 %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: the loaded <64 x half> is
; reduced by one SVE FMINNMV under a vl64 predicate.
873 define half @fminv_v64f16(ptr %a) vscale_range(8,0) #0 {
874 ; CHECK-LABEL: fminv_v64f16:
876 ; CHECK-NEXT: ptrue p0.h, vl64
877 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
878 ; CHECK-NEXT: fminnmv h0, p0, z0.h
879 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
881 %op = load <64 x half>, ptr %a
882 %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: the loaded <128 x half> is
; reduced by one SVE FMINNMV under a vl128 predicate.
886 define half @fminv_v128f16(ptr %a) vscale_range(16,0) #0 {
887 ; CHECK-LABEL: fminv_v128f16:
889 ; CHECK-NEXT: ptrue p0.h, vl128
890 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
891 ; CHECK-NEXT: fminnmv h0, p0, z0.h
892 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
894 %op = load <128 x half>, ptr %a
895 %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
899 ; Don't use SVE for 64-bit f32 vectors.
; With only two lanes the NEON pairwise FMINNMP produces the result directly.
900 define float @fminv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
901 ; CHECK-LABEL: fminv_v2f32:
903 ; CHECK-NEXT: fminnmp s0, v0.2s
905 %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
909 ; Don't use SVE for 128-bit f32 vectors.
; The NEON across-lanes FMINNMV on v0.4s is expected.
910 define float @fminv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
911 ; CHECK-LABEL: fminv_v4f32:
913 ; CHECK-NEXT: fminnmv s0, v0.4s
915 %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
; vscale_range(2,0) guarantees at least 256-bit SVE registers: the loaded <8 x float> is
; reduced by one SVE FMINNMV under a vl8 predicate.
919 define float @fminv_v8f32(ptr %a) vscale_range(2,0) #0 {
920 ; CHECK-LABEL: fminv_v8f32:
922 ; CHECK-NEXT: ptrue p0.s, vl8
923 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
924 ; CHECK-NEXT: fminnmv s0, p0, z0.s
925 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
927 %op = load <8 x float>, ptr %a
928 %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
; No vscale_range attribute, so the lowering follows the RUN line's
; -aarch64-sve-vector-bits-min value. With a 256-bit minimum the two halves are combined
; lane-wise by a predicated FMINNM and the result reduced by one FMINNMV; from 512 bits up
; a single FMINNMV under a vl16 predicate is expected.
932 define float @fminv_v16f32(ptr %a) #0 {
933 ; VBITS_GE_256-LABEL: fminv_v16f32:
934 ; VBITS_GE_256: // %bb.0:
935 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
936 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
937 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
938 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
939 ; VBITS_GE_256-NEXT: fminnm z0.s, p0/m, z0.s, z1.s
940 ; VBITS_GE_256-NEXT: fminnmv s0, p0, z0.s
941 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
942 ; VBITS_GE_256-NEXT: ret
944 ; VBITS_GE_512-LABEL: fminv_v16f32:
945 ; VBITS_GE_512: // %bb.0:
946 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
947 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
948 ; VBITS_GE_512-NEXT: fminnmv s0, p0, z0.s
949 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
950 ; VBITS_GE_512-NEXT: ret
951 %op = load <16 x float>, ptr %a
952 %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
; vscale_range(8,0) guarantees at least 1024-bit SVE registers: the loaded <32 x float> is
; reduced by one SVE FMINNMV under a vl32 predicate.
956 define float @fminv_v32f32(ptr %a) vscale_range(8,0) #0 {
957 ; CHECK-LABEL: fminv_v32f32:
959 ; CHECK-NEXT: ptrue p0.s, vl32
960 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
961 ; CHECK-NEXT: fminnmv s0, p0, z0.s
962 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
964 %op = load <32 x float>, ptr %a
965 %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
; vscale_range(16,0) guarantees 2048-bit SVE registers: the loaded <64 x float> is
; reduced by one SVE FMINNMV under a vl64 predicate.
969 define float @fminv_v64f32(ptr %a) vscale_range(16,0) #0 {
970 ; CHECK-LABEL: fminv_v64f32:
972 ; CHECK-NEXT: ptrue p0.s, vl64
973 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
974 ; CHECK-NEXT: fminnmv s0, p0, z0.s
975 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
977 %op = load <64 x float>, ptr %a
978 %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
982 ; Nothing to do for single element vectors.
; The only lane is already the minimum, so no min instruction is expected here.
983 define double @fminv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
984 ; CHECK-LABEL: fminv_v1f64:
987 %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
991 ; Don't use SVE for 128-bit f64 vectors.
; With only two lanes the NEON pairwise FMINNMP produces the result directly.
992 define double @fminv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
993 ; CHECK-LABEL: fminv_v2f64:
995 ; CHECK-NEXT: fminnmp d0, v0.2d
997 %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
; fmin reduction of a <4 x double> loaded from %a, with vscale_range(2,0).
; The checks expect one vl4-predicated ld1d followed by a single fminnmv.
1001 define double @fminv_v4f64(ptr %a) vscale_range(2,0) #0 {
1002 ; CHECK-LABEL: fminv_v4f64:
1004 ; CHECK-NEXT: ptrue p0.d, vl4
1005 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1006 ; CHECK-NEXT: fminnmv d0, p0, z0.d
1007 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1009 %op = load <4 x double>, ptr %a
1010 %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
; fmin reduction of a <8 x double> loaded from %a. The function carries no
; vscale_range attribute and is checked separately per RUN prefix:
;  - VBITS_GE_256 expects two vl4 loads (the second at element offset 4)
;    merged by a predicated fminnm before the final fminnmv;
;  - VBITS_GE_512 expects a single vl8 load followed directly by fminnmv.
1014 define double @fminv_v8f64(ptr %a) #0 {
1015 ; VBITS_GE_256-LABEL: fminv_v8f64:
1016 ; VBITS_GE_256: // %bb.0:
1017 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1018 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1019 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1020 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1021 ; VBITS_GE_256-NEXT: fminnm z0.d, p0/m, z0.d, z1.d
1022 ; VBITS_GE_256-NEXT: fminnmv d0, p0, z0.d
1023 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
1024 ; VBITS_GE_256-NEXT: ret
1026 ; VBITS_GE_512-LABEL: fminv_v8f64:
1027 ; VBITS_GE_512: // %bb.0:
1028 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1029 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1030 ; VBITS_GE_512-NEXT: fminnmv d0, p0, z0.d
1031 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
1032 ; VBITS_GE_512-NEXT: ret
1033 %op = load <8 x double>, ptr %a
1034 %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
; fmin reduction of a <16 x double> loaded from %a, with vscale_range(8,0).
; The checks expect one vl16-predicated ld1d followed by a single fminnmv.
1038 define double @fminv_v16f64(ptr %a) vscale_range(8,0) #0 {
1039 ; CHECK-LABEL: fminv_v16f64:
1041 ; CHECK-NEXT: ptrue p0.d, vl16
1042 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1043 ; CHECK-NEXT: fminnmv d0, p0, z0.d
1044 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1046 %op = load <16 x double>, ptr %a
1047 %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
; fmin reduction of a <32 x double> loaded from %a, with vscale_range(16,0).
; The checks expect one vl32-predicated ld1d followed by a single fminnmv.
1051 define double @fminv_v32f64(ptr %a) vscale_range(16,0) #0 {
1052 ; CHECK-LABEL: fminv_v32f64:
1054 ; CHECK-NEXT: ptrue p0.d, vl32
1055 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1056 ; CHECK-NEXT: fminnmv d0, p0, z0.d
1057 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1059 %op = load <32 x double>, ptr %a
1060 %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
; fmaximum reduction of a <4 x half> argument. The checks expect a single
; fmaxv on v0.4h, with no predicate or z-register involved.
1068 define half @fmaximumv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
1069 ; CHECK-LABEL: fmaximumv_v4f16:
1071 ; CHECK-NEXT: fmaxv h0, v0.4h
1073 %res = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> %a)
; fmaximum reduction of an <8 x half> argument. The checks expect a single
; fmaxv on v0.8h, with no predicate or z-register involved.
1077 define half @fmaximumv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
1078 ; CHECK-LABEL: fmaximumv_v8f16:
1080 ; CHECK-NEXT: fmaxv h0, v0.8h
1082 %res = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> %a)
; fmaximum reduction of a <16 x half> loaded from %a, with vscale_range(2,0).
; The checks expect one vl16-predicated ld1h followed by a single fmaxv.
1086 define half @fmaximumv_v16f16(ptr %a) vscale_range(2,0) #0 {
1087 ; CHECK-LABEL: fmaximumv_v16f16:
1089 ; CHECK-NEXT: ptrue p0.h, vl16
1090 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1091 ; CHECK-NEXT: fmaxv h0, p0, z0.h
1092 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
1094 %op = load <16 x half>, ptr %a
1095 %res = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> %op)
; fmaximum reduction of a <32 x half> loaded from %a. The function carries no
; vscale_range attribute and is checked separately per RUN prefix:
;  - VBITS_GE_256 expects two vl16 loads (the second at element offset 16)
;    merged by a predicated fmax before the final fmaxv;
;  - VBITS_GE_512 expects a single vl32 load followed directly by fmaxv.
1099 define half @fmaximumv_v32f16(ptr %a) #0 {
1100 ; VBITS_GE_256-LABEL: fmaximumv_v32f16:
1101 ; VBITS_GE_256: // %bb.0:
1102 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1103 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1104 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1105 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
1106 ; VBITS_GE_256-NEXT: fmax z0.h, p0/m, z0.h, z1.h
1107 ; VBITS_GE_256-NEXT: fmaxv h0, p0, z0.h
1108 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
1109 ; VBITS_GE_256-NEXT: ret
1111 ; VBITS_GE_512-LABEL: fmaximumv_v32f16:
1112 ; VBITS_GE_512: // %bb.0:
1113 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1114 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1115 ; VBITS_GE_512-NEXT: fmaxv h0, p0, z0.h
1116 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
1117 ; VBITS_GE_512-NEXT: ret
1118 %op = load <32 x half>, ptr %a
1119 %res = call half @llvm.vector.reduce.fmaximum.v32f16(<32 x half> %op)
; fmaximum reduction of a <64 x half> loaded from %a, with vscale_range(8,0).
; The checks expect one vl64-predicated ld1h followed by a single fmaxv.
1123 define half @fmaximumv_v64f16(ptr %a) vscale_range(8,0) #0 {
1124 ; CHECK-LABEL: fmaximumv_v64f16:
1126 ; CHECK-NEXT: ptrue p0.h, vl64
1127 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1128 ; CHECK-NEXT: fmaxv h0, p0, z0.h
1129 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
1131 %op = load <64 x half>, ptr %a
1132 %res = call half @llvm.vector.reduce.fmaximum.v64f16(<64 x half> %op)
; fmaximum reduction of a <128 x half> loaded from %a, with vscale_range(16,0).
; The checks expect one vl128-predicated ld1h followed by a single fmaxv.
1136 define half @fmaximumv_v128f16(ptr %a) vscale_range(16,0) #0 {
1137 ; CHECK-LABEL: fmaximumv_v128f16:
1139 ; CHECK-NEXT: ptrue p0.h, vl128
1140 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1141 ; CHECK-NEXT: fmaxv h0, p0, z0.h
1142 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
1144 %op = load <128 x half>, ptr %a
1145 %res = call half @llvm.vector.reduce.fmaximum.v128f16(<128 x half> %op)
1149 ; Don't use SVE for 64-bit f32 vectors.
; The checks expect the <2 x float> argument to be reduced by one pairwise
; fmaxp on v0.2s, with no predicate or z-register involved.
1150 define float @fmaximumv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
1151 ; CHECK-LABEL: fmaximumv_v2f32:
1153 ; CHECK-NEXT: fmaxp s0, v0.2s
1155 %res = call float @llvm.vector.reduce.fmaximum.v2f32(<2 x float> %a)
1159 ; Don't use SVE for 128-bit f32 vectors.
; The checks expect the <4 x float> argument to be reduced by a single fmaxv
; on v0.4s, with no predicate or z-register involved.
1160 define float @fmaximumv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
1161 ; CHECK-LABEL: fmaximumv_v4f32:
1163 ; CHECK-NEXT: fmaxv s0, v0.4s
1165 %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %a)
; fmaximum reduction of an <8 x float> loaded from %a, with vscale_range(2,0).
; The checks expect one vl8-predicated ld1w followed by a single fmaxv.
1169 define float @fmaximumv_v8f32(ptr %a) vscale_range(2,0) #0 {
1170 ; CHECK-LABEL: fmaximumv_v8f32:
1172 ; CHECK-NEXT: ptrue p0.s, vl8
1173 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1174 ; CHECK-NEXT: fmaxv s0, p0, z0.s
1175 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
1177 %op = load <8 x float>, ptr %a
1178 %res = call float @llvm.vector.reduce.fmaximum.v8f32(<8 x float> %op)
; fmaximum reduction of a <16 x float> loaded from %a. The function carries no
; vscale_range attribute and is checked separately per RUN prefix:
;  - VBITS_GE_256 expects two vl8 loads (the second at element offset 8)
;    merged by a predicated fmax before the final fmaxv;
;  - VBITS_GE_512 expects a single vl16 load followed directly by fmaxv.
1182 define float @fmaximumv_v16f32(ptr %a) #0 {
1183 ; VBITS_GE_256-LABEL: fmaximumv_v16f32:
1184 ; VBITS_GE_256: // %bb.0:
1185 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1186 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1187 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1188 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1189 ; VBITS_GE_256-NEXT: fmax z0.s, p0/m, z0.s, z1.s
1190 ; VBITS_GE_256-NEXT: fmaxv s0, p0, z0.s
1191 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
1192 ; VBITS_GE_256-NEXT: ret
1194 ; VBITS_GE_512-LABEL: fmaximumv_v16f32:
1195 ; VBITS_GE_512: // %bb.0:
1196 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1197 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1198 ; VBITS_GE_512-NEXT: fmaxv s0, p0, z0.s
1199 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
1200 ; VBITS_GE_512-NEXT: ret
1201 %op = load <16 x float>, ptr %a
1202 %res = call float @llvm.vector.reduce.fmaximum.v16f32(<16 x float> %op)
; fmaximum reduction of a <32 x float> loaded from %a, with vscale_range(8,0).
; The checks expect one vl32-predicated ld1w followed by a single fmaxv.
1206 define float @fmaximumv_v32f32(ptr %a) vscale_range(8,0) #0 {
1207 ; CHECK-LABEL: fmaximumv_v32f32:
1209 ; CHECK-NEXT: ptrue p0.s, vl32
1210 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1211 ; CHECK-NEXT: fmaxv s0, p0, z0.s
1212 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
1214 %op = load <32 x float>, ptr %a
1215 %res = call float @llvm.vector.reduce.fmaximum.v32f32(<32 x float> %op)
; fmaximum reduction of a <64 x float> loaded from %a, with vscale_range(16,0).
; The checks expect one vl64-predicated ld1w followed by a single fmaxv.
1219 define float @fmaximumv_v64f32(ptr %a) vscale_range(16,0) #0 {
1220 ; CHECK-LABEL: fmaximumv_v64f32:
1222 ; CHECK-NEXT: ptrue p0.s, vl64
1223 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1224 ; CHECK-NEXT: fmaxv s0, p0, z0.s
1225 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
1227 %op = load <64 x float>, ptr %a
1228 %res = call float @llvm.vector.reduce.fmaximum.v64f32(<64 x float> %op)
1232 ; Nothing to do for single element vectors.
; The <1 x double> argument is passed straight to llvm.vector.reduce.fmaximum;
; no reduction instruction appears among the checks shown for this function.
1233 define double @fmaximumv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
1234 ; CHECK-LABEL: fmaximumv_v1f64:
1237 %res = call double @llvm.vector.reduce.fmaximum.v1f64(<1 x double> %a)
1241 ; Don't use SVE for 128-bit f64 vectors.
; The checks expect the <2 x double> argument to be reduced by one pairwise
; fmaxp on v0.2d, with no predicate or z-register involved.
1242 define double @fmaximumv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
1243 ; CHECK-LABEL: fmaximumv_v2f64:
1245 ; CHECK-NEXT: fmaxp d0, v0.2d
1247 %res = call double @llvm.vector.reduce.fmaximum.v2f64(<2 x double> %a)
; fmaximum reduction of a <4 x double> loaded from %a, with vscale_range(2,0).
; The checks expect one vl4-predicated ld1d followed by a single fmaxv.
1251 define double @fmaximumv_v4f64(ptr %a) vscale_range(2,0) #0 {
1252 ; CHECK-LABEL: fmaximumv_v4f64:
1254 ; CHECK-NEXT: ptrue p0.d, vl4
1255 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1256 ; CHECK-NEXT: fmaxv d0, p0, z0.d
1257 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1259 %op = load <4 x double>, ptr %a
1260 %res = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %op)
; fmaximum reduction of an <8 x double> loaded from %a. The function carries no
; vscale_range attribute and is checked separately per RUN prefix:
;  - VBITS_GE_256 expects two vl4 loads (the second at element offset 4)
;    merged by a predicated fmax before the final fmaxv;
;  - VBITS_GE_512 expects a single vl8 load followed directly by fmaxv.
1264 define double @fmaximumv_v8f64(ptr %a) #0 {
1265 ; VBITS_GE_256-LABEL: fmaximumv_v8f64:
1266 ; VBITS_GE_256: // %bb.0:
1267 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1268 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1269 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1270 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1271 ; VBITS_GE_256-NEXT: fmax z0.d, p0/m, z0.d, z1.d
1272 ; VBITS_GE_256-NEXT: fmaxv d0, p0, z0.d
1273 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
1274 ; VBITS_GE_256-NEXT: ret
1276 ; VBITS_GE_512-LABEL: fmaximumv_v8f64:
1277 ; VBITS_GE_512: // %bb.0:
1278 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1279 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1280 ; VBITS_GE_512-NEXT: fmaxv d0, p0, z0.d
1281 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
1282 ; VBITS_GE_512-NEXT: ret
1283 %op = load <8 x double>, ptr %a
1284 %res = call double @llvm.vector.reduce.fmaximum.v8f64(<8 x double> %op)
; fmaximum reduction of a <16 x double> loaded from %a, with vscale_range(8,0).
; The checks expect one vl16-predicated ld1d followed by a single fmaxv.
1288 define double @fmaximumv_v16f64(ptr %a) vscale_range(8,0) #0 {
1289 ; CHECK-LABEL: fmaximumv_v16f64:
1291 ; CHECK-NEXT: ptrue p0.d, vl16
1292 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1293 ; CHECK-NEXT: fmaxv d0, p0, z0.d
1294 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1296 %op = load <16 x double>, ptr %a
1297 %res = call double @llvm.vector.reduce.fmaximum.v16f64(<16 x double> %op)
; fmaximum reduction of a <32 x double> loaded from %a, with vscale_range(16,0).
; The checks expect one vl32-predicated ld1d followed by a single fmaxv.
1301 define double @fmaximumv_v32f64(ptr %a) vscale_range(16,0) #0 {
1302 ; CHECK-LABEL: fmaximumv_v32f64:
1304 ; CHECK-NEXT: ptrue p0.d, vl32
1305 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1306 ; CHECK-NEXT: fmaxv d0, p0, z0.d
1307 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1309 %op = load <32 x double>, ptr %a
1310 %res = call double @llvm.vector.reduce.fmaximum.v32f64(<32 x double> %op)
; fminimum reduction of a <4 x half> argument. The checks expect a single
; fminv on v0.4h, with no predicate or z-register involved.
1318 define half @fminimumv_v4f16(<4 x half> %a) vscale_range(2,0) #0 {
1319 ; CHECK-LABEL: fminimumv_v4f16:
1321 ; CHECK-NEXT: fminv h0, v0.4h
1323 %res = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> %a)
; fminimum reduction of an <8 x half> argument. The checks expect a single
; fminv on v0.8h, with no predicate or z-register involved.
1327 define half @fminimumv_v8f16(<8 x half> %a) vscale_range(2,0) #0 {
1328 ; CHECK-LABEL: fminimumv_v8f16:
1330 ; CHECK-NEXT: fminv h0, v0.8h
1332 %res = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> %a)
; fminimum reduction of a <16 x half> loaded from %a, with vscale_range(2,0).
; The checks expect one vl16-predicated ld1h followed by a single fminv.
1336 define half @fminimumv_v16f16(ptr %a) vscale_range(2,0) #0 {
1337 ; CHECK-LABEL: fminimumv_v16f16:
1339 ; CHECK-NEXT: ptrue p0.h, vl16
1340 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1341 ; CHECK-NEXT: fminv h0, p0, z0.h
1342 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
1344 %op = load <16 x half>, ptr %a
1345 %res = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> %op)
; fminimum reduction of a <32 x half> loaded from %a. The function carries no
; vscale_range attribute and is checked separately per RUN prefix:
;  - VBITS_GE_256 expects two vl16 loads (the second at element offset 16)
;    merged by a predicated fmin before the final fminv;
;  - VBITS_GE_512 expects a single vl32 load followed directly by fminv.
1349 define half @fminimumv_v32f16(ptr %a) #0 {
1350 ; VBITS_GE_256-LABEL: fminimumv_v32f16:
1351 ; VBITS_GE_256: // %bb.0:
1352 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
1353 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
1354 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
1355 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
1356 ; VBITS_GE_256-NEXT: fmin z0.h, p0/m, z0.h, z1.h
1357 ; VBITS_GE_256-NEXT: fminv h0, p0, z0.h
1358 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 killed $z0
1359 ; VBITS_GE_256-NEXT: ret
1361 ; VBITS_GE_512-LABEL: fminimumv_v32f16:
1362 ; VBITS_GE_512: // %bb.0:
1363 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
1364 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
1365 ; VBITS_GE_512-NEXT: fminv h0, p0, z0.h
1366 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 killed $z0
1367 ; VBITS_GE_512-NEXT: ret
1368 %op = load <32 x half>, ptr %a
1369 %res = call half @llvm.vector.reduce.fminimum.v32f16(<32 x half> %op)
; fminimum reduction of a <64 x half> loaded from %a, with vscale_range(8,0).
; The checks expect one vl64-predicated ld1h followed by a single fminv.
1373 define half @fminimumv_v64f16(ptr %a) vscale_range(8,0) #0 {
1374 ; CHECK-LABEL: fminimumv_v64f16:
1376 ; CHECK-NEXT: ptrue p0.h, vl64
1377 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1378 ; CHECK-NEXT: fminv h0, p0, z0.h
1379 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
1381 %op = load <64 x half>, ptr %a
1382 %res = call half @llvm.vector.reduce.fminimum.v64f16(<64 x half> %op)
; fminimum reduction of a <128 x half> loaded from %a, with vscale_range(16,0).
; The checks expect one vl128-predicated ld1h followed by a single fminv.
1386 define half @fminimumv_v128f16(ptr %a) vscale_range(16,0) #0 {
1387 ; CHECK-LABEL: fminimumv_v128f16:
1389 ; CHECK-NEXT: ptrue p0.h, vl128
1390 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
1391 ; CHECK-NEXT: fminv h0, p0, z0.h
1392 ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0
1394 %op = load <128 x half>, ptr %a
1395 %res = call half @llvm.vector.reduce.fminimum.v128f16(<128 x half> %op)
1399 ; Don't use SVE for 64-bit f32 vectors.
; The checks expect the <2 x float> argument to be reduced by one pairwise
; fminp on v0.2s, with no predicate or z-register involved.
1400 define float @fminimumv_v2f32(<2 x float> %a) vscale_range(2,0) #0 {
1401 ; CHECK-LABEL: fminimumv_v2f32:
1403 ; CHECK-NEXT: fminp s0, v0.2s
1405 %res = call float @llvm.vector.reduce.fminimum.v2f32(<2 x float> %a)
1409 ; Don't use SVE for 128-bit f32 vectors.
; The checks expect the <4 x float> argument to be reduced by a single fminv
; on v0.4s, with no predicate or z-register involved.
1410 define float @fminimumv_v4f32(<4 x float> %a) vscale_range(2,0) #0 {
1411 ; CHECK-LABEL: fminimumv_v4f32:
1413 ; CHECK-NEXT: fminv s0, v0.4s
1415 %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %a)
; fminimum reduction of an <8 x float> loaded from %a, with vscale_range(2,0).
; The checks expect one vl8-predicated ld1w followed by a single fminv.
1419 define float @fminimumv_v8f32(ptr %a) vscale_range(2,0) #0 {
1420 ; CHECK-LABEL: fminimumv_v8f32:
1422 ; CHECK-NEXT: ptrue p0.s, vl8
1423 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1424 ; CHECK-NEXT: fminv s0, p0, z0.s
1425 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
1427 %op = load <8 x float>, ptr %a
1428 %res = call float @llvm.vector.reduce.fminimum.v8f32(<8 x float> %op)
; fminimum reduction of a <16 x float> loaded from %a. The function carries no
; vscale_range attribute and is checked separately per RUN prefix:
;  - VBITS_GE_256 expects two vl8 loads (the second at element offset 8)
;    merged by a predicated fmin before the final fminv;
;  - VBITS_GE_512 expects a single vl16 load followed directly by fminv.
1432 define float @fminimumv_v16f32(ptr %a) #0 {
1433 ; VBITS_GE_256-LABEL: fminimumv_v16f32:
1434 ; VBITS_GE_256: // %bb.0:
1435 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
1436 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1437 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
1438 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
1439 ; VBITS_GE_256-NEXT: fmin z0.s, p0/m, z0.s, z1.s
1440 ; VBITS_GE_256-NEXT: fminv s0, p0, z0.s
1441 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 killed $z0
1442 ; VBITS_GE_256-NEXT: ret
1444 ; VBITS_GE_512-LABEL: fminimumv_v16f32:
1445 ; VBITS_GE_512: // %bb.0:
1446 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
1447 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
1448 ; VBITS_GE_512-NEXT: fminv s0, p0, z0.s
1449 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 killed $z0
1450 ; VBITS_GE_512-NEXT: ret
1451 %op = load <16 x float>, ptr %a
1452 %res = call float @llvm.vector.reduce.fminimum.v16f32(<16 x float> %op)
; fminimum reduction of a <32 x float> loaded from %a, with vscale_range(8,0).
; The checks expect one vl32-predicated ld1w followed by a single fminv.
1456 define float @fminimumv_v32f32(ptr %a) vscale_range(8,0) #0 {
1457 ; CHECK-LABEL: fminimumv_v32f32:
1459 ; CHECK-NEXT: ptrue p0.s, vl32
1460 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1461 ; CHECK-NEXT: fminv s0, p0, z0.s
1462 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
1464 %op = load <32 x float>, ptr %a
1465 %res = call float @llvm.vector.reduce.fminimum.v32f32(<32 x float> %op)
; fminimum reduction of a <64 x float> loaded from %a, with vscale_range(16,0).
; The checks expect one vl64-predicated ld1w followed by a single fminv.
1469 define float @fminimumv_v64f32(ptr %a) vscale_range(16,0) #0 {
1470 ; CHECK-LABEL: fminimumv_v64f32:
1472 ; CHECK-NEXT: ptrue p0.s, vl64
1473 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
1474 ; CHECK-NEXT: fminv s0, p0, z0.s
1475 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0
1477 %op = load <64 x float>, ptr %a
1478 %res = call float @llvm.vector.reduce.fminimum.v64f32(<64 x float> %op)
1482 ; Nothing to do for single element vectors.
; The <1 x double> argument is passed straight to llvm.vector.reduce.fminimum;
; no reduction instruction appears among the checks shown for this function.
1483 define double @fminimumv_v1f64(<1 x double> %a) vscale_range(2,0) #0 {
1484 ; CHECK-LABEL: fminimumv_v1f64:
1487 %res = call double @llvm.vector.reduce.fminimum.v1f64(<1 x double> %a)
1491 ; Don't use SVE for 128-bit f64 vectors.
; The checks expect the <2 x double> argument to be reduced by one pairwise
; fminp on v0.2d, with no predicate or z-register involved.
1492 define double @fminimumv_v2f64(<2 x double> %a) vscale_range(2,0) #0 {
1493 ; CHECK-LABEL: fminimumv_v2f64:
1495 ; CHECK-NEXT: fminp d0, v0.2d
1497 %res = call double @llvm.vector.reduce.fminimum.v2f64(<2 x double> %a)
; fminimum reduction of a <4 x double> loaded from %a, with vscale_range(2,0).
; The checks expect one vl4-predicated ld1d followed by a single fminv.
1501 define double @fminimumv_v4f64(ptr %a) vscale_range(2,0) #0 {
1502 ; CHECK-LABEL: fminimumv_v4f64:
1504 ; CHECK-NEXT: ptrue p0.d, vl4
1505 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1506 ; CHECK-NEXT: fminv d0, p0, z0.d
1507 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1509 %op = load <4 x double>, ptr %a
1510 %res = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %op)
; fminimum reduction of an <8 x double> loaded from %a. The function carries no
; vscale_range attribute and is checked separately per RUN prefix:
;  - VBITS_GE_256 expects two vl4 loads (the second at element offset 4)
;    merged by a predicated fmin before the final fminv;
;  - VBITS_GE_512 expects a single vl8 load followed directly by fminv.
1514 define double @fminimumv_v8f64(ptr %a) #0 {
1515 ; VBITS_GE_256-LABEL: fminimumv_v8f64:
1516 ; VBITS_GE_256: // %bb.0:
1517 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
1518 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1519 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1520 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
1521 ; VBITS_GE_256-NEXT: fmin z0.d, p0/m, z0.d, z1.d
1522 ; VBITS_GE_256-NEXT: fminv d0, p0, z0.d
1523 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 killed $z0
1524 ; VBITS_GE_256-NEXT: ret
1526 ; VBITS_GE_512-LABEL: fminimumv_v8f64:
1527 ; VBITS_GE_512: // %bb.0:
1528 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
1529 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
1530 ; VBITS_GE_512-NEXT: fminv d0, p0, z0.d
1531 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 killed $z0
1532 ; VBITS_GE_512-NEXT: ret
1533 %op = load <8 x double>, ptr %a
1534 %res = call double @llvm.vector.reduce.fminimum.v8f64(<8 x double> %op)
; fminimum reduction of a <16 x double> loaded from %a, with vscale_range(8,0).
; The checks expect one vl16-predicated ld1d followed by a single fminv.
1538 define double @fminimumv_v16f64(ptr %a) vscale_range(8,0) #0 {
1539 ; CHECK-LABEL: fminimumv_v16f64:
1541 ; CHECK-NEXT: ptrue p0.d, vl16
1542 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1543 ; CHECK-NEXT: fminv d0, p0, z0.d
1544 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1546 %op = load <16 x double>, ptr %a
1547 %res = call double @llvm.vector.reduce.fminimum.v16f64(<16 x double> %op)
; fminimum reduction of a <32 x double> loaded from %a, with vscale_range(16,0).
; The checks expect one vl32-predicated ld1d followed by a single fminv.
1551 define double @fminimumv_v32f64(ptr %a) vscale_range(16,0) #0 {
1552 ; CHECK-LABEL: fminimumv_v32f64:
1554 ; CHECK-NEXT: ptrue p0.d, vl32
1555 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
1556 ; CHECK-NEXT: fminv d0, p0, z0.d
1557 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1559 %op = load <32 x double>, ptr %a
1560 %res = call double @llvm.vector.reduce.fminimum.v32f64(<32 x double> %op)
; Attribute group referenced as #0 by the test functions in this file; it sets
; the "target-features" string attribute to "+sve".
1564 attributes #0 = { "target-features"="+sve" }
; Declarations of llvm.vector.reduce.fadd, one per element type and vector
; width. Each takes a scalar first operand followed by the vector operand and
; returns a scalar of the element type.
1566 declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
1567 declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
1568 declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
1569 declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
1570 declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
1571 declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)
1573 declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
1574 declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
1575 declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
1576 declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
1577 declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
1578 declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)
1580 declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
1581 declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
1582 declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
1583 declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
1584 declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
1585 declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)
; Declarations of llvm.vector.reduce.fmax, one per element type and vector
; width. Each takes a single vector operand and returns a scalar of the
; element type.
1587 declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
1588 declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
1589 declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
1590 declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
1591 declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
1592 declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)
1594 declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
1595 declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
1596 declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
1597 declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
1598 declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
1599 declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)
1601 declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
1602 declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
1603 declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
1604 declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
1605 declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
1606 declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)
; Declarations of llvm.vector.reduce.fmin, one per element type and vector
; width. Each takes a single vector operand and returns a scalar of the
; element type.
1608 declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
1609 declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
1610 declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
1611 declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
1612 declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
1613 declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)
1615 declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
1616 declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
1617 declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
1618 declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
1619 declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
1620 declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)
1622 declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
1623 declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
1624 declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
1625 declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
1626 declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
1627 declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)
; Declarations of llvm.vector.reduce.fmaximum, one per element type and vector
; width. Each takes a single vector operand and returns a scalar of the
; element type.
1629 declare half @llvm.vector.reduce.fmaximum.v4f16(<4 x half>)
1630 declare half @llvm.vector.reduce.fmaximum.v8f16(<8 x half>)
1631 declare half @llvm.vector.reduce.fmaximum.v16f16(<16 x half>)
1632 declare half @llvm.vector.reduce.fmaximum.v32f16(<32 x half>)
1633 declare half @llvm.vector.reduce.fmaximum.v64f16(<64 x half>)
1634 declare half @llvm.vector.reduce.fmaximum.v128f16(<128 x half>)
1636 declare float @llvm.vector.reduce.fmaximum.v2f32(<2 x float>)
1637 declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>)
1638 declare float @llvm.vector.reduce.fmaximum.v8f32(<8 x float>)
1639 declare float @llvm.vector.reduce.fmaximum.v16f32(<16 x float>)
1640 declare float @llvm.vector.reduce.fmaximum.v32f32(<32 x float>)
1641 declare float @llvm.vector.reduce.fmaximum.v64f32(<64 x float>)
1643 declare double @llvm.vector.reduce.fmaximum.v1f64(<1 x double>)
1644 declare double @llvm.vector.reduce.fmaximum.v2f64(<2 x double>)
1645 declare double @llvm.vector.reduce.fmaximum.v4f64(<4 x double>)
1646 declare double @llvm.vector.reduce.fmaximum.v8f64(<8 x double>)
1647 declare double @llvm.vector.reduce.fmaximum.v16f64(<16 x double>)
1648 declare double @llvm.vector.reduce.fmaximum.v32f64(<32 x double>)
; Declarations of llvm.vector.reduce.fminimum, one per element type and vector
; width. Each takes a single vector operand and returns a scalar of the
; element type.
1650 declare half @llvm.vector.reduce.fminimum.v4f16(<4 x half>)
1651 declare half @llvm.vector.reduce.fminimum.v8f16(<8 x half>)
1652 declare half @llvm.vector.reduce.fminimum.v16f16(<16 x half>)
1653 declare half @llvm.vector.reduce.fminimum.v32f16(<32 x half>)
1654 declare half @llvm.vector.reduce.fminimum.v64f16(<64 x half>)
1655 declare half @llvm.vector.reduce.fminimum.v128f16(<128 x half>)
1657 declare float @llvm.vector.reduce.fminimum.v2f32(<2 x float>)
1658 declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>)
1659 declare float @llvm.vector.reduce.fminimum.v8f32(<8 x float>)
1660 declare float @llvm.vector.reduce.fminimum.v16f32(<16 x float>)
1661 declare float @llvm.vector.reduce.fminimum.v32f32(<32 x float>)
1662 declare float @llvm.vector.reduce.fminimum.v64f32(<64 x float>)
1664 declare double @llvm.vector.reduce.fminimum.v1f64(<1 x double>)
1665 declare double @llvm.vector.reduce.fminimum.v2f64(<2 x double>)
1666 declare double @llvm.vector.reduce.fminimum.v4f64(<4 x double>)
1667 declare double @llvm.vector.reduce.fminimum.v8f64(<8 x double>)
1668 declare double @llvm.vector.reduce.fminimum.v16f64(<16 x double>)
1669 declare double @llvm.vector.reduce.fminimum.v32f64(<32 x double>)