; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
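
; Each RUN line pins a different minimum SVE register size via
; -aarch64-sve-vector-bits-min and selects the FileCheck prefixes valid at that
; size. Prefixes accumulate, so a 1024-bit run also verifies the VBITS_GE_512
; patterns. VBYTES is the byte width of the largest power-of-two vector length
; the minimum guarantees (a 384-bit minimum still only guarantees 256-bit
; vectors, hence VBYTES=32).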

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; FADDA
;
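
; fadda is SVE's strictly-ordered floating-point add reduction. The plain
; (non-fast) llvm.vector.reduce.fadd calls below must accumulate in element
; order, which NEON cannot do in a single instruction, so SVE is used even
; for NEON-sized vectors.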

; No single instruction NEON support. Use SVE.
define half @fadda_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: fadda_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define half @fadda_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: fadda_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: fadda h0, [[PG]], h0, z1.h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @fadda_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: fadda_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @fadda_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: fadda_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[LO]].h
; VBITS_EQ_256-NEXT: fadda h0, [[PG]], h0, [[HI]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}
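
; Note on the VBITS_EQ_256 checks above: with 256-bit registers a 512-bit
; operand is legalised as two halves. The high half is loaded with reg+reg
; addressing, scaling the element count by the element size (lsl #1 for
; halves, lsl #2 for words, lsl #3 for doubles), and the two fadda
; instructions stay chained through h0 to preserve the strict accumulation
; order. The same pattern repeats for the wider types below.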

define half @fadda_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: fadda_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @fadda_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: fadda_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda h0, [[PG]], h0, [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: fadda_v2f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl2
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define float @fadda_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: fadda_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: fadda s0, [[PG]], s0, z1.s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @fadda_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: fadda_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @fadda_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: fadda_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[LO]].s
; VBITS_EQ_256-NEXT: fadda s0, [[PG]], s0, [[HI]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @fadda_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: fadda_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @fadda_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: fadda_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda s0, [[PG]], s0, [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: fadda_v1f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl1
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; No single instruction NEON support. Use SVE.
define double @fadda_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: fadda_v2f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl2
; CHECK-NEXT: fadda d0, [[PG]], d0, z1.d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @fadda_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: fadda_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @fadda_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: fadda_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[LO]].d
; VBITS_EQ_256-NEXT: fadda d0, [[PG]], d0, [[HI]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @fadda_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: fadda_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @fadda_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: fadda_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fadda d0, [[PG]], d0, [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FADDV
;
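
; The faddv tests mark their reductions 'fast': reassociation is what allows
; the compiler to use SVE's unordered faddv (or NEON pairwise adds) instead
; of the ordered fadda, with the %start operand folded in by a final scalar
; fadd.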

; No single instruction NEON support for 4 element vectors.
define half @faddv_v4f16(half %start, <4 x half> %a) #0 {
; CHECK-LABEL: faddv_v4f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl4
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v4f16(half %start, <4 x half> %a)
  ret half %res
}

; No single instruction NEON support for 8 element vectors.
define half @faddv_v8f16(half %start, <8 x half> %a) #0 {
; CHECK-LABEL: faddv_v8f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl8
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], z1.h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast half @llvm.vector.reduce.fadd.v8f16(half %start, <8 x half> %a)
  ret half %res
}

define half @faddv_v16f16(half %start, <16 x half>* %a) #0 {
; CHECK-LABEL: faddv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; CHECK-NEXT: fadd h0, h0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v16f16(half %start, <16 x half> %op)
  ret half %res
}

define half @faddv_v32f16(half %start, <32 x half>* %a) #0 {
; CHECK-LABEL: faddv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: faddv [[RDX:h[0-9]+]], [[PG]], [[ADD]].h
; VBITS_EQ_256-DAG: fadd h0, h0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v32f16(half %start, <32 x half> %op)
  ret half %res
}

define half @faddv_v64f16(half %start, <64 x half>* %a) #0 {
; CHECK-LABEL: faddv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v64f16(half %start, <64 x half> %op)
  ret half %res
}

define half @faddv_v128f16(half %start, <128 x half>* %a) #0 {
; CHECK-LABEL: faddv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:h[0-9]+]], [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: fadd h0, h0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call fast half @llvm.vector.reduce.fadd.v128f16(half %start, <128 x half> %op)
  ret half %res
}

; Don't use SVE for 2 element vectors.
define float @faddv_v2f32(float %start, <2 x float> %a) #0 {
; CHECK-LABEL: faddv_v2f32:
; CHECK: faddp s1, v1.2s
; CHECK-NEXT: fadd s0, s0, s1
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v2f32(float %start, <2 x float> %a)
  ret float %res
}

; No single instruction NEON support for 4 element vectors.
define float @faddv_v4f32(float %start, <4 x float> %a) #0 {
; CHECK-LABEL: faddv_v4f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl4
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], z1.s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %res = call fast float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %a)
  ret float %res
}

define float @faddv_v8f32(float %start, <8 x float>* %a) #0 {
; CHECK-LABEL: faddv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; CHECK-NEXT: fadd s0, s0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v8f32(float %start, <8 x float> %op)
  ret float %res
}

define float @faddv_v16f32(float %start, <16 x float>* %a) #0 {
; CHECK-LABEL: faddv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: faddv [[RDX:s[0-9]+]], [[PG]], [[ADD]].s
; VBITS_EQ_256-DAG: fadd s0, s0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v16f32(float %start, <16 x float> %op)
  ret float %res
}

define float @faddv_v32f32(float %start, <32 x float>* %a) #0 {
; CHECK-LABEL: faddv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v32f32(float %start, <32 x float> %op)
  ret float %res
}

define float @faddv_v64f32(float %start, <64 x float>* %a) #0 {
; CHECK-LABEL: faddv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:s[0-9]+]], [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: fadd s0, s0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call fast float @llvm.vector.reduce.fadd.v64f32(float %start, <64 x float> %op)
  ret float %res
}

; Don't use SVE for 1 element vectors.
define double @faddv_v1f64(double %start, <1 x double> %a) #0 {
; CHECK-LABEL: faddv_v1f64:
; CHECK: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v1f64(double %start, <1 x double> %a)
  ret double %res
}

; Don't use SVE for 2 element vectors.
define double @faddv_v2f64(double %start, <2 x double> %a) #0 {
; CHECK-LABEL: faddv_v2f64:
; CHECK: faddp d1, v1.2d
; CHECK-NEXT: fadd d0, d0, d1
; CHECK-NEXT: ret
  %res = call fast double @llvm.vector.reduce.fadd.v2f64(double %start, <2 x double> %a)
  ret double %res
}

define double @faddv_v4f64(double %start, <4 x double>* %a) #0 {
; CHECK-LABEL: faddv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; CHECK-NEXT: fadd d0, d0, [[RDX]]
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v4f64(double %start, <4 x double> %op)
  ret double %res
}

define double @faddv_v8f64(double %start, <8 x double>* %a) #0 {
; CHECK-LABEL: faddv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fadd [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: faddv [[RDX:d[0-9]+]], [[PG]], [[ADD]].d
; VBITS_EQ_256-DAG: fadd d0, d0, [[RDX]]
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v8f64(double %start, <8 x double> %op)
  ret double %res
}

define double @faddv_v16f64(double %start, <16 x double>* %a) #0 {
; CHECK-LABEL: faddv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v16f64(double %start, <16 x double> %op)
  ret double %res
}

define double @faddv_v32f64(double %start, <32 x double>* %a) #0 {
; CHECK-LABEL: faddv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: faddv [[RDX:d[0-9]+]], [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: fadd d0, d0, [[RDX]]
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call fast double @llvm.vector.reduce.fadd.v32f64(double %start, <32 x double> %op)
  ret double %res
}

;
; FMAXV
;
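
; llvm.vector.reduce.fmax has maxnum (NaN-discarding) semantics, so these
; reductions lower to fmaxnm/fmaxnmv rather than fmax/fmaxv. The f16 cases
; can stay on NEON because +sve implies full FP16 support.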

; NEON has 16-bit FMAXNMV when fullfp16 is available (implied by +sve).
define half @fmaxv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v4f16:
; CHECK: fmaxnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %a)
  ret half %res
}

; NEON has 16-bit FMAXNMV when fullfp16 is available (implied by +sve).
define half @fmaxv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fmaxv_v8f16:
; CHECK: fmaxnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %a)
  ret half %res
}

define half @fmaxv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %op)
  ret half %res
}

define half @fmaxv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fmaxnmv h0, [[PG]], [[MAX]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %op)
  ret half %res
}

define half @fmaxv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v64f16(<64 x half> %op)
  ret half %res
}

define half @fmaxv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fmaxv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmax.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fmaxv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v2f32:
; CHECK: fmaxnmp s0, v0.2s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fmaxv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fmaxv_v4f32:
; CHECK: fmaxnmv s0, v0.4s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  ret float %res
}

define float @fmaxv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %op)
  ret float %res
}

define float @fmaxv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fmaxnmv s0, [[PG]], [[MAX]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %op)
  ret float %res
}

define float @fmaxv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v32f32(<32 x float> %op)
  ret float %res
}

define float @fmaxv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fmaxv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmax.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fmaxv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v1f64:
; CHECK-NOT: fmax
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmax.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fmaxv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fmaxv_v2f64:
; CHECK: fmaxnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %a)
  ret double %res
}

define double @fmaxv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %op)
  ret double %res
}

define double @fmaxv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmaxnm [[MAX:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fmaxnmv d0, [[PG]], [[MAX]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %op)
  ret double %res
}

define double @fmaxv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v16f64(<16 x double> %op)
  ret double %res
}

define double @fmaxv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fmaxv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fmaxnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %op)
  ret double %res
}

;
; FMINV
;
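
; The fminv tests mirror fmaxv: llvm.vector.reduce.fmin has minnum semantics,
; so the lowering uses fminnm/fminnmv throughout.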

; NEON has 16-bit FMINNMV when fullfp16 is available (implied by +sve).
define half @fminv_v4f16(<4 x half> %a) #0 {
; CHECK-LABEL: fminv_v4f16:
; CHECK: fminnmv h0, v0.4h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %a)
  ret half %res
}

; NEON has 16-bit FMINNMV when fullfp16 is available (implied by +sve).
define half @fminv_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: fminv_v8f16:
; CHECK: fminnmv h0, v0.8h
; CHECK-NEXT: ret
  %res = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %a)
  ret half %res
}

define half @fminv_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: fminv_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv h0, [[PG]], [[OP]].h
; CHECK-NEXT: ret
  %op = load <16 x half>, <16 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %op)
  ret half %res
}

define half @fminv_v32f16(<32 x half>* %a) #0 {
; CHECK-LABEL: fminv_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h
; VBITS_EQ_256-DAG: fminnmv h0, [[PG]], [[MIN]].h
; VBITS_EQ_256-NEXT: ret
  %op = load <32 x half>, <32 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %op)
  ret half %res
}

define half @fminv_v64f16(<64 x half>* %a) #0 {
; CHECK-LABEL: fminv_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_1024-NEXT: ret
  %op = load <64 x half>, <64 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v64f16(<64 x half> %op)
  ret half %res
}

define half @fminv_v128f16(<128 x half>* %a) #0 {
; CHECK-LABEL: fminv_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-NEXT: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv h0, [[PG]], [[OP]].h
; VBITS_GE_2048-NEXT: ret
  %op = load <128 x half>, <128 x half>* %a
  %res = call half @llvm.vector.reduce.fmin.v128f16(<128 x half> %op)
  ret half %res
}

; Don't use SVE for 64-bit f32 vectors.
define float @fminv_v2f32(<2 x float> %a) #0 {
; CHECK-LABEL: fminv_v2f32:
; CHECK: fminnmp s0, v0.2s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %a)
  ret float %res
}

; Don't use SVE for 128-bit f32 vectors.
define float @fminv_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: fminv_v4f32:
; CHECK: fminnmv s0, v0.4s
; CHECK-NEXT: ret
  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  ret float %res
}

define float @fminv_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: fminv_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv s0, [[PG]], [[OP]].s
; CHECK-NEXT: ret
  %op = load <8 x float>, <8 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %op)
  ret float %res
}

define float @fminv_v16f32(<16 x float>* %a) #0 {
; CHECK-LABEL: fminv_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s
; VBITS_EQ_256-DAG: fminnmv s0, [[PG]], [[MIN]].s
; VBITS_EQ_256-NEXT: ret
  %op = load <16 x float>, <16 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %op)
  ret float %res
}

define float @fminv_v32f32(<32 x float>* %a) #0 {
; CHECK-LABEL: fminv_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_1024-NEXT: ret
  %op = load <32 x float>, <32 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v32f32(<32 x float> %op)
  ret float %res
}

define float @fminv_v64f32(<64 x float>* %a) #0 {
; CHECK-LABEL: fminv_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-NEXT: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv s0, [[PG]], [[OP]].s
; VBITS_GE_2048-NEXT: ret
  %op = load <64 x float>, <64 x float>* %a
  %res = call float @llvm.vector.reduce.fmin.v64f32(<64 x float> %op)
  ret float %res
}

; Nothing to do for single element vectors.
define double @fminv_v1f64(<1 x double> %a) #0 {
; CHECK-LABEL: fminv_v1f64:
; CHECK-NOT: fmin
; CHECK: ret
  %res = call double @llvm.vector.reduce.fmin.v1f64(<1 x double> %a)
  ret double %res
}

; Don't use SVE for 128-bit f64 vectors.
define double @fminv_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: fminv_v2f64:
; CHECK: fminnmp d0, v0.2d
; CHECK-NEXT: ret
  %res = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %a)
  ret double %res
}

define double @fminv_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: fminv_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-NEXT: fminnmv d0, [[PG]], [[OP]].d
; CHECK-NEXT: ret
  %op = load <4 x double>, <4 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %op)
  ret double %res
}

define double @fminv_v8f64(<8 x double>* %a) #0 {
; CHECK-LABEL: fminv_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fminnm [[MIN:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d
; VBITS_EQ_256-DAG: fminnmv d0, [[PG]], [[MIN]].d
; VBITS_EQ_256-NEXT: ret
  %op = load <8 x double>, <8 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %op)
  ret double %res
}

define double @fminv_v16f64(<16 x double>* %a) #0 {
; CHECK-LABEL: fminv_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_1024-NEXT: ret
  %op = load <16 x double>, <16 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v16f64(<16 x double> %op)
  ret double %res
}

define double @fminv_v32f64(<32 x double>* %a) #0 {
; CHECK-LABEL: fminv_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-NEXT: fminnmv d0, [[PG]], [[OP]].d
; VBITS_GE_2048-NEXT: ret
  %op = load <32 x double>, <32 x double>* %a
  %res = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %op)
  ret double %res
}

attributes #0 = { "target-features"="+sve" }
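
; Note: "+sve" transitively enables full FP16 support, which the 16-bit NEON
; fmaxnmv/fminnmv checks above rely on.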

declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)

declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)

declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)

declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmax.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmax.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmax.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmax.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmax.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmax.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmax.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmax.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmax.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmax.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmax.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>)

declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>)
declare half @llvm.vector.reduce.fmin.v8f16(<8 x half>)
declare half @llvm.vector.reduce.fmin.v16f16(<16 x half>)
declare half @llvm.vector.reduce.fmin.v32f16(<32 x half>)
declare half @llvm.vector.reduce.fmin.v64f16(<64 x half>)
declare half @llvm.vector.reduce.fmin.v128f16(<128 x half>)

declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>)
declare float @llvm.vector.reduce.fmin.v16f32(<16 x float>)
declare float @llvm.vector.reduce.fmin.v32f32(<32 x float>)
declare float @llvm.vector.reduce.fmin.v64f32(<64 x float>)

declare double @llvm.vector.reduce.fmin.v1f64(<1 x double>)
declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>)
declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>)
declare double @llvm.vector.reduce.fmin.v8f64(<8 x double>)
declare double @llvm.vector.reduce.fmin.v16f64(<16 x double>)
declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>)