; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

;
; FMAXNM
;

; Don't use SVE for 64-bit vectors.
define <4 x half> @fmaxnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fmaxnm_v4f16:
; CHECK: fmaxnm v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fmaxnm_v8f16:
; CHECK: fmaxnm v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.maxnum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}
define void @fmaxnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.maxnum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fmaxnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.maxnum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fmaxnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.maxnum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fmaxnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fmaxnm_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.maxnum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fmaxnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fmaxnm_v2f32:
; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fmaxnm_v4f32:
; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fmaxnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fmaxnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fmaxnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.maxnum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fmaxnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fmaxnm_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.maxnum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fmaxnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fmaxnm_v1f64:
; CHECK: fmaxnm d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.maxnum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fmaxnm_v2f64:
; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fmaxnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fmaxnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmaxnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fmaxnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fmaxnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fmaxnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fmaxnm_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmaxnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.maxnum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

;
; FMINNM
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fminnm_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fminnm_v4f16:
; CHECK: fminnm v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.minnum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fminnm_v8f16:
; CHECK: fminnm v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.minnum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}

define void @fminnm_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.minnum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fminnm_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.minnum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fminnm_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.minnum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fminnm_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fminnm_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.minnum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fminnm_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fminnm_v2f32:
; CHECK: fminnm v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.minnum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fminnm_v4f32:
; CHECK: fminnm v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.minnum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fminnm_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.minnum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fminnm_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.minnum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fminnm_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.minnum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fminnm_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fminnm_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.minnum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fminnm_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fminnm_v1f64:
; CHECK: fminnm d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.minnum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fminnm_v2f64:
; CHECK: fminnm v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.minnum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fminnm_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.minnum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fminnm_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fminnm [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fminnm [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.minnum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fminnm_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.minnum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fminnm_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fminnm_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fminnm [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.minnum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

;
; FMAX
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fmax_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fmax_v4f16:
; CHECK: fmax v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.maximum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fmax_v8f16:
; CHECK: fmax v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.maximum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}

define void @fmax_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fmax_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.maximum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fmax_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fmax_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.maximum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fmax_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fmax_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.maximum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fmax_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fmax_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.maximum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fmax_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fmax_v2f32:
; CHECK: fmax v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.maximum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fmax_v4f32:
; CHECK: fmax v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.maximum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fmax_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fmax_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.maximum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fmax_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fmax_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.maximum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fmax_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fmax_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.maximum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fmax_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fmax_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.maximum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fmax_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fmax_v1f64:
; CHECK: fmax d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.maximum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fmax_v2f64:
; CHECK: fmax v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.maximum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fmax_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fmax_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.maximum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fmax_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fmax_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmax [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fmax [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.maximum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fmax_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fmax_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.maximum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fmax_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fmax_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmax [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.maximum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

;
; FMIN
;
; Don't use SVE for 64-bit vectors.
define <4 x half> @fmin_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
; CHECK-LABEL: fmin_v4f16:
; CHECK: fmin v0.4h, v0.4h, v1.4h
; CHECK-NEXT: ret
  %res = call <4 x half> @llvm.minimum.v4f16(<4 x half> %op1, <4 x half> %op2)
  ret <4 x half> %res
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) #0 {
; CHECK-LABEL: fmin_v8f16:
; CHECK: fmin v0.8h, v0.8h, v1.8h
; CHECK-NEXT: ret
  %res = call <8 x half> @llvm.minimum.v8f16(<8 x half> %op1, <8 x half> %op2)
  ret <8 x half> %res
}

define void @fmin_v16f16(<16 x half>* %a, <16 x half>* %b) #0 {
; CHECK-LABEL: fmin_v16f16:
; CHECK: ptrue [[PG:p[0-9]+]].h, vl16
; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; CHECK-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <16 x half>, <16 x half>* %a
  %op2 = load <16 x half>, <16 x half>* %b
  %res = call <16 x half> @llvm.minimum.v16f16(<16 x half> %op1, <16 x half> %op2)
  store <16 x half> %res, <16 x half>* %a
  ret void
}

define void @fmin_v32f16(<32 x half>* %a, <32 x half>* %b) #0 {
; CHECK-LABEL: fmin_v32f16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16
; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].h, [[PG]]/m, [[OP1_LO]].h, [[OP2_LO]].h
; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].h, [[PG]]/m, [[OP1_HI]].h, [[OP2_HI]].h
; VBITS_EQ_256-DAG: st1h { [[RES_LO]].h }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1h { [[RES_HI]].h }, [[PG]], [x0, x[[NUMELTS]], lsl #1]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <32 x half>, <32 x half>* %a
  %op2 = load <32 x half>, <32 x half>* %b
  %res = call <32 x half> @llvm.minimum.v32f16(<32 x half> %op1, <32 x half> %op2)
  store <32 x half> %res, <32 x half>* %a
  ret void
}
define void @fmin_v64f16(<64 x half>* %a, <64 x half>* %b) #0 {
; CHECK-LABEL: fmin_v64f16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <64 x half>, <64 x half>* %a
  %op2 = load <64 x half>, <64 x half>* %b
  %res = call <64 x half> @llvm.minimum.v64f16(<64 x half> %op1, <64 x half> %op2)
  store <64 x half> %res, <64 x half>* %a
  ret void
}

define void @fmin_v128f16(<128 x half>* %a, <128 x half>* %b) #0 {
; CHECK-LABEL: fmin_v128f16:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128
; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <128 x half>, <128 x half>* %a
  %op2 = load <128 x half>, <128 x half>* %b
  %res = call <128 x half> @llvm.minimum.v128f16(<128 x half> %op1, <128 x half> %op2)
  store <128 x half> %res, <128 x half>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <2 x float> @fmin_v2f32(<2 x float> %op1, <2 x float> %op2) #0 {
; CHECK-LABEL: fmin_v2f32:
; CHECK: fmin v0.2s, v0.2s, v1.2s
; CHECK-NEXT: ret
  %res = call <2 x float> @llvm.minimum.v2f32(<2 x float> %op1, <2 x float> %op2)
  ret <2 x float> %res
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) #0 {
; CHECK-LABEL: fmin_v4f32:
; CHECK: fmin v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
  %res = call <4 x float> @llvm.minimum.v4f32(<4 x float> %op1, <4 x float> %op2)
  ret <4 x float> %res
}

define void @fmin_v8f32(<8 x float>* %a, <8 x float>* %b) #0 {
; CHECK-LABEL: fmin_v8f32:
; CHECK: ptrue [[PG:p[0-9]+]].s, vl8
; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; CHECK-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <8 x float>, <8 x float>* %a
  %op2 = load <8 x float>, <8 x float>* %b
  %res = call <8 x float> @llvm.minimum.v8f32(<8 x float> %op1, <8 x float> %op2)
  store <8 x float> %res, <8 x float>* %a
  ret void
}

define void @fmin_v16f32(<16 x float>* %a, <16 x float>* %b) #0 {
; CHECK-LABEL: fmin_v16f32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8
; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].s, [[PG]]/m, [[OP1_LO]].s, [[OP2_LO]].s
; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].s, [[PG]]/m, [[OP1_HI]].s, [[OP2_HI]].s
; VBITS_EQ_256-DAG: st1w { [[RES_LO]].s }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1w { [[RES_HI]].s }, [[PG]], [x0, x[[NUMELTS]], lsl #2]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <16 x float>, <16 x float>* %a
  %op2 = load <16 x float>, <16 x float>* %b
  %res = call <16 x float> @llvm.minimum.v16f32(<16 x float> %op1, <16 x float> %op2)
  store <16 x float> %res, <16 x float>* %a
  ret void
}
define void @fmin_v32f32(<32 x float>* %a, <32 x float>* %b) #0 {
; CHECK-LABEL: fmin_v32f32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <32 x float>, <32 x float>* %a
  %op2 = load <32 x float>, <32 x float>* %b
  %res = call <32 x float> @llvm.minimum.v32f32(<32 x float> %op1, <32 x float> %op2)
  store <32 x float> %res, <32 x float>* %a
  ret void
}

define void @fmin_v64f32(<64 x float>* %a, <64 x float>* %b) #0 {
; CHECK-LABEL: fmin_v64f32:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64
; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <64 x float>, <64 x float>* %a
  %op2 = load <64 x float>, <64 x float>* %b
  %res = call <64 x float> @llvm.minimum.v64f32(<64 x float> %op1, <64 x float> %op2)
  store <64 x float> %res, <64 x float>* %a
  ret void
}
; Don't use SVE for 64-bit vectors.
define <1 x double> @fmin_v1f64(<1 x double> %op1, <1 x double> %op2) #0 {
; CHECK-LABEL: fmin_v1f64:
; CHECK: fmin d0, d0, d1
; CHECK-NEXT: ret
  %res = call <1 x double> @llvm.minimum.v1f64(<1 x double> %op1, <1 x double> %op2)
  ret <1 x double> %res
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) #0 {
; CHECK-LABEL: fmin_v2f64:
; CHECK: fmin v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
  %res = call <2 x double> @llvm.minimum.v2f64(<2 x double> %op1, <2 x double> %op2)
  ret <2 x double> %res
}

define void @fmin_v4f64(<4 x double>* %a, <4 x double>* %b) #0 {
; CHECK-LABEL: fmin_v4f64:
; CHECK: ptrue [[PG:p[0-9]+]].d, vl4
; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; CHECK-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; CHECK-NEXT: ret
  %op1 = load <4 x double>, <4 x double>* %a
  %op2 = load <4 x double>, <4 x double>* %b
  %res = call <4 x double> @llvm.minimum.v4f64(<4 x double> %op1, <4 x double> %op2)
  store <4 x double> %res, <4 x double>* %a
  ret void
}

define void @fmin_v8f64(<8 x double>* %a, <8 x double>* %b) #0 {
; CHECK-LABEL: fmin_v8f64:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8
; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_512-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_512-NEXT: ret

; Ensure sensible type legalisation.
; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-DAG: fmin [[RES_LO:z[0-9]+]].d, [[PG]]/m, [[OP1_LO]].d, [[OP2_LO]].d
; VBITS_EQ_256-DAG: fmin [[RES_HI:z[0-9]+]].d, [[PG]]/m, [[OP1_HI]].d, [[OP2_HI]].d
; VBITS_EQ_256-DAG: st1d { [[RES_LO]].d }, [[PG]], [x0]
; VBITS_EQ_256-DAG: st1d { [[RES_HI]].d }, [[PG]], [x0, x[[NUMELTS]], lsl #3]
; VBITS_EQ_256-NEXT: ret
  %op1 = load <8 x double>, <8 x double>* %a
  %op2 = load <8 x double>, <8 x double>* %b
  %res = call <8 x double> @llvm.minimum.v8f64(<8 x double> %op1, <8 x double> %op2)
  store <8 x double> %res, <8 x double>* %a
  ret void
}
define void @fmin_v16f64(<16 x double>* %a, <16 x double>* %b) #0 {
; CHECK-LABEL: fmin_v16f64:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16
; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_1024-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load <16 x double>, <16 x double>* %a
  %op2 = load <16 x double>, <16 x double>* %b
  %res = call <16 x double> @llvm.minimum.v16f64(<16 x double> %op1, <16 x double> %op2)
  store <16 x double> %res, <16 x double>* %a
  ret void
}

define void @fmin_v32f64(<32 x double>* %a, <32 x double>* %b) #0 {
; CHECK-LABEL: fmin_v32f64:
; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32
; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
; VBITS_GE_2048-NEXT: fmin [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load <32 x double>, <32 x double>* %a
  %op2 = load <32 x double>, <32 x double>* %b
  %res = call <32 x double> @llvm.minimum.v32f64(<32 x double> %op1, <32 x double> %op2)
  store <32 x double> %res, <32 x double>* %a
  ret void
}

attributes #0 = { "target-features"="+sve" }
declare <4 x half> @llvm.minnum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.minnum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.minnum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.minnum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.minnum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.minnum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.minnum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.minnum.v32f64(<32 x double>, <32 x double>)

declare <4 x half> @llvm.maxnum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.maxnum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.maxnum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.maxnum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.maxnum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.maxnum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.maxnum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.maxnum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.maxnum.v32f64(<32 x double>, <32 x double>)

declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.minimum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.minimum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.minimum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.minimum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.minimum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.minimum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.minimum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.minimum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.minimum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.minimum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.minimum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.minimum.v32f64(<32 x double>, <32 x double>)

declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
declare <16 x half> @llvm.maximum.v16f16(<16 x half>, <16 x half>)
declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
declare <64 x half> @llvm.maximum.v64f16(<64 x half>, <64 x half>)
declare <128 x half> @llvm.maximum.v128f16(<128 x half>, <128 x half>)
declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
declare <8 x float> @llvm.maximum.v8f32(<8 x float>, <8 x float>)
declare <16 x float> @llvm.maximum.v16f32(<16 x float>, <16 x float>)
declare <32 x float> @llvm.maximum.v32f32(<32 x float>, <32 x float>)
declare <64 x float> @llvm.maximum.v64f32(<64 x float>, <64 x float>)
declare <1 x double> @llvm.maximum.v1f64(<1 x double>, <1 x double>)
declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>)
declare <8 x double> @llvm.maximum.v8f64(<8 x double>, <8 x double>)
declare <16 x double> @llvm.maximum.v16f64(<16 x double>, <16 x double>)
declare <32 x double> @llvm.maximum.v32f64(<32 x double>, <32 x double>)