1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3 ; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4 ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; The module pins its own target: none of the llc invocations at the top of the
; file passes a triple on the command line.
6 target triple = "aarch64-unknown-linux-gnu"
8 ; Don't use SVE for 64-bit vectors.
; Per-lane select between two <4 x half> arguments under a <4 x i1> mask
; argument. The expected code uses only NEON v-registers: shl #15 moves each
; mask bit into the sign bit of its 16-bit lane, cmlt #0 expands that into an
; all-ones or all-zeros lane, and bif then takes the second operand's bits
; wherever the lane mask is zero.
9 define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
10 ; CHECK-LABEL: select_v4f16:
12 ; CHECK-NEXT: shl v2.4h, v2.4h, #15
13 ; CHECK-NEXT: cmlt v2.4h, v2.4h, #0
14 ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
16 %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
20 ; Don't use SVE for 128-bit vectors.
; Per-lane select between two <8 x half> arguments under an <8 x i1> mask
; argument. Compared with the 64-bit case the expected code first widens the
; mask from 8-bit to 16-bit lanes (ushll #0) so that it matches the element
; size, then applies the same shl #15 / cmlt #0 / bif sequence on NEON
; v-registers.
21 define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
22 ; CHECK-LABEL: select_v8f16:
24 ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
25 ; CHECK-NEXT: shl v2.8h, v2.8h, #15
26 ; CHECK-NEXT: cmlt v2.8h, v2.8h, #0
27 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
29 %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
; Loads <16 x half> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; vscale_range(2,0) declares a minimum vscale of 2 with no upper bound; the
; expected code covers all 16 lanes with one SVE sequence governed by
; ptrue vl16 (fcmeq into p1, sel on p1, st1h).
33 define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
34 ; CHECK-LABEL: select_v16f16:
36 ; CHECK-NEXT: ptrue p0.h, vl16
37 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
38 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
39 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
40 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
41 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
43 %op1 = load <16 x half>, ptr %a
44 %op2 = load <16 x half>, ptr %b
45 %mask = fcmp oeq <16 x half> %op1, %op2
46 %sel = select <16 x i1> %mask, <16 x half> %op1, <16 x half> %op2
47 store <16 x half> %sel, ptr %a
; Loads <32 x half> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; This function carries no vscale_range attribute, so the expected code depends
; on the vector-length option of each llc invocation:
;  - with a 256-bit minimum the work is split into two 16-lane halves, the
;    upper half addressed through x8 = 16 scaled by lsl #1;
;  - the other invocations share one expectation that covers all 32 lanes
;    under ptrue vl32.
51 define void @select_v32f16(ptr %a, ptr %b) #0 {
52 ; VBITS_GE_256-LABEL: select_v32f16:
53 ; VBITS_GE_256: // %bb.0:
54 ; VBITS_GE_256-NEXT: ptrue p0.h, vl16
55 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
56 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
57 ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
58 ; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0]
59 ; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
60 ; VBITS_GE_256-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
61 ; VBITS_GE_256-NEXT: fcmeq p2.h, p0/z, z2.h, z3.h
62 ; VBITS_GE_256-NEXT: sel z0.h, p1, z0.h, z1.h
63 ; VBITS_GE_256-NEXT: sel z1.h, p2, z2.h, z3.h
64 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
65 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
66 ; VBITS_GE_256-NEXT: ret
68 ; VBITS_GE_512-LABEL: select_v32f16:
69 ; VBITS_GE_512: // %bb.0:
70 ; VBITS_GE_512-NEXT: ptrue p0.h, vl32
71 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
72 ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
73 ; VBITS_GE_512-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
74 ; VBITS_GE_512-NEXT: sel z0.h, p1, z0.h, z1.h
75 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0]
76 ; VBITS_GE_512-NEXT: ret
77 %op1 = load <32 x half>, ptr %a
78 %op2 = load <32 x half>, ptr %b
79 %mask = fcmp oeq <32 x half> %op1, %op2
80 %sel = select <32 x i1> %mask, <32 x half> %op1, <32 x half> %op2
81 store <32 x half> %sel, ptr %a
; Loads <64 x half> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; vscale_range(8,0) declares a minimum vscale of 8 with no upper bound; the
; expected code covers all 64 lanes with one SVE sequence governed by
; ptrue vl64.
85 define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
86 ; CHECK-LABEL: select_v64f16:
88 ; CHECK-NEXT: ptrue p0.h, vl64
89 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
90 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
91 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
92 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
93 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
95 %op1 = load <64 x half>, ptr %a
96 %op2 = load <64 x half>, ptr %b
97 %mask = fcmp oeq <64 x half> %op1, %op2
98 %sel = select <64 x i1> %mask, <64 x half> %op1, <64 x half> %op2
99 store <64 x half> %sel, ptr %a
; Loads <128 x half> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; vscale_range(16,0) declares a minimum vscale of 16 with no upper bound; the
; expected code covers all 128 lanes with one SVE sequence governed by
; ptrue vl128.
103 define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
104 ; CHECK-LABEL: select_v128f16:
106 ; CHECK-NEXT: ptrue p0.h, vl128
107 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
108 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1]
109 ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h
110 ; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h
111 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
113 %op1 = load <128 x half>, ptr %a
114 %op2 = load <128 x half>, ptr %b
115 %mask = fcmp oeq <128 x half> %op1, %op2
116 %sel = select <128 x i1> %mask, <128 x half> %op1, <128 x half> %op2
117 store <128 x half> %sel, ptr %a
121 ; Don't use SVE for 64-bit vectors.
; Per-lane select between two <2 x float> arguments under a <2 x i1> mask
; argument. The expected code uses only NEON v-registers: shl #31 moves each
; mask bit into the sign bit of its 32-bit lane, cmlt #0 expands that into an
; all-ones or all-zeros lane, and bif then takes the second operand's bits
; wherever the lane mask is zero.
122 define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
123 ; CHECK-LABEL: select_v2f32:
125 ; CHECK-NEXT: shl v2.2s, v2.2s, #31
126 ; CHECK-NEXT: cmlt v2.2s, v2.2s, #0
127 ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
129 %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
133 ; Don't use SVE for 128-bit vectors.
; Per-lane select between two <4 x float> arguments under a <4 x i1> mask
; argument. The expected code first widens the mask from 16-bit to 32-bit
; lanes (ushll #0) so that it matches the element size, then applies the
; shl #31 / cmlt #0 / bif sequence on NEON v-registers.
134 define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
135 ; CHECK-LABEL: select_v4f32:
137 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0
138 ; CHECK-NEXT: shl v2.4s, v2.4s, #31
139 ; CHECK-NEXT: cmlt v2.4s, v2.4s, #0
140 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
142 %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
; Loads <8 x float> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; vscale_range(2,0) declares a minimum vscale of 2 with no upper bound; the
; expected code covers all 8 lanes with one SVE sequence governed by
; ptrue vl8 (fcmeq into p1, sel on p1, st1w).
146 define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
147 ; CHECK-LABEL: select_v8f32:
149 ; CHECK-NEXT: ptrue p0.s, vl8
150 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
151 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
152 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
153 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
154 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
156 %op1 = load <8 x float>, ptr %a
157 %op2 = load <8 x float>, ptr %b
158 %mask = fcmp oeq <8 x float> %op1, %op2
159 %sel = select <8 x i1> %mask, <8 x float> %op1, <8 x float> %op2
160 store <8 x float> %sel, ptr %a
; Loads <16 x float> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; This function carries no vscale_range attribute, so the expected code depends
; on the vector-length option of each llc invocation:
;  - with a 256-bit minimum the work is split into two 8-lane halves, the
;    upper half addressed through x8 = 8 scaled by lsl #2;
;  - the other invocations share one expectation that covers all 16 lanes
;    under ptrue vl16.
164 define void @select_v16f32(ptr %a, ptr %b) #0 {
165 ; VBITS_GE_256-LABEL: select_v16f32:
166 ; VBITS_GE_256: // %bb.0:
167 ; VBITS_GE_256-NEXT: ptrue p0.s, vl8
168 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
169 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
170 ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
171 ; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0]
172 ; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
173 ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
174 ; VBITS_GE_256-NEXT: fcmeq p2.s, p0/z, z2.s, z3.s
175 ; VBITS_GE_256-NEXT: sel z0.s, p1, z0.s, z1.s
176 ; VBITS_GE_256-NEXT: sel z1.s, p2, z2.s, z3.s
177 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
178 ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
179 ; VBITS_GE_256-NEXT: ret
181 ; VBITS_GE_512-LABEL: select_v16f32:
182 ; VBITS_GE_512: // %bb.0:
183 ; VBITS_GE_512-NEXT: ptrue p0.s, vl16
184 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
185 ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
186 ; VBITS_GE_512-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
187 ; VBITS_GE_512-NEXT: sel z0.s, p1, z0.s, z1.s
188 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0]
189 ; VBITS_GE_512-NEXT: ret
190 %op1 = load <16 x float>, ptr %a
191 %op2 = load <16 x float>, ptr %b
192 %mask = fcmp oeq <16 x float> %op1, %op2
193 %sel = select <16 x i1> %mask, <16 x float> %op1, <16 x float> %op2
194 store <16 x float> %sel, ptr %a
; Loads <32 x float> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; vscale_range(8,0) declares a minimum vscale of 8 with no upper bound; the
; expected code covers all 32 lanes with one SVE sequence governed by
; ptrue vl32.
198 define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
199 ; CHECK-LABEL: select_v32f32:
201 ; CHECK-NEXT: ptrue p0.s, vl32
202 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
203 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
204 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
205 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
206 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
208 %op1 = load <32 x float>, ptr %a
209 %op2 = load <32 x float>, ptr %b
210 %mask = fcmp oeq <32 x float> %op1, %op2
211 %sel = select <32 x i1> %mask, <32 x float> %op1, <32 x float> %op2
212 store <32 x float> %sel, ptr %a
; Loads <64 x float> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; vscale_range(16,0) declares a minimum vscale of 16 with no upper bound; the
; expected code covers all 64 lanes with one SVE sequence governed by
; ptrue vl64.
216 define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
217 ; CHECK-LABEL: select_v64f32:
219 ; CHECK-NEXT: ptrue p0.s, vl64
220 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
221 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1]
222 ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s
223 ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s
224 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
226 %op1 = load <64 x float>, ptr %a
227 %op2 = load <64 x float>, ptr %b
228 %mask = fcmp oeq <64 x float> %op1, %op2
229 %sel = select <64 x i1> %mask, <64 x float> %op1, <64 x float> %op2
230 store <64 x float> %sel, ptr %a
234 ; Don't use SVE for 64-bit vectors.
; Selects between two <1 x double> arguments under a <1 x i1> mask argument and
; returns the chosen value. In the expected code the mask is tested in a
; general-purpose register (tst w0, #0x1), csetm turns the result into an
; all-ones or all-zeros 64-bit value, fmov moves that into d2, and bif takes
; the second operand's bits wherever that mask is zero.
235 define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
236 ; CHECK-LABEL: select_v1f64:
238 ; CHECK-NEXT: tst w0, #0x1
239 ; CHECK-NEXT: csetm x8, ne
240 ; CHECK-NEXT: fmov d2, x8
241 ; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
243 %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
244 ret <1 x double> %sel
247 ; Don't use SVE for 128-bit vectors.
; Per-lane select between two <2 x double> arguments under a <2 x i1> mask
; argument; the selected vector is returned. The expected code widens the mask
; from 32-bit to 64-bit lanes (ushll #0), moves each mask bit into the sign bit
; (shl #63), expands it into an all-ones or all-zeros lane (cmlt #0), and lets
; bif take the second operand's bits wherever the lane mask is zero.
248 define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
249 ; CHECK-LABEL: select_v2f64:
251 ; CHECK-NEXT: ushll v2.2d, v2.2s, #0
252 ; CHECK-NEXT: shl v2.2d, v2.2d, #63
253 ; CHECK-NEXT: cmlt v2.2d, v2.2d, #0
254 ; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
256 %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
257 ret <2 x double> %sel
; Loads <4 x double> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; vscale_range(2,0) declares a minimum vscale of 2 with no upper bound; the
; expected code covers all 4 lanes with one SVE sequence governed by
; ptrue vl4 (fcmeq into p1, sel on p1, st1d).
260 define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
261 ; CHECK-LABEL: select_v4f64:
263 ; CHECK-NEXT: ptrue p0.d, vl4
264 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
265 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
266 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
267 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
268 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
270 %op1 = load <4 x double>, ptr %a
271 %op2 = load <4 x double>, ptr %b
272 %mask = fcmp oeq <4 x double> %op1, %op2
273 %sel = select <4 x i1> %mask, <4 x double> %op1, <4 x double> %op2
274 store <4 x double> %sel, ptr %a
; Loads <8 x double> from %a and from %b, builds the mask with an ordered-equal
; compare of the two loads, selects %op1 in lanes where the mask is set and
; %op2 elsewhere, and stores the result to %a.
; This function carries no vscale_range attribute, so the expected code depends
; on the vector-length option of each llc invocation:
;  - with a 256-bit minimum the work is split into two 4-lane halves, the
;    upper half addressed through x8 = 4 scaled by lsl #3;
;  - the other invocations share one expectation that covers all 8 lanes
;    under ptrue vl8.
278 define void @select_v8f64(ptr %a, ptr %b) #0 {
279 ; VBITS_GE_256-LABEL: select_v8f64:
280 ; VBITS_GE_256: // %bb.0:
281 ; VBITS_GE_256-NEXT: ptrue p0.d, vl4
282 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
283 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
284 ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
285 ; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0]
286 ; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
287 ; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
288 ; VBITS_GE_256-NEXT: fcmeq p2.d, p0/z, z2.d, z3.d
289 ; VBITS_GE_256-NEXT: sel z0.d, p1, z0.d, z1.d
290 ; VBITS_GE_256-NEXT: sel z1.d, p2, z2.d, z3.d
291 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
292 ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
293 ; VBITS_GE_256-NEXT: ret
295 ; VBITS_GE_512-LABEL: select_v8f64:
296 ; VBITS_GE_512: // %bb.0:
297 ; VBITS_GE_512-NEXT: ptrue p0.d, vl8
298 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
299 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
300 ; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
301 ; VBITS_GE_512-NEXT: sel z0.d, p1, z0.d, z1.d
302 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0]
303 ; VBITS_GE_512-NEXT: ret
304 %op1 = load <8 x double>, ptr %a
305 %op2 = load <8 x double>, ptr %b
306 %mask = fcmp oeq <8 x double> %op1, %op2
307 %sel = select <8 x i1> %mask, <8 x double> %op1, <8 x double> %op2
308 store <8 x double> %sel, ptr %a
; Loads <16 x double> from %a and from %b, builds the mask with an
; ordered-equal compare of the two loads, selects %op1 in lanes where the mask
; is set and %op2 elsewhere, and stores the result to %a.
; vscale_range(8,0) declares a minimum vscale of 8 with no upper bound; the
; expected code covers all 16 lanes with one SVE sequence governed by
; ptrue vl16.
312 define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
313 ; CHECK-LABEL: select_v16f64:
315 ; CHECK-NEXT: ptrue p0.d, vl16
316 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
317 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
318 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
319 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
320 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
322 %op1 = load <16 x double>, ptr %a
323 %op2 = load <16 x double>, ptr %b
324 %mask = fcmp oeq <16 x double> %op1, %op2
325 %sel = select <16 x i1> %mask, <16 x double> %op1, <16 x double> %op2
326 store <16 x double> %sel, ptr %a
; Loads <32 x double> from %a and from %b, builds the mask with an
; ordered-equal compare of the two loads, selects %op1 in lanes where the mask
; is set and %op2 elsewhere, and stores the result to %a.
; vscale_range(16,0) declares a minimum vscale of 16 with no upper bound; the
; expected code covers all 32 lanes with one SVE sequence governed by
; ptrue vl32.
330 define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
331 ; CHECK-LABEL: select_v32f64:
333 ; CHECK-NEXT: ptrue p0.d, vl32
334 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
335 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1]
336 ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d
337 ; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d
338 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
340 %op1 = load <32 x double>, ptr %a
341 %op2 = load <32 x double>, ptr %b
342 %mask = fcmp oeq <32 x double> %op1, %op2
343 %sel = select <32 x i1> %mask, <32 x double> %op1, <32 x double> %op2
344 store <32 x double> %sel, ptr %a
; Attribute group #0 is attached to every function definition visible in this
; file; it turns on the SVE target feature for them.
348 attributes #0 = { "target-features"="+sve" }