; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=32
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue

; Don't use SVE for 64-bit vectors.
define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) #0 {
; CHECK: select_v8i8:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.8b, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <8 x i8> %op1, <8 x i8> %op2
  ret <8 x i8> %sel
}

; Don't use SVE for 128-bit vectors.
define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) #0 {
; CHECK: select_v16i8:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.16b, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <16 x i8> %op1, <16 x i8> %op2
  ret <16 x i8> %sel
}

define void @select_v32i8(<32 x i8>* %a, <32 x i8>* %b, i1 %mask) #0 {
; CHECK: select_v32i8:
; CHECK: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; CHECK-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].b
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; CHECK-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; CHECK-NEXT: ret
  %op1 = load volatile <32 x i8>, <32 x i8>* %a
  %op2 = load volatile <32 x i8>, <32 x i8>* %b
  %sel = select i1 %mask, <32 x i8> %op1, <32 x i8> %op2
  store <32 x i8> %sel, <32 x i8>* %a
  ret void
}

define void @select_v64i8(<64 x i8>* %a, <64 x i8>* %b, i1 %mask) #0 {
; CHECK: select_v64i8:
; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].b
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; VBITS_GE_512-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load volatile <64 x i8>, <64 x i8>* %a
  %op2 = load volatile <64 x i8>, <64 x i8>* %b
  %sel = select i1 %mask, <64 x i8> %op1, <64 x i8> %op2
  store <64 x i8> %sel, <64 x i8>* %a
  ret void
}

define void @select_v128i8(<128 x i8>* %a, <128 x i8>* %b, i1 %mask) #0 {
; CHECK: select_v128i8:
; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].b
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; VBITS_GE_1024-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load volatile <128 x i8>, <128 x i8>* %a
  %op2 = load volatile <128 x i8>, <128 x i8>* %b
  %sel = select i1 %mask, <128 x i8> %op1, <128 x i8> %op2
  store <128 x i8> %sel, <128 x i8>* %a
  ret void
}

define void @select_v256i8(<256 x i8>* %a, <256 x i8>* %b, i1 %mask) #0 {
; CHECK: select_v256i8:
; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl[[#min(VBYTES,256)]]
; VBITS_GE_2048-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ld1b { [[OP1:z[0-9]+]].b }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1b { [[OP2:z[0-9]+]].b }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].b, w[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].b, [[TMP1]].b, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].b
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].b, [[PG2]]/z, [[TMP2]].b, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].b, [[PRES]], [[OP1]].b, [[OP2]].b
; VBITS_GE_2048-NEXT: st1b { [[RES]].b }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load volatile <256 x i8>, <256 x i8>* %a
  %op2 = load volatile <256 x i8>, <256 x i8>* %b
  %sel = select i1 %mask, <256 x i8> %op1, <256 x i8> %op2
  store <256 x i8> %sel, <256 x i8>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) #0 {
; CHECK: select_v4i16:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.4h, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <4 x i16> %op1, <4 x i16> %op2
  ret <4 x i16> %sel
}

; Don't use SVE for 128-bit vectors.
define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) #0 {
; CHECK: select_v8i16:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.8h, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <8 x i16> %op1, <8 x i16> %op2
  ret <8 x i16> %sel
}

define void @select_v16i16(<16 x i16>* %a, <16 x i16>* %b, i1 %mask) #0 {
; CHECK: select_v16i16:
; CHECK: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]]
; CHECK-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].h
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; CHECK-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; CHECK-NEXT: ret
  %op1 = load volatile <16 x i16>, <16 x i16>* %a
  %op2 = load volatile <16 x i16>, <16 x i16>* %b
  %sel = select i1 %mask, <16 x i16> %op1, <16 x i16> %op2
  store <16 x i16> %sel, <16 x i16>* %a
  ret void
}

define void @select_v32i16(<32 x i16>* %a, <32 x i16>* %b, i1 %mask) #0 {
; CHECK: select_v32i16:
; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]]
; VBITS_GE_512-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; VBITS_GE_512-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load volatile <32 x i16>, <32 x i16>* %a
  %op2 = load volatile <32 x i16>, <32 x i16>* %b
  %sel = select i1 %mask, <32 x i16> %op1, <32 x i16> %op2
  store <32 x i16> %sel, <32 x i16>* %a
  ret void
}

define void @select_v64i16(<64 x i16>* %a, <64 x i16>* %b, i1 %mask) #0 {
; CHECK: select_v64i16:
; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]]
; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].h
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; VBITS_GE_1024-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load volatile <64 x i16>, <64 x i16>* %a
  %op2 = load volatile <64 x i16>, <64 x i16>* %b
  %sel = select i1 %mask, <64 x i16> %op1, <64 x i16> %op2
  store <64 x i16> %sel, <64 x i16>* %a
  ret void
}

define void @select_v128i16(<128 x i16>* %a, <128 x i16>* %b, i1 %mask) #0 {
; CHECK: select_v128i16:
; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]]
; VBITS_GE_2048-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ld1h { [[OP1:z[0-9]+]].h }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1h { [[OP2:z[0-9]+]].h }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].h, w[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].h, [[TMP1]].h, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].h
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].h, [[PG2]]/z, [[TMP2]].h, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].h, [[PRES]], [[OP1]].h, [[OP2]].h
; VBITS_GE_2048-NEXT: st1h { [[RES]].h }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load volatile <128 x i16>, <128 x i16>* %a
  %op2 = load volatile <128 x i16>, <128 x i16>* %b
  %sel = select i1 %mask, <128 x i16> %op1, <128 x i16> %op2
  store <128 x i16> %sel, <128 x i16>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) #0 {
; CHECK: select_v2i32:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.2s, w8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <2 x i32> %op1, <2 x i32> %op2
  ret <2 x i32> %sel
}

; Don't use SVE for 128-bit vectors.
define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) #0 {
; CHECK: select_v4i32:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm w8, ne
; CHECK-NEXT: dup v2.4s, w8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <4 x i32> %op1, <4 x i32> %op2
  ret <4 x i32> %sel
}

define void @select_v8i32(<8 x i32>* %a, <8 x i32>* %b, i1 %mask) #0 {
; CHECK: select_v8i32:
; CHECK: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]]
; CHECK-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].s
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; CHECK-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; CHECK-NEXT: ret
  %op1 = load volatile <8 x i32>, <8 x i32>* %a
  %op2 = load volatile <8 x i32>, <8 x i32>* %b
  %sel = select i1 %mask, <8 x i32> %op1, <8 x i32> %op2
  store <8 x i32> %sel, <8 x i32>* %a
  ret void
}

define void @select_v16i32(<16 x i32>* %a, <16 x i32>* %b, i1 %mask) #0 {
; CHECK: select_v16i32:
; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]]
; VBITS_GE_512-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load volatile <16 x i32>, <16 x i32>* %a
  %op2 = load volatile <16 x i32>, <16 x i32>* %b
  %sel = select i1 %mask, <16 x i32> %op1, <16 x i32> %op2
  store <16 x i32> %sel, <16 x i32>* %a
  ret void
}

define void @select_v32i32(<32 x i32>* %a, <32 x i32>* %b, i1 %mask) #0 {
; CHECK: select_v32i32:
; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]]
; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load volatile <32 x i32>, <32 x i32>* %a
  %op2 = load volatile <32 x i32>, <32 x i32>* %b
  %sel = select i1 %mask, <32 x i32> %op1, <32 x i32> %op2
  store <32 x i32> %sel, <32 x i32>* %a
  ret void
}

define void @select_v64i32(<64 x i32>* %a, <64 x i32>* %b, i1 %mask) #0 {
; CHECK: select_v64i32:
; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]]
; VBITS_GE_2048-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ld1w { [[OP1:z[0-9]+]].s }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].s, w[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].s, [[TMP1]].s, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].s
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].s, [[PG2]]/z, [[TMP2]].s, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].s, [[PRES]], [[OP1]].s, [[OP2]].s
; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load volatile <64 x i32>, <64 x i32>* %a
  %op2 = load volatile <64 x i32>, <64 x i32>* %b
  %sel = select i1 %mask, <64 x i32> %op1, <64 x i32> %op2
  store <64 x i32> %sel, <64 x i32>* %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) #0 {
; CHECK: select_v1i64:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <1 x i64> %op1, <1 x i64> %op2
  ret <1 x i64> %sel
}

; Don't use SVE for 128-bit vectors.
define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) #0 {
; CHECK: select_v2i64:
; CHECK: tst w0, #0x1
; CHECK-NEXT: csetm x8, ne
; CHECK-NEXT: dup v2.2d, x8
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
  %sel = select i1 %mask, <2 x i64> %op1, <2 x i64> %op2
  ret <2 x i64> %sel
}

define void @select_v4i64(<4 x i64>* %a, <4 x i64>* %b, i1 %mask) #0 {
; CHECK: select_v4i64:
; CHECK: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]]
; CHECK-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; CHECK-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; CHECK-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; CHECK-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; CHECK-NEXT: ptrue [[PG2:p[0-9]+]].d
; CHECK-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; CHECK-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; CHECK-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; CHECK-NEXT: ret
  %op1 = load volatile <4 x i64>, <4 x i64>* %a
  %op2 = load volatile <4 x i64>, <4 x i64>* %b
  %sel = select i1 %mask, <4 x i64> %op1, <4 x i64> %op2
  store <4 x i64> %sel, <4 x i64>* %a
  ret void
}

define void @select_v8i64(<8 x i64>* %a, <8 x i64>* %b, i1 %mask) #0 {
; CHECK: select_v8i64:
; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]]
; VBITS_GE_512-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_512-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; VBITS_GE_512-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_512-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; VBITS_GE_512-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; VBITS_GE_512-NEXT: ret
  %op1 = load volatile <8 x i64>, <8 x i64>* %a
  %op2 = load volatile <8 x i64>, <8 x i64>* %b
  %sel = select i1 %mask, <8 x i64> %op1, <8 x i64> %op2
  store <8 x i64> %sel, <8 x i64>* %a
  ret void
}

define void @select_v16i64(<16 x i64>* %a, <16 x i64>* %b, i1 %mask) #0 {
; CHECK: select_v16i64:
; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]]
; VBITS_GE_1024-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_1024-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; VBITS_GE_1024-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; VBITS_GE_1024-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_1024-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; VBITS_GE_1024-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; VBITS_GE_1024-NEXT: ret
  %op1 = load volatile <16 x i64>, <16 x i64>* %a
  %op2 = load volatile <16 x i64>, <16 x i64>* %b
  %sel = select i1 %mask, <16 x i64> %op1, <16 x i64> %op2
  store <16 x i64> %sel, <16 x i64>* %a
  ret void
}

define void @select_v32i64(<32 x i64>* %a, <32 x i64>* %b, i1 %mask) #0 {
; CHECK: select_v32i64:
; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]]
; VBITS_GE_2048-NEXT: and w[[AND:[0-9]+]], w2, #0x1
; VBITS_GE_2048-NEXT: ld1d { [[OP1:z[0-9]+]].d }, [[PG1]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[TMP1:z[0-9]+]].d, x[[AND]]
; VBITS_GE_2048-NEXT: and [[TMP2:z[0-9]+]].d, [[TMP1]].d, #0x1
; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d
; VBITS_GE_2048-NEXT: cmpne [[PRES:p[0-9]+]].d, [[PG2]]/z, [[TMP2]].d, #0
; VBITS_GE_2048-NEXT: sel [[RES:z[0-9]+]].d, [[PRES]], [[OP1]].d, [[OP2]].d
; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG1]], [x0]
; VBITS_GE_2048-NEXT: ret
  %op1 = load volatile <32 x i64>, <32 x i64>* %a
  %op2 = load volatile <32 x i64>, <32 x i64>* %b
  %sel = select i1 %mask, <32 x i64> %op1, <32 x i64> %op2
  store <32 x i64> %sel, <32 x i64>* %a
  ret void
}

attributes #0 = { "target-features"="+sve" }