1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
5 ; Test expansion of scalar selects on vectors.
6 ; Evergreen is not enabled because it appears to have problems with doubles.
; Whole-vector select on <2 x i8>. Both operands are loaded through
; addrspace(1) pointers (align 2); the single scalar condition %c == 0 picks
; %a when true and %b otherwise, and the result is stored to %out (align 2).
; No instruction-level checks are visible for this function in this excerpt.
8 ; GCN-LABEL: {{^}}v_select_v2i8:
15 ; This is worse when i16 is legal and packed is not because
16 ; SelectionDAGBuilder for some reason changes the select type.
19 define amdgpu_kernel void @v_select_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %a.ptr, <2 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
20 %a = load <2 x i8>, <2 x i8> addrspace(1)* %a.ptr, align 2
21 %b = load <2 x i8>, <2 x i8> addrspace(1)* %b.ptr, align 2
22 %cmp = icmp eq i32 %c, 0
23 %select = select i1 %cmp, <2 x i8> %a, <2 x i8> %b
24 store <2 x i8> %select, <2 x i8> addrspace(1)* %out, align 2
; Whole-vector select on <4 x i8> (32 bits of data) with both operands loaded
; through addrspace(1) pointers. The condition is %c == 0. The visible check
; looks for one v_cndmask_b32_e32 after the function label.
28 ; GCN-LABEL: {{^}}v_select_v4i8:
29 ; GCN: v_cndmask_b32_e32
31 define amdgpu_kernel void @v_select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %a.ptr, <4 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
32 %a = load <4 x i8>, <4 x i8> addrspace(1)* %a.ptr
33 %b = load <4 x i8>, <4 x i8> addrspace(1)* %b.ptr
34 %cmp = icmp eq i32 %c, 0
35 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
36 store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
; Whole-vector select on <8 x i8> (64 bits of data) with both operands loaded
; through addrspace(1) pointers. The condition is %c == 0. The visible checks
; look for two v_cndmask_b32_e32 instructions, in order, after the label.
40 ; GCN-LABEL: {{^}}v_select_v8i8:
41 ; GCN: v_cndmask_b32_e32
42 ; GCN: v_cndmask_b32_e32
44 define amdgpu_kernel void @v_select_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %a.ptr, <8 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
45 %a = load <8 x i8>, <8 x i8> addrspace(1)* %a.ptr
46 %b = load <8 x i8>, <8 x i8> addrspace(1)* %b.ptr
47 %cmp = icmp eq i32 %c, 0
48 %select = select i1 %cmp, <8 x i8> %a, <8 x i8> %b
49 store <8 x i8> %select, <8 x i8> addrspace(1)* %out, align 4
; Whole-vector select on <16 x i8> (128 bits of data) with both operands
; loaded through addrspace(1) pointers. The condition is %c == 0. The visible
; checks look for four v_cndmask_b32_e32 instructions, in order.
53 ; GCN-LABEL: {{^}}v_select_v16i8:
54 ; GCN: v_cndmask_b32_e32
55 ; GCN: v_cndmask_b32_e32
56 ; GCN: v_cndmask_b32_e32
57 ; GCN: v_cndmask_b32_e32
59 define amdgpu_kernel void @v_select_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %a.ptr, <16 x i8> addrspace(1)* %b.ptr, i32 %c) #0 {
60 %a = load <16 x i8>, <16 x i8> addrspace(1)* %a.ptr
61 %b = load <16 x i8>, <16 x i8> addrspace(1)* %b.ptr
62 %cmp = icmp eq i32 %c, 0
63 %select = select i1 %cmp, <16 x i8> %a, <16 x i8> %b
64 store <16 x i8> %select, <16 x i8> addrspace(1)* %out, align 4
; Whole-vector select on <4 x i8> where both operands are passed by value as
; kernel arguments rather than loaded. Unlike the neighbouring tests, the
; condition operand %c is an i8 and is compared against 0 as an i8.
; Only the function label is checked in the visible lines.
68 ; GCN-LABEL: {{^}}select_v4i8:
71 define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) #0 {
72 %cmp = icmp eq i8 %c, 0
73 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
74 store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
; Whole-vector select on <2 x i16> with both operands passed by value as
; kernel arguments; the condition is %c == 0. The checks look for one
; v_cndmask_b32_e32 and then reject any further v_cndmask_b32 after it.
78 ; GCN-LABEL: {{^}}select_v2i16:
79 ; GCN: v_cndmask_b32_e32
80 ; GCN-NOT: v_cndmask_b32
81 define amdgpu_kernel void @select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b, i32 %c) #0 {
82 %cmp = icmp eq i32 %c, 0
83 %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
84 store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
; Whole-vector select on <2 x i16> with both operands loaded through
; addrspace(1) pointers; the condition is %c == 0. The visible check looks
; for one v_cndmask_b32_e32 after the label.
88 ; GCN-LABEL: {{^}}v_select_v2i16:
89 ; GCN: v_cndmask_b32_e32
91 define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
92 %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.ptr
93 %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.ptr
94 %cmp = icmp eq i32 %c, 0
95 %select = select i1 %cmp, <2 x i16> %a, <2 x i16> %b
96 store <2 x i16> %select, <2 x i16> addrspace(1)* %out, align 4
; Whole-vector select on the odd-sized <3 x i16> type with both operands
; loaded through addrspace(1) pointers; the condition is %c == 0.
; In the visible lines only the SI and GFX9 prefixes carry an instruction
; check (one v_cndmask_b32_e32 each); no VI-specific check is visible here.
100 ; GCN-LABEL: {{^}}v_select_v3i16:
101 ; SI: v_cndmask_b32_e32
105 ; GFX9: v_cndmask_b32_e32
112 define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
113 %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
114 %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
115 %cmp = icmp eq i32 %c, 0
116 %select = select i1 %cmp, <3 x i16> %a, <3 x i16> %b
117 store <3 x i16> %select, <3 x i16> addrspace(1)* %out, align 4
; Whole-vector select on <4 x i16> (64 bits of data) with both operands
; loaded through addrspace(1) pointers; the condition is %c == 0. The visible
; checks look for two v_cndmask_b32_e32 instructions, in order.
121 ; GCN-LABEL: {{^}}v_select_v4i16:
122 ; GCN: v_cndmask_b32_e32
123 ; GCN: v_cndmask_b32_e32
125 define amdgpu_kernel void @v_select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %a.ptr, <4 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
126 %a = load <4 x i16>, <4 x i16> addrspace(1)* %a.ptr
127 %b = load <4 x i16>, <4 x i16> addrspace(1)* %b.ptr
128 %cmp = icmp eq i32 %c, 0
129 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
130 store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
; Whole-vector select on <8 x i16> (128 bits of data) with both operands
; loaded through addrspace(1) pointers; the condition is %c == 0. The visible
; checks look for four v_cndmask_b32_e32 instructions, in order.
134 ; GCN-LABEL: {{^}}v_select_v8i16:
135 ; GCN: v_cndmask_b32_e32
136 ; GCN: v_cndmask_b32_e32
137 ; GCN: v_cndmask_b32_e32
138 ; GCN: v_cndmask_b32_e32
140 define amdgpu_kernel void @v_select_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %a.ptr, <8 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
141 %a = load <8 x i16>, <8 x i16> addrspace(1)* %a.ptr
142 %b = load <8 x i16>, <8 x i16> addrspace(1)* %b.ptr
143 %cmp = icmp eq i32 %c, 0
144 %select = select i1 %cmp, <8 x i16> %a, <8 x i16> %b
145 store <8 x i16> %select, <8 x i16> addrspace(1)* %out, align 4
; Whole-vector select on <2 x i32> with both operands passed by value as
; kernel arguments; the condition is %c == 0. The checks look for two
; v_cndmask_b32_e32 instructions followed by a buffer_store_dwordx2.
149 ; FIXME: Expansion with bitwise operations may be better if doing a
150 ; vector select with SGPR inputs.
152 ; GCN-LABEL: {{^}}s_select_v2i32:
153 ; GCN: v_cndmask_b32_e32
154 ; GCN: v_cndmask_b32_e32
155 ; GCN: buffer_store_dwordx2
156 define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
157 %cmp = icmp eq i32 %c, 0
158 %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
159 store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
; Whole-vector select on <4 x i32> with both operands passed by value as
; kernel arguments; the condition is %c == 0. The checks look for four
; v_cndmask_b32_e32 instructions followed by a buffer_store_dwordx4.
163 ; GCN-LABEL: {{^}}s_select_v4i32:
164 ; GCN: v_cndmask_b32_e32
165 ; GCN: v_cndmask_b32_e32
166 ; GCN: v_cndmask_b32_e32
167 ; GCN: v_cndmask_b32_e32
168 ; GCN: buffer_store_dwordx4
169 define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
170 %cmp = icmp eq i32 %c, 0
171 %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
172 store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
; Select between a loaded <4 x i32> value and zeroinitializer. The condition
; is an unsigned compare, %cond < 32; when it holds the loaded value is kept,
; otherwise the all-zero vector is stored.
; Expected sequence: a buffer_load_dwordx4, a v_cmp_lt_u32_e64 that writes vcc
; and compares an SGPR against 32, four v_cndmask_b32_e32 whose first source
; operand is the literal 0, then a buffer_store_dwordx4.
176 ; GCN-LABEL: {{^}}v_select_v4i32:
177 ; GCN: buffer_load_dwordx4
178 ; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
179 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
180 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
181 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
182 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
183 ; GCN: buffer_store_dwordx4
184 define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
186 %tmp2 = icmp ult i32 %cond, 32
187 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
188 %tmp3 = select i1 %tmp2, <4 x i32> %val, <4 x i32> zeroinitializer
189 store <4 x i32> %tmp3, <4 x i32> addrspace(1)* %out, align 16
; Whole-vector select on <8 x i32> with both operands passed by value as
; kernel arguments; the condition is %c == 0. The checks look for eight
; v_cndmask_b32_e32 instructions, in order (the vector has eight i32 lanes).
193 ; GCN-LABEL: {{^}}select_v8i32:
194 ; GCN: v_cndmask_b32_e32
195 ; GCN: v_cndmask_b32_e32
196 ; GCN: v_cndmask_b32_e32
197 ; GCN: v_cndmask_b32_e32
198 ; GCN: v_cndmask_b32_e32
199 ; GCN: v_cndmask_b32_e32
200 ; GCN: v_cndmask_b32_e32
201 ; GCN: v_cndmask_b32_e32
202 define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) #0 {
203 %cmp = icmp eq i32 %c, 0
204 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
205 store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
; Whole-vector select on <2 x float> with both operands passed by value as
; kernel arguments; the condition is %c == 0.
; The -DAG checks may match in any order. They capture the register pairs
; written by two s_load_dwordx2 instructions (ALO/AHI and BLO/BHI), then look
; for a v_mov_b32_e32 from each captured half, a v_cmp_eq_u32_e64 against 0
; that writes vcc, and two v_cndmask_b32_e32. The final ordered check looks
; for a buffer_store_dwordx2 after all of them.
; The two alternatives in each s_load offset pattern presumably correspond to
; different subtarget encodings -- TODO confirm.
; NOTE(review): the store below claims align 16 for an 8-byte vector, whereas
; the <2 x i32> test in this file uses align 8 -- confirm this is intentional.
209 ; GCN-LABEL: {{^}}s_select_v2f32:
210 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
211 ; GCN-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
213 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
214 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
215 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
216 ; GCN-DAG: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
218 ; GCN-DAG: v_cndmask_b32_e32
219 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
220 ; GCN-DAG: v_cndmask_b32_e32
221 ; GCN: buffer_store_dwordx2
222 define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) #0 {
223 %cmp = icmp eq i32 %c, 0
224 %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
225 store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
; Whole-vector select on <4 x float> with both operands passed by value as
; kernel arguments; the condition is %c == 0.
; Expected sequence: two s_load_dwordx4, a v_cmp_eq_u32_e64 that writes vcc
; and compares an SGPR against 0 (the 0 must end the line), four
; v_cndmask_b32_e32, then a buffer_store_dwordx4.
229 ; GCN-LABEL: {{^}}s_select_v4f32:
230 ; GCN: s_load_dwordx4
231 ; GCN: s_load_dwordx4
232 ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
234 ; GCN: v_cndmask_b32_e32
235 ; GCN: v_cndmask_b32_e32
236 ; GCN: v_cndmask_b32_e32
237 ; GCN: v_cndmask_b32_e32
239 ; GCN: buffer_store_dwordx4
240 define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) #0 {
241 %cmp = icmp eq i32 %c, 0
242 %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
243 store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
; Select between a loaded <4 x float> value and zeroinitializer. The
; condition is an unsigned compare, %cond < 32; when it holds the loaded value
; is kept, otherwise the all-zero vector is stored.
; Expected sequence: a buffer_load_dwordx4, a v_cmp_lt_u32_e64 that writes vcc
; and compares an SGPR against 32, four v_cndmask_b32_e32 whose first source
; operand is the literal 0, then a buffer_store_dwordx4.
247 ; GCN-LABEL: {{^}}v_select_v4f32:
248 ; GCN: buffer_load_dwordx4
249 ; GCN: v_cmp_lt_u32_e64 vcc, s{{[0-9]+}}, 32
250 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
251 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
252 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
253 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
254 ; GCN: buffer_store_dwordx4
255 define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
257 %tmp2 = icmp ult i32 %cond, 32
258 %val = load <4 x float>, <4 x float> addrspace(1)* %in
259 %tmp3 = select i1 %tmp2, <4 x float> %val, <4 x float> zeroinitializer
260 store <4 x float> %tmp3, <4 x float> addrspace(1)* %out, align 16
; Whole-vector select on <8 x float> with both operands passed by value as
; kernel arguments; the condition is %c == 0. The checks look for eight
; v_cndmask_b32_e32 instructions, in order (the vector has eight float lanes).
264 ; GCN-LABEL: {{^}}select_v8f32:
265 ; GCN: v_cndmask_b32_e32
266 ; GCN: v_cndmask_b32_e32
267 ; GCN: v_cndmask_b32_e32
268 ; GCN: v_cndmask_b32_e32
269 ; GCN: v_cndmask_b32_e32
270 ; GCN: v_cndmask_b32_e32
271 ; GCN: v_cndmask_b32_e32
272 ; GCN: v_cndmask_b32_e32
273 define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) #0 {
274 %cmp = icmp eq i32 %c, 0
275 %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
276 store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
; Whole-vector select on <2 x double> (128 bits of data) with both operands
; passed by value as kernel arguments; the condition is %c == 0. The checks
; look for four v_cndmask_b32_e32 instructions, in order -- presumably one
; per 32-bit half of each double.
280 ; GCN-LABEL: {{^}}select_v2f64:
281 ; GCN: v_cndmask_b32_e32
282 ; GCN: v_cndmask_b32_e32
283 ; GCN: v_cndmask_b32_e32
284 ; GCN: v_cndmask_b32_e32
285 define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) #0 {
286 %cmp = icmp eq i32 %c, 0
287 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
288 store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
; Whole-vector select on <4 x double> (256 bits of data) with both operands
; passed by value as kernel arguments; the condition is %c == 0. The checks
; look for eight v_cndmask_b32_e32 instructions, in order -- presumably one
; per 32-bit half of each double.
292 ; GCN-LABEL: {{^}}select_v4f64:
293 ; GCN: v_cndmask_b32_e32
294 ; GCN: v_cndmask_b32_e32
295 ; GCN: v_cndmask_b32_e32
296 ; GCN: v_cndmask_b32_e32
297 ; GCN: v_cndmask_b32_e32
298 ; GCN: v_cndmask_b32_e32
299 ; GCN: v_cndmask_b32_e32
300 ; GCN: v_cndmask_b32_e32
301 define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) #0 {
302 %cmp = icmp eq i32 %c, 0
303 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
304 store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
; Whole-vector select on <8 x double> (512 bits of data) with both operands
; passed by value as kernel arguments; the condition is %c == 0. The checks
; look for sixteen v_cndmask_b32_e32 instructions, in order -- presumably one
; per 32-bit half of each double.
308 ; GCN-LABEL: {{^}}select_v8f64:
309 ; GCN: v_cndmask_b32_e32
310 ; GCN: v_cndmask_b32_e32
311 ; GCN: v_cndmask_b32_e32
312 ; GCN: v_cndmask_b32_e32
313 ; GCN: v_cndmask_b32_e32
314 ; GCN: v_cndmask_b32_e32
315 ; GCN: v_cndmask_b32_e32
316 ; GCN: v_cndmask_b32_e32
317 ; GCN: v_cndmask_b32_e32
318 ; GCN: v_cndmask_b32_e32
319 ; GCN: v_cndmask_b32_e32
320 ; GCN: v_cndmask_b32_e32
321 ; GCN: v_cndmask_b32_e32
322 ; GCN: v_cndmask_b32_e32
323 ; GCN: v_cndmask_b32_e32
324 ; GCN: v_cndmask_b32_e32
325 define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) #0 {
326 %cmp = icmp eq i32 %c, 0
327 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
328 store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
; Whole-vector select on <2 x half> (32 bits of data) with both operands
; loaded through addrspace(1) pointers; the condition is %c == 0. The visible
; check looks for one v_cndmask_b32_e32 after the label.
332 ; GCN-LABEL: {{^}}v_select_v2f16:
333 ; GCN: v_cndmask_b32_e32
335 define amdgpu_kernel void @v_select_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %a.ptr, <2 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
336 %a = load <2 x half>, <2 x half> addrspace(1)* %a.ptr
337 %b = load <2 x half>, <2 x half> addrspace(1)* %b.ptr
338 %cmp = icmp eq i32 %c, 0
339 %select = select i1 %cmp, <2 x half> %a, <2 x half> %b
340 store <2 x half> %select, <2 x half> addrspace(1)* %out, align 4
; Whole-vector select on the odd-sized <3 x half> type with both operands
; loaded through addrspace(1) pointers; the condition is %c == 0. The visible
; checks look for two v_cndmask_b32_e32 instructions, in order.
344 ; GCN-LABEL: {{^}}v_select_v3f16:
345 ; GCN: v_cndmask_b32_e32
346 ; GCN: v_cndmask_b32_e32
348 define amdgpu_kernel void @v_select_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %a.ptr, <3 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
349 %a = load <3 x half>, <3 x half> addrspace(1)* %a.ptr
350 %b = load <3 x half>, <3 x half> addrspace(1)* %b.ptr
351 %cmp = icmp eq i32 %c, 0
352 %select = select i1 %cmp, <3 x half> %a, <3 x half> %b
353 store <3 x half> %select, <3 x half> addrspace(1)* %out, align 4
; Whole-vector select on <4 x half> (64 bits of data) with both operands
; loaded through addrspace(1) pointers; the condition is %c == 0. The visible
; checks look for two v_cndmask_b32_e32 instructions, in order.
357 ; GCN-LABEL: {{^}}v_select_v4f16:
358 ; GCN: v_cndmask_b32_e32
359 ; GCN: v_cndmask_b32_e32
361 define amdgpu_kernel void @v_select_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %a.ptr, <4 x half> addrspace(1)* %b.ptr, i32 %c) #0 {
362 %a = load <4 x half>, <4 x half> addrspace(1)* %a.ptr
363 %b = load <4 x half>, <4 x half> addrspace(1)* %b.ptr
364 %cmp = icmp eq i32 %c, 0
365 %select = select i1 %cmp, <4 x half> %a, <4 x half> %b
366 store <4 x half> %select, <4 x half> addrspace(1)* %out, align 4
; Intrinsic declaration and the attribute groups used by this file.
; Group #0 is attached to every kernel definition; group #1 is attached only
; to the intrinsic declaration below.
; NOTE(review): no visible function in this excerpt calls
; @llvm.amdgcn.workitem.id.x -- confirm whether the declaration is still needed.
370 ; Function Attrs: nounwind readnone
371 declare i32 @llvm.amdgcn.workitem.id.x() #1
373 attributes #0 = { nounwind }
374 attributes #1 = { nounwind readnone }