1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ;RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck --check-prefixes=SI %s
3 ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefixes=VI %s
4 ;RUN: llc < %s -mtriple=r600 -mcpu=redwood | FileCheck --check-prefixes=EG %s
; Kernel test for a vector select on <2 x i32>.
; Loads %load0 from %in0 and %load1 from %in1, compares them lane-wise with a
; signed greater-than, and stores to %out a vector holding %val in the lanes
; where the compare is true and %load0 in the remaining lanes.
; The SI and VI check lines expect one s_cmp_gt_i32 / s_cselect_b32 pair per
; lane; the EG check lines expect one SETGT_INT / CNDE_INT pair per lane.
6 define amdgpu_kernel void @test_select_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <2 x i32> %val) {
7 ; SI-LABEL: test_select_v2i32:
8 ; SI: ; %bb.0: ; %entry
9 ; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
10 ; SI-NEXT: s_waitcnt lgkmcnt(0)
11 ; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
12 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
13 ; SI-NEXT: s_mov_b32 s3, 0xf000
14 ; SI-NEXT: s_waitcnt lgkmcnt(0)
15 ; SI-NEXT: s_cmp_gt_i32 s9, s5
16 ; SI-NEXT: s_cselect_b32 s5, s7, s9
17 ; SI-NEXT: s_cmp_gt_i32 s8, s4
18 ; SI-NEXT: s_cselect_b32 s4, s6, s8
19 ; SI-NEXT: s_mov_b32 s2, -1
20 ; SI-NEXT: v_mov_b32_e32 v1, s5
21 ; SI-NEXT: v_mov_b32_e32 v0, s4
22 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
25 ; VI-LABEL: test_select_v2i32:
26 ; VI: ; %bb.0: ; %entry
27 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
28 ; VI-NEXT: s_waitcnt lgkmcnt(0)
29 ; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
30 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
31 ; VI-NEXT: s_mov_b32 s3, 0xf000
32 ; VI-NEXT: s_mov_b32 s2, -1
33 ; VI-NEXT: s_waitcnt lgkmcnt(0)
34 ; VI-NEXT: s_cmp_gt_i32 s9, s5
35 ; VI-NEXT: s_cselect_b32 s5, s7, s9
36 ; VI-NEXT: s_cmp_gt_i32 s8, s4
37 ; VI-NEXT: s_cselect_b32 s4, s6, s8
38 ; VI-NEXT: v_mov_b32_e32 v0, s4
39 ; VI-NEXT: v_mov_b32_e32 v1, s5
40 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
43 ; EG-LABEL: test_select_v2i32:
44 ; EG: ; %bb.0: ; %entry
45 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
47 ; EG-NEXT: ALU 5, @12, KC0[CB0:0-32], KC1[]
48 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
51 ; EG-NEXT: Fetch clause starting at 6:
52 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
53 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
54 ; EG-NEXT: ALU clause starting at 10:
55 ; EG-NEXT: MOV T0.X, KC0[2].Z,
56 ; EG-NEXT: MOV * T1.X, KC0[2].W,
57 ; EG-NEXT: ALU clause starting at 12:
58 ; EG-NEXT: SETGT_INT * T0.W, T0.Y, T1.Y,
59 ; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
60 ; EG-NEXT: SETGT_INT * T0.W, T0.X, T1.X,
61 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
62 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
63 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
65 %load0 = load <2 x i32>, ptr addrspace(1) %in0
66 %load1 = load <2 x i32>, ptr addrspace(1) %in1
; Lane-wise signed compare: true where %load0 > %load1.
67 %cmp = icmp sgt <2 x i32> %load0, %load1
; True lanes take the kernel argument %val; false lanes keep %load0.
68 %result = select <2 x i1> %cmp, <2 x i32> %val, <2 x i32> %load0
69 store <2 x i32> %result, ptr addrspace(1) %out
; Kernel test for a vector select on <2 x float>.
; Loads %0 from %in0 and %1 from %in1, compares them lane-wise with
; "fcmp une" (unordered or not equal), and stores to %out a vector holding %0
; in the lanes where the compare is true and %1 in the remaining lanes.
; The SI and VI check lines expect one v_cmp_neq_f32_e32 / v_cndmask_b32_e32
; pair per lane; the EG check lines expect one SETNE_DX10 / CNDE_INT pair per
; lane.
73 define amdgpu_kernel void @test_select_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
74 ; SI-LABEL: test_select_v2f32:
75 ; SI: ; %bb.0: ; %entry
76 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
77 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
78 ; SI-NEXT: s_waitcnt lgkmcnt(0)
79 ; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
80 ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
81 ; SI-NEXT: s_mov_b32 s3, 0xf000
82 ; SI-NEXT: s_mov_b32 s2, -1
83 ; SI-NEXT: s_waitcnt lgkmcnt(0)
84 ; SI-NEXT: v_mov_b32_e32 v0, s4
85 ; SI-NEXT: v_mov_b32_e32 v1, s5
86 ; SI-NEXT: v_mov_b32_e32 v2, s7
87 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v1
88 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
89 ; SI-NEXT: v_mov_b32_e32 v2, s6
90 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0
91 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
92 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
95 ; VI-LABEL: test_select_v2f32:
96 ; VI: ; %bb.0: ; %entry
97 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
98 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
99 ; VI-NEXT: s_mov_b32 s7, 0xf000
100 ; VI-NEXT: s_mov_b32 s6, -1
101 ; VI-NEXT: s_waitcnt lgkmcnt(0)
102 ; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
103 ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
104 ; VI-NEXT: s_mov_b32 s4, s0
105 ; VI-NEXT: s_mov_b32 s5, s1
106 ; VI-NEXT: s_waitcnt lgkmcnt(0)
107 ; VI-NEXT: v_mov_b32_e32 v1, s9
108 ; VI-NEXT: v_mov_b32_e32 v0, s8
109 ; VI-NEXT: v_mov_b32_e32 v2, s3
110 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v1
111 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
112 ; VI-NEXT: v_mov_b32_e32 v2, s2
113 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0
114 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
115 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
118 ; EG-LABEL: test_select_v2f32:
119 ; EG: ; %bb.0: ; %entry
120 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
122 ; EG-NEXT: ALU 5, @12, KC0[CB0:0-32], KC1[]
123 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
126 ; EG-NEXT: Fetch clause starting at 6:
127 ; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1
128 ; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
129 ; EG-NEXT: ALU clause starting at 10:
130 ; EG-NEXT: MOV T0.X, KC0[2].Z,
131 ; EG-NEXT: MOV * T1.X, KC0[2].W,
132 ; EG-NEXT: ALU clause starting at 12:
133 ; EG-NEXT: SETNE_DX10 * T0.W, T0.Y, T1.Y,
134 ; EG-NEXT: CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
135 ; EG-NEXT: SETNE_DX10 * T0.W, T0.X, T1.X,
136 ; EG-NEXT: CNDE_INT T0.X, PV.W, T1.X, T0.X,
137 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
138 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
140 %0 = load <2 x float>, ptr addrspace(1) %in0
141 %1 = load <2 x float>, ptr addrspace(1) %in1
; Lane-wise "unordered or not equal" compare of the two loaded vectors.
142 %cmp = fcmp une <2 x float> %0, %1
; True lanes take the value loaded from %in0; false lanes take the one from %in1.
143 %result = select <2 x i1> %cmp, <2 x float> %0, <2 x float> %1
144 store <2 x float> %result, ptr addrspace(1) %out
; Kernel test for a vector select on <4 x i32>.
; Loads %load0 from %in0 and %load1 from %in1, compares them lane-wise with a
; signed greater-than, and stores to %out a vector holding %val in the lanes
; where the compare is true and %load0 in the remaining lanes.
; The SI and VI check lines expect four s_cmp_gt_i32 / s_cselect_b32 pairs;
; the EG check lines expect four SETGT_INT and four CNDE_INT instructions.
148 define amdgpu_kernel void @test_select_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1, <4 x i32> %val) {
149 ; SI-LABEL: test_select_v4i32:
150 ; SI: ; %bb.0: ; %entry
151 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
152 ; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
153 ; SI-NEXT: s_waitcnt lgkmcnt(0)
154 ; SI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
155 ; SI-NEXT: s_load_dwordx4 s[12:15], s[6:7], 0x0
156 ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x11
157 ; SI-NEXT: s_mov_b32 s3, 0xf000
158 ; SI-NEXT: s_waitcnt lgkmcnt(0)
159 ; SI-NEXT: s_cmp_gt_i32 s10, s14
160 ; SI-NEXT: s_cselect_b32 s6, s6, s10
161 ; SI-NEXT: s_cmp_gt_i32 s9, s13
162 ; SI-NEXT: s_cselect_b32 s5, s5, s9
163 ; SI-NEXT: s_cmp_gt_i32 s11, s15
164 ; SI-NEXT: s_cselect_b32 s7, s7, s11
165 ; SI-NEXT: s_cmp_gt_i32 s8, s12
166 ; SI-NEXT: s_cselect_b32 s4, s4, s8
167 ; SI-NEXT: s_mov_b32 s2, -1
168 ; SI-NEXT: v_mov_b32_e32 v2, s6
169 ; SI-NEXT: v_mov_b32_e32 v1, s5
170 ; SI-NEXT: v_mov_b32_e32 v3, s7
171 ; SI-NEXT: v_mov_b32_e32 v0, s4
172 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
175 ; VI-LABEL: test_select_v4i32:
176 ; VI: ; %bb.0: ; %entry
177 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
178 ; VI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x34
179 ; VI-NEXT: s_mov_b32 s7, 0xf000
180 ; VI-NEXT: s_mov_b32 s6, -1
181 ; VI-NEXT: s_waitcnt lgkmcnt(0)
182 ; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
183 ; VI-NEXT: s_load_dwordx4 s[12:15], s[12:13], 0x0
184 ; VI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x44
185 ; VI-NEXT: s_mov_b32 s4, s0
186 ; VI-NEXT: s_mov_b32 s5, s1
187 ; VI-NEXT: s_waitcnt lgkmcnt(0)
188 ; VI-NEXT: s_cmp_gt_i32 s10, s14
189 ; VI-NEXT: s_cselect_b32 s0, s18, s10
190 ; VI-NEXT: s_cmp_gt_i32 s9, s13
191 ; VI-NEXT: s_cselect_b32 s1, s17, s9
192 ; VI-NEXT: s_cmp_gt_i32 s11, s15
193 ; VI-NEXT: s_cselect_b32 s2, s19, s11
194 ; VI-NEXT: s_cmp_gt_i32 s8, s12
195 ; VI-NEXT: s_cselect_b32 s3, s16, s8
196 ; VI-NEXT: v_mov_b32_e32 v0, s3
197 ; VI-NEXT: v_mov_b32_e32 v1, s1
198 ; VI-NEXT: v_mov_b32_e32 v2, s0
199 ; VI-NEXT: v_mov_b32_e32 v3, s2
200 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
203 ; EG-LABEL: test_select_v4i32:
204 ; EG: ; %bb.0: ; %entry
205 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
207 ; EG-NEXT: ALU 9, @12, KC0[CB0:0-32], KC1[]
208 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
211 ; EG-NEXT: Fetch clause starting at 6:
212 ; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
213 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
214 ; EG-NEXT: ALU clause starting at 10:
215 ; EG-NEXT: MOV T0.X, KC0[2].Z,
216 ; EG-NEXT: MOV * T1.X, KC0[2].W,
217 ; EG-NEXT: ALU clause starting at 12:
218 ; EG-NEXT: SETGT_INT T1.W, T0.W, T1.W,
219 ; EG-NEXT: SETGT_INT * T2.W, T0.Z, T1.Z,
220 ; EG-NEXT: CNDE_INT * T0.W, PV.W, T0.W, KC0[4].X,
221 ; EG-NEXT: CNDE_INT T0.Z, T2.W, T0.Z, KC0[3].W,
222 ; EG-NEXT: SETGT_INT * T1.W, T0.Y, T1.Y,
223 ; EG-NEXT: CNDE_INT T0.Y, PV.W, T0.Y, KC0[3].Z,
224 ; EG-NEXT: SETGT_INT * T1.W, T0.X, T1.X,
225 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, KC0[3].Y,
226 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
227 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
229 %load0 = load <4 x i32>, ptr addrspace(1) %in0
230 %load1 = load <4 x i32>, ptr addrspace(1) %in1
; Lane-wise signed compare: true where %load0 > %load1.
231 %cmp = icmp sgt <4 x i32> %load0, %load1
; True lanes take the kernel argument %val; false lanes keep %load0.
232 %result = select <4 x i1> %cmp, <4 x i32> %val, <4 x i32> %load0
233 store <4 x i32> %result, ptr addrspace(1) %out
; Kernel test for a vector select on <4 x float>.
; Loads %0 from %in0 and %1 from %in1, compares them lane-wise with
; "fcmp une" (unordered or not equal), and stores to %out a vector holding %0
; in the lanes where the compare is true and %1 in the remaining lanes.
; The SI and VI check lines expect four v_cmp_neq_f32_e32 / v_cndmask_b32_e32
; pairs; the EG check lines expect four SETNE_DX10 and four CNDE_INT
; instructions.
237 define amdgpu_kernel void @test_select_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
238 ; SI-LABEL: test_select_v4f32:
239 ; SI: ; %bb.0: ; %entry
240 ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
241 ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
242 ; SI-NEXT: s_waitcnt lgkmcnt(0)
243 ; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
244 ; SI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
245 ; SI-NEXT: s_mov_b32 s3, 0xf000
246 ; SI-NEXT: s_mov_b32 s2, -1
247 ; SI-NEXT: s_waitcnt lgkmcnt(0)
248 ; SI-NEXT: v_mov_b32_e32 v0, s8
249 ; SI-NEXT: v_mov_b32_e32 v1, s9
250 ; SI-NEXT: v_mov_b32_e32 v2, s10
251 ; SI-NEXT: v_mov_b32_e32 v3, s11
252 ; SI-NEXT: v_mov_b32_e32 v4, s7
253 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, s7, v3
254 ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
255 ; SI-NEXT: v_mov_b32_e32 v4, s6
256 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v2
257 ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
258 ; SI-NEXT: v_mov_b32_e32 v4, s5
259 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, s5, v1
260 ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
261 ; SI-NEXT: v_mov_b32_e32 v4, s4
262 ; SI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
263 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
264 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
267 ; VI-LABEL: test_select_v4f32:
268 ; VI: ; %bb.0: ; %entry
269 ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
270 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
271 ; VI-NEXT: s_mov_b32 s7, 0xf000
272 ; VI-NEXT: s_mov_b32 s6, -1
273 ; VI-NEXT: s_waitcnt lgkmcnt(0)
274 ; VI-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
275 ; VI-NEXT: s_mov_b32 s4, s0
276 ; VI-NEXT: s_mov_b32 s5, s1
277 ; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
278 ; VI-NEXT: s_waitcnt lgkmcnt(0)
279 ; VI-NEXT: v_mov_b32_e32 v3, s11
280 ; VI-NEXT: v_mov_b32_e32 v2, s10
281 ; VI-NEXT: v_mov_b32_e32 v1, s9
282 ; VI-NEXT: v_mov_b32_e32 v4, s3
283 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s3, v3
284 ; VI-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
285 ; VI-NEXT: v_mov_b32_e32 v4, s2
286 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v2
287 ; VI-NEXT: v_mov_b32_e32 v0, s8
288 ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
289 ; VI-NEXT: v_mov_b32_e32 v4, s1
290 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s1, v1
291 ; VI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
292 ; VI-NEXT: v_mov_b32_e32 v4, s0
293 ; VI-NEXT: v_cmp_neq_f32_e32 vcc, s0, v0
294 ; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
295 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
298 ; EG-LABEL: test_select_v4f32:
299 ; EG: ; %bb.0: ; %entry
300 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
302 ; EG-NEXT: ALU 9, @12, KC0[CB0:0-32], KC1[]
303 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
306 ; EG-NEXT: Fetch clause starting at 6:
307 ; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
308 ; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
309 ; EG-NEXT: ALU clause starting at 10:
310 ; EG-NEXT: MOV T0.X, KC0[2].Z,
311 ; EG-NEXT: MOV * T1.X, KC0[2].W,
312 ; EG-NEXT: ALU clause starting at 12:
313 ; EG-NEXT: SETNE_DX10 T2.W, T0.W, T1.W,
314 ; EG-NEXT: SETNE_DX10 * T3.W, T0.Z, T1.Z,
315 ; EG-NEXT: CNDE_INT * T0.W, PV.W, T1.W, T0.W,
316 ; EG-NEXT: CNDE_INT T0.Z, T3.W, T1.Z, T0.Z,
317 ; EG-NEXT: SETNE_DX10 * T1.W, T0.Y, T1.Y,
318 ; EG-NEXT: CNDE_INT T0.Y, PV.W, T1.Y, T0.Y,
319 ; EG-NEXT: SETNE_DX10 * T1.W, T0.X, T1.X,
320 ; EG-NEXT: CNDE_INT T0.X, PV.W, T1.X, T0.X,
321 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
322 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
324 %0 = load <4 x float>, ptr addrspace(1) %in0
325 %1 = load <4 x float>, ptr addrspace(1) %in1
; Lane-wise "unordered or not equal" compare of the two loaded vectors.
326 %cmp = fcmp une <4 x float> %0, %1
; True lanes take the value loaded from %in0; false lanes take the one from %in1.
327 %result = select <4 x i1> %cmp, <4 x float> %0, <4 x float> %1
328 store <4 x float> %result, ptr addrspace(1) %out