1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2 ; RUN: llc -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=CI %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
9 define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
10 ; EG-LABEL: v_test_imin_sle_i32:
12 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
14 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
15 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
18 ; EG-NEXT: Fetch clause starting at 6:
19 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
20 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
21 ; EG-NEXT: ALU clause starting at 10:
22 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
23 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
24 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
25 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
26 ; EG-NEXT: ALU clause starting at 14:
27 ; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
28 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
29 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
30 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
32 ; CI-LABEL: v_test_imin_sle_i32:
34 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
35 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
36 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
37 ; CI-NEXT: s_waitcnt lgkmcnt(0)
38 ; CI-NEXT: v_mov_b32_e32 v1, s3
39 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
40 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
41 ; CI-NEXT: v_mov_b32_e32 v3, s5
42 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
43 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
44 ; CI-NEXT: flat_load_dword v5, v[0:1]
45 ; CI-NEXT: flat_load_dword v2, v[2:3]
46 ; CI-NEXT: v_mov_b32_e32 v1, s1
47 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
48 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
49 ; CI-NEXT: s_waitcnt vmcnt(0)
50 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
51 ; CI-NEXT: flat_store_dword v[0:1], v2
54 ; VI-LABEL: v_test_imin_sle_i32:
56 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
57 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
58 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
59 ; VI-NEXT: s_waitcnt lgkmcnt(0)
60 ; VI-NEXT: v_mov_b32_e32 v1, s3
61 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
62 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
63 ; VI-NEXT: v_mov_b32_e32 v3, s5
64 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
65 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
66 ; VI-NEXT: flat_load_dword v5, v[0:1]
67 ; VI-NEXT: flat_load_dword v2, v[2:3]
68 ; VI-NEXT: v_mov_b32_e32 v1, s1
69 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
70 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
71 ; VI-NEXT: s_waitcnt vmcnt(0)
72 ; VI-NEXT: v_min_i32_e32 v2, v5, v2
73 ; VI-NEXT: flat_store_dword v[0:1], v2
76 ; GFX9-LABEL: v_test_imin_sle_i32:
78 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
79 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
80 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
81 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
83 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
84 ; GFX9-NEXT: s_waitcnt vmcnt(0)
85 ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
86 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
89 ; GFX10-LABEL: v_test_imin_sle_i32:
91 ; GFX10-NEXT: s_clause 0x1
92 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
93 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
94 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
95 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
96 ; GFX10-NEXT: s_clause 0x1
97 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
98 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
99 ; GFX10-NEXT: s_waitcnt vmcnt(0)
100 ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
101 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
102 ; GFX10-NEXT: s_endpgm
104 ; GFX11-LABEL: v_test_imin_sle_i32:
106 ; GFX11-NEXT: s_clause 0x1
107 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
108 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
109 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
110 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
111 ; GFX11-NEXT: s_clause 0x1
112 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
113 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
114 ; GFX11-NEXT: s_waitcnt vmcnt(0)
115 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
116 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
117 ; GFX11-NEXT: s_nop 0
118 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
119 ; GFX11-NEXT: s_endpgm
120 %tid = call i32 @llvm.amdgcn.workitem.id.x()
121 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
122 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
123 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
124 %a = load i32, ptr addrspace(1) %a.gep, align 4
125 %b = load i32, ptr addrspace(1) %b.gep, align 4
126 %cmp = icmp sle i32 %a, %b
127 %val = select i1 %cmp, i32 %a, i32 %b
128 store i32 %val, ptr addrspace(1) %out.gep, align 4
132 define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
133 ; EG-LABEL: s_test_imin_sle_i32:
135 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
136 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
139 ; EG-NEXT: ALU clause starting at 4:
140 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
141 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
142 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
144 ; CI-LABEL: s_test_imin_sle_i32:
146 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
147 ; CI-NEXT: s_waitcnt lgkmcnt(0)
148 ; CI-NEXT: s_min_i32 s2, s2, s3
149 ; CI-NEXT: v_mov_b32_e32 v0, s0
150 ; CI-NEXT: v_mov_b32_e32 v1, s1
151 ; CI-NEXT: v_mov_b32_e32 v2, s2
152 ; CI-NEXT: flat_store_dword v[0:1], v2
155 ; VI-LABEL: s_test_imin_sle_i32:
157 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
158 ; VI-NEXT: s_waitcnt lgkmcnt(0)
159 ; VI-NEXT: s_min_i32 s2, s2, s3
160 ; VI-NEXT: v_mov_b32_e32 v0, s0
161 ; VI-NEXT: v_mov_b32_e32 v1, s1
162 ; VI-NEXT: v_mov_b32_e32 v2, s2
163 ; VI-NEXT: flat_store_dword v[0:1], v2
166 ; GFX9-LABEL: s_test_imin_sle_i32:
168 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
169 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
170 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
171 ; GFX9-NEXT: s_min_i32 s2, s2, s3
172 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
173 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
174 ; GFX9-NEXT: s_endpgm
176 ; GFX10-LABEL: s_test_imin_sle_i32:
178 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
179 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
180 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX10-NEXT: s_min_i32 s2, s2, s3
182 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
183 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
184 ; GFX10-NEXT: s_endpgm
186 ; GFX11-LABEL: s_test_imin_sle_i32:
188 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
189 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
190 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
191 ; GFX11-NEXT: s_min_i32 s2, s2, s3
192 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
193 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
194 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
195 ; GFX11-NEXT: s_nop 0
196 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
197 ; GFX11-NEXT: s_endpgm
198 %cmp = icmp sle i32 %a, %b
199 %val = select i1 %cmp, i32 %a, i32 %b
200 store i32 %val, ptr addrspace(1) %out, align 4
204 define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
205 ; EG-LABEL: s_test_imin_sle_v1i32:
207 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
208 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
211 ; EG-NEXT: ALU clause starting at 4:
212 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
213 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
214 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
216 ; CI-LABEL: s_test_imin_sle_v1i32:
218 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
219 ; CI-NEXT: s_waitcnt lgkmcnt(0)
220 ; CI-NEXT: s_min_i32 s2, s2, s3
221 ; CI-NEXT: v_mov_b32_e32 v0, s0
222 ; CI-NEXT: v_mov_b32_e32 v1, s1
223 ; CI-NEXT: v_mov_b32_e32 v2, s2
224 ; CI-NEXT: flat_store_dword v[0:1], v2
227 ; VI-LABEL: s_test_imin_sle_v1i32:
229 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
230 ; VI-NEXT: s_waitcnt lgkmcnt(0)
231 ; VI-NEXT: s_min_i32 s2, s2, s3
232 ; VI-NEXT: v_mov_b32_e32 v0, s0
233 ; VI-NEXT: v_mov_b32_e32 v1, s1
234 ; VI-NEXT: v_mov_b32_e32 v2, s2
235 ; VI-NEXT: flat_store_dword v[0:1], v2
238 ; GFX9-LABEL: s_test_imin_sle_v1i32:
240 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
241 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
242 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
243 ; GFX9-NEXT: s_min_i32 s2, s2, s3
244 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
245 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
246 ; GFX9-NEXT: s_endpgm
248 ; GFX10-LABEL: s_test_imin_sle_v1i32:
250 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
251 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
252 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX10-NEXT: s_min_i32 s2, s2, s3
254 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
255 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
256 ; GFX10-NEXT: s_endpgm
258 ; GFX11-LABEL: s_test_imin_sle_v1i32:
260 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
261 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
262 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX11-NEXT: s_min_i32 s2, s2, s3
264 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
265 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
266 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
267 ; GFX11-NEXT: s_nop 0
268 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
269 ; GFX11-NEXT: s_endpgm
270 %cmp = icmp sle <1 x i32> %a, %b
271 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
272 store <1 x i32> %val, ptr addrspace(1) %out
276 define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b) #0 {
277 ; EG-LABEL: s_test_imin_sle_v4i32:
279 ; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
280 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
283 ; EG-NEXT: ALU clause starting at 4:
284 ; EG-NEXT: MIN_INT * T0.W, KC0[4].X, KC0[5].X,
285 ; EG-NEXT: MIN_INT * T0.Z, KC0[3].W, KC0[4].W,
286 ; EG-NEXT: MIN_INT * T0.Y, KC0[3].Z, KC0[4].Z,
287 ; EG-NEXT: MIN_INT * T0.X, KC0[3].Y, KC0[4].Y,
288 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
289 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
291 ; CI-LABEL: s_test_imin_sle_v4i32:
293 ; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4
294 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
295 ; CI-NEXT: s_waitcnt lgkmcnt(0)
296 ; CI-NEXT: s_min_i32 s2, s11, s15
297 ; CI-NEXT: s_min_i32 s3, s10, s14
298 ; CI-NEXT: s_min_i32 s4, s9, s13
299 ; CI-NEXT: s_min_i32 s5, s8, s12
300 ; CI-NEXT: v_mov_b32_e32 v5, s1
301 ; CI-NEXT: v_mov_b32_e32 v0, s5
302 ; CI-NEXT: v_mov_b32_e32 v1, s4
303 ; CI-NEXT: v_mov_b32_e32 v2, s3
304 ; CI-NEXT: v_mov_b32_e32 v3, s2
305 ; CI-NEXT: v_mov_b32_e32 v4, s0
306 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
309 ; VI-LABEL: s_test_imin_sle_v4i32:
311 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
312 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
313 ; VI-NEXT: s_waitcnt lgkmcnt(0)
314 ; VI-NEXT: s_min_i32 s2, s11, s15
315 ; VI-NEXT: s_min_i32 s3, s10, s14
316 ; VI-NEXT: s_min_i32 s4, s9, s13
317 ; VI-NEXT: s_min_i32 s5, s8, s12
318 ; VI-NEXT: v_mov_b32_e32 v5, s1
319 ; VI-NEXT: v_mov_b32_e32 v0, s5
320 ; VI-NEXT: v_mov_b32_e32 v1, s4
321 ; VI-NEXT: v_mov_b32_e32 v2, s3
322 ; VI-NEXT: v_mov_b32_e32 v3, s2
323 ; VI-NEXT: v_mov_b32_e32 v4, s0
324 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
327 ; GFX9-LABEL: s_test_imin_sle_v4i32:
329 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
330 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
331 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
332 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
333 ; GFX9-NEXT: s_min_i32 s2, s11, s15
334 ; GFX9-NEXT: s_min_i32 s3, s10, s14
335 ; GFX9-NEXT: s_min_i32 s4, s9, s13
336 ; GFX9-NEXT: s_min_i32 s5, s8, s12
337 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
338 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
339 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
340 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
341 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
342 ; GFX9-NEXT: s_endpgm
344 ; GFX10-LABEL: s_test_imin_sle_v4i32:
346 ; GFX10-NEXT: s_clause 0x1
347 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
348 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
349 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
350 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
351 ; GFX10-NEXT: s_min_i32 s2, s11, s15
352 ; GFX10-NEXT: s_min_i32 s3, s10, s14
353 ; GFX10-NEXT: s_min_i32 s4, s8, s12
354 ; GFX10-NEXT: s_min_i32 s5, s9, s13
355 ; GFX10-NEXT: v_mov_b32_e32 v0, s4
356 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
357 ; GFX10-NEXT: v_mov_b32_e32 v2, s3
358 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
359 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
360 ; GFX10-NEXT: s_endpgm
362 ; GFX11-LABEL: s_test_imin_sle_v4i32:
364 ; GFX11-NEXT: s_clause 0x1
365 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10
366 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
367 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
368 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
369 ; GFX11-NEXT: s_min_i32 s2, s7, s11
370 ; GFX11-NEXT: s_min_i32 s3, s6, s10
371 ; GFX11-NEXT: s_min_i32 s4, s4, s8
372 ; GFX11-NEXT: s_min_i32 s5, s5, s9
373 ; GFX11-NEXT: v_mov_b32_e32 v0, s4
374 ; GFX11-NEXT: v_mov_b32_e32 v1, s5
375 ; GFX11-NEXT: v_mov_b32_e32 v2, s3
376 ; GFX11-NEXT: v_mov_b32_e32 v3, s2
377 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
378 ; GFX11-NEXT: s_nop 0
379 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
380 ; GFX11-NEXT: s_endpgm
381 %cmp = icmp sle <4 x i32> %a, %b
382 %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
383 store <4 x i32> %val, ptr addrspace(1) %out
387 define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
388 ; EG-LABEL: s_test_imin_sle_i8:
390 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
392 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
393 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
396 ; EG-NEXT: Fetch clause starting at 6:
397 ; EG-NEXT: VTX_READ_8 T1.X, T0.X, 72, #3
398 ; EG-NEXT: VTX_READ_8 T0.X, T0.X, 108, #3
399 ; EG-NEXT: ALU clause starting at 10:
400 ; EG-NEXT: MOV * T0.X, 0.0,
401 ; EG-NEXT: ALU clause starting at 11:
402 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
403 ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
404 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
405 ; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
406 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
407 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
408 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
409 ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
410 ; EG-NEXT: LSHL T0.X, PV.W, PS,
411 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
412 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
413 ; EG-NEXT: MOV T0.Y, 0.0,
414 ; EG-NEXT: MOV * T0.Z, 0.0,
415 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
416 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
418 ; CI-LABEL: s_test_imin_sle_i8:
420 ; CI-NEXT: s_load_dword s2, s[4:5], 0xa
421 ; CI-NEXT: s_load_dword s3, s[4:5], 0x13
422 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
423 ; CI-NEXT: s_waitcnt lgkmcnt(0)
424 ; CI-NEXT: s_sext_i32_i8 s2, s2
425 ; CI-NEXT: s_sext_i32_i8 s3, s3
426 ; CI-NEXT: s_min_i32 s2, s2, s3
427 ; CI-NEXT: v_mov_b32_e32 v0, s0
428 ; CI-NEXT: v_mov_b32_e32 v1, s1
429 ; CI-NEXT: v_mov_b32_e32 v2, s2
430 ; CI-NEXT: flat_store_byte v[0:1], v2
433 ; VI-LABEL: s_test_imin_sle_i8:
435 ; VI-NEXT: s_load_dword s2, s[4:5], 0x28
436 ; VI-NEXT: s_load_dword s3, s[4:5], 0x4c
437 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
438 ; VI-NEXT: s_waitcnt lgkmcnt(0)
439 ; VI-NEXT: s_sext_i32_i8 s2, s2
440 ; VI-NEXT: s_sext_i32_i8 s3, s3
441 ; VI-NEXT: s_min_i32 s2, s2, s3
442 ; VI-NEXT: v_mov_b32_e32 v0, s0
443 ; VI-NEXT: v_mov_b32_e32 v1, s1
444 ; VI-NEXT: v_mov_b32_e32 v2, s2
445 ; VI-NEXT: flat_store_byte v[0:1], v2
448 ; GFX9-LABEL: s_test_imin_sle_i8:
450 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28
451 ; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c
452 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
453 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
454 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
455 ; GFX9-NEXT: s_sext_i32_i8 s2, s2
456 ; GFX9-NEXT: s_sext_i32_i8 s3, s3
457 ; GFX9-NEXT: s_min_i32 s2, s2, s3
458 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
459 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
460 ; GFX9-NEXT: s_endpgm
462 ; GFX10-LABEL: s_test_imin_sle_i8:
464 ; GFX10-NEXT: s_clause 0x2
465 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28
466 ; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c
467 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
468 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
469 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
470 ; GFX10-NEXT: s_sext_i32_i8 s2, s2
471 ; GFX10-NEXT: s_sext_i32_i8 s3, s3
472 ; GFX10-NEXT: s_min_i32 s2, s2, s3
473 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
474 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
475 ; GFX10-NEXT: s_endpgm
477 ; GFX11-LABEL: s_test_imin_sle_i8:
479 ; GFX11-NEXT: s_clause 0x2
480 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28
481 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c
482 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
483 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
484 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
485 ; GFX11-NEXT: s_sext_i32_i8 s2, s2
486 ; GFX11-NEXT: s_sext_i32_i8 s3, s3
487 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
488 ; GFX11-NEXT: s_min_i32 s2, s2, s3
489 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
490 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1]
491 ; GFX11-NEXT: s_nop 0
492 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
493 ; GFX11-NEXT: s_endpgm
494 %cmp = icmp sle i8 %a, %b
495 %val = select i1 %cmp, i8 %a, i8 %b
496 store i8 %val, ptr addrspace(1) %out
500 ; FIXME: Why vector and sdwa for last element?
502 define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
503 ; EG-LABEL: s_test_imin_sle_v4i8:
505 ; EG-NEXT: ALU 0, @22, KC0[], KC1[]
507 ; EG-NEXT: ALU 30, @23, KC0[CB0:0-32], KC1[]
508 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
511 ; EG-NEXT: Fetch clause starting at 6:
512 ; EG-NEXT: VTX_READ_8 T5.X, T4.X, 74, #3
513 ; EG-NEXT: VTX_READ_8 T6.X, T4.X, 108, #3
514 ; EG-NEXT: VTX_READ_8 T7.X, T4.X, 72, #3
515 ; EG-NEXT: VTX_READ_8 T8.X, T4.X, 111, #3
516 ; EG-NEXT: VTX_READ_8 T9.X, T4.X, 75, #3
517 ; EG-NEXT: VTX_READ_8 T10.X, T4.X, 109, #3
518 ; EG-NEXT: VTX_READ_8 T11.X, T4.X, 73, #3
519 ; EG-NEXT: VTX_READ_8 T4.X, T4.X, 110, #3
520 ; EG-NEXT: ALU clause starting at 22:
521 ; EG-NEXT: MOV * T4.X, 0.0,
522 ; EG-NEXT: ALU clause starting at 23:
523 ; EG-NEXT: BFE_INT T0.Z, T5.X, 0.0, literal.x,
524 ; EG-NEXT: BFE_INT * T0.W, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
525 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
526 ; EG-NEXT: BFE_INT T4.X, T11.X, 0.0, literal.x,
527 ; EG-NEXT: BFE_INT T0.Y, T10.X, 0.0, literal.x, BS:VEC_120/SCL_212
528 ; EG-NEXT: BFE_INT * T1.Z, T9.X, 0.0, literal.x, BS:VEC_201
529 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
530 ; EG-NEXT: BFE_INT T1.W, T8.X, 0.0, literal.x,
531 ; EG-NEXT: MIN_INT * T0.W, T0.Z, T0.W,
532 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
533 ; EG-NEXT: MIN_INT T0.Z, T1.Z, PV.W,
534 ; EG-NEXT: AND_INT T0.W, PS, literal.x,
535 ; EG-NEXT: MIN_INT * T1.W, T4.X, T0.Y,
536 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
537 ; EG-NEXT: AND_INT T4.X, PS, literal.x,
538 ; EG-NEXT: LSHL T0.Y, PV.W, literal.y,
539 ; EG-NEXT: BFE_INT T1.Z, T7.X, 0.0, literal.z,
540 ; EG-NEXT: BFE_INT T0.W, T6.X, 0.0, literal.z, BS:VEC_120/SCL_212
541 ; EG-NEXT: LSHL * T1.W, PV.Z, literal.w,
542 ; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
543 ; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
544 ; EG-NEXT: MIN_INT T0.Z, PV.Z, PV.W,
545 ; EG-NEXT: OR_INT T0.W, PS, PV.Y,
546 ; EG-NEXT: LSHL * T1.W, PV.X, literal.x,
547 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
548 ; EG-NEXT: OR_INT T0.W, PV.W, PS,
549 ; EG-NEXT: AND_INT * T1.W, PV.Z, literal.x,
550 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
551 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
552 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
553 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
555 ; CI-LABEL: s_test_imin_sle_v4i8:
557 ; CI-NEXT: s_load_dword s2, s[4:5], 0xa
558 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
559 ; CI-NEXT: s_load_dword s3, s[4:5], 0x13
560 ; CI-NEXT: s_waitcnt lgkmcnt(0)
561 ; CI-NEXT: s_ashr_i32 s4, s2, 24
562 ; CI-NEXT: s_sext_i32_i8 s5, s2
563 ; CI-NEXT: s_bfe_i32 s6, s2, 0x80008
564 ; CI-NEXT: s_bfe_i32 s2, s2, 0x80010
565 ; CI-NEXT: s_ashr_i32 s7, s3, 24
566 ; CI-NEXT: s_sext_i32_i8 s8, s3
567 ; CI-NEXT: s_bfe_i32 s9, s3, 0x80008
568 ; CI-NEXT: s_bfe_i32 s3, s3, 0x80010
569 ; CI-NEXT: s_min_i32 s2, s2, s3
570 ; CI-NEXT: s_min_i32 s4, s4, s7
571 ; CI-NEXT: s_and_b32 s2, s2, 0xff
572 ; CI-NEXT: s_lshl_b32 s4, s4, 24
573 ; CI-NEXT: s_lshl_b32 s2, s2, 16
574 ; CI-NEXT: s_or_b32 s2, s4, s2
575 ; CI-NEXT: s_min_i32 s3, s6, s9
576 ; CI-NEXT: s_min_i32 s4, s5, s8
577 ; CI-NEXT: s_lshl_b32 s3, s3, 8
578 ; CI-NEXT: s_and_b32 s4, s4, 0xff
579 ; CI-NEXT: s_or_b32 s3, s4, s3
580 ; CI-NEXT: s_and_b32 s3, s3, 0xffff
581 ; CI-NEXT: s_or_b32 s2, s3, s2
582 ; CI-NEXT: v_mov_b32_e32 v0, s0
583 ; CI-NEXT: v_mov_b32_e32 v1, s1
584 ; CI-NEXT: v_mov_b32_e32 v2, s2
585 ; CI-NEXT: flat_store_dword v[0:1], v2
588 ; VI-LABEL: s_test_imin_sle_v4i8:
590 ; VI-NEXT: s_load_dword s2, s[4:5], 0x28
591 ; VI-NEXT: s_load_dword s3, s[4:5], 0x4c
592 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
593 ; VI-NEXT: s_waitcnt lgkmcnt(0)
594 ; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2
595 ; VI-NEXT: v_lshrrev_b16_e64 v1, 8, s3
596 ; VI-NEXT: s_ashr_i32 s4, s2, 24
597 ; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
598 ; VI-NEXT: s_sext_i32_i8 s2, s2
599 ; VI-NEXT: s_ashr_i32 s6, s3, 24
600 ; VI-NEXT: s_bfe_i32 s7, s3, 0x80010
601 ; VI-NEXT: s_sext_i32_i8 s3, s3
602 ; VI-NEXT: s_min_i32 s4, s4, s6
603 ; VI-NEXT: s_min_i32 s2, s2, s3
604 ; VI-NEXT: s_min_i32 s3, s5, s7
605 ; VI-NEXT: v_min_i32_sdwa v0, sext(v0), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
606 ; VI-NEXT: v_lshlrev_b16_e64 v1, 8, s4
607 ; VI-NEXT: v_mov_b32_e32 v2, s3
608 ; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
609 ; VI-NEXT: v_mov_b32_e32 v2, s2
610 ; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
611 ; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
612 ; VI-NEXT: v_mov_b32_e32 v0, s0
613 ; VI-NEXT: v_mov_b32_e32 v1, s1
614 ; VI-NEXT: flat_store_dword v[0:1], v2
617 ; GFX9-LABEL: s_test_imin_sle_v4i8:
619 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28
620 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
621 ; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c
622 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
623 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
624 ; GFX9-NEXT: s_lshr_b32 s4, s2, 16
625 ; GFX9-NEXT: s_bfe_i32 s6, s4, 0x80000
626 ; GFX9-NEXT: s_lshr_b32 s7, s3, 16
627 ; GFX9-NEXT: s_bfe_i32 s8, s7, 0x80000
628 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
629 ; GFX9-NEXT: v_mov_b32_e32 v2, s8
630 ; GFX9-NEXT: s_bfe_i32 s9, s3, 0x80000
631 ; GFX9-NEXT: v_min_i16_sdwa v1, sext(s4), sext(v1) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
632 ; GFX9-NEXT: v_min_i16_e32 v2, s6, v2
633 ; GFX9-NEXT: s_bfe_i32 s5, s2, 0x80000
634 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
635 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
636 ; GFX9-NEXT: v_mov_b32_e32 v3, s9
637 ; GFX9-NEXT: v_min_i16_sdwa v2, sext(s2), sext(v2) dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
638 ; GFX9-NEXT: v_min_i16_e32 v3, s5, v3
639 ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
640 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
641 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
642 ; GFX9-NEXT: s_endpgm
644 ; GFX10-LABEL: s_test_imin_sle_v4i8:
646 ; GFX10-NEXT: s_clause 0x2
647 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28
648 ; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c
649 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
650 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
651 ; GFX10-NEXT: s_lshr_b32 s4, s2, 16
652 ; GFX10-NEXT: s_lshr_b32 s5, s3, 16
653 ; GFX10-NEXT: v_ashrrev_i16 v0, 8, s2
654 ; GFX10-NEXT: v_ashrrev_i16 v1, 8, s4
655 ; GFX10-NEXT: v_ashrrev_i16 v2, 8, s5
656 ; GFX10-NEXT: v_ashrrev_i16 v3, 8, s3
657 ; GFX10-NEXT: s_bfe_i32 s2, s2, 0x80000
658 ; GFX10-NEXT: s_bfe_i32 s3, s3, 0x80000
659 ; GFX10-NEXT: s_bfe_i32 s4, s4, 0x80000
660 ; GFX10-NEXT: v_min_i16 v1, v1, v2
661 ; GFX10-NEXT: v_min_i16 v0, v0, v3
662 ; GFX10-NEXT: s_bfe_i32 s5, s5, 0x80000
663 ; GFX10-NEXT: v_min_i16 v2, s2, s3
664 ; GFX10-NEXT: v_min_i16 v3, s4, s5
665 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
666 ; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
667 ; GFX10-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
668 ; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
669 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
670 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
671 ; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
672 ; GFX10-NEXT: s_endpgm
674 ; GFX11-LABEL: s_test_imin_sle_v4i8:
676 ; GFX11-NEXT: s_clause 0x2
677 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28
678 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c
679 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
680 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
681 ; GFX11-NEXT: s_lshr_b32 s4, s2, 16
682 ; GFX11-NEXT: s_lshr_b32 s5, s3, 16
683 ; GFX11-NEXT: v_ashrrev_i16 v0, 8, s2
684 ; GFX11-NEXT: v_ashrrev_i16 v1, 8, s3
685 ; GFX11-NEXT: v_ashrrev_i16 v2, 8, s4
686 ; GFX11-NEXT: v_ashrrev_i16 v3, 8, s5
687 ; GFX11-NEXT: s_bfe_i32 s2, s2, 0x80000
688 ; GFX11-NEXT: s_bfe_i32 s3, s3, 0x80000
689 ; GFX11-NEXT: s_bfe_i32 s4, s4, 0x80000
690 ; GFX11-NEXT: s_bfe_i32 s5, s5, 0x80000
691 ; GFX11-NEXT: v_min_i16 v4, s2, s3
692 ; GFX11-NEXT: v_min_i16 v5, s4, s5
693 ; GFX11-NEXT: v_min_i16 v2, v2, v3
694 ; GFX11-NEXT: v_min_i16 v0, v0, v1
695 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
696 ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v4
697 ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v5
698 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
699 ; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
700 ; GFX11-NEXT: v_lshlrev_b16 v0, 8, v0
701 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
702 ; GFX11-NEXT: v_or_b32_e32 v2, v3, v2
703 ; GFX11-NEXT: v_or_b32_e32 v0, v1, v0
704 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
705 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
706 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
707 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
708 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
709 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1
710 ; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
711 ; GFX11-NEXT: s_nop 0
712 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
713 ; GFX11-NEXT: s_endpgm
714 %cmp = icmp sle <4 x i8> %a, %b
715 %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
716 store <4 x i8> %val, ptr addrspace(1) %out
720 define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16> %a, <2 x i16> %b) #0 {
721 ; EG-LABEL: s_test_imin_sle_v2i16:
723 ; EG-NEXT: ALU 0, @14, KC0[], KC1[]
725 ; EG-NEXT: ALU 13, @15, KC0[CB0:0-32], KC1[]
726 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
729 ; EG-NEXT: Fetch clause starting at 6:
730 ; EG-NEXT: VTX_READ_16 T5.X, T4.X, 42, #3
731 ; EG-NEXT: VTX_READ_16 T6.X, T4.X, 44, #3
732 ; EG-NEXT: VTX_READ_16 T7.X, T4.X, 40, #3
733 ; EG-NEXT: VTX_READ_16 T4.X, T4.X, 46, #3
734 ; EG-NEXT: ALU clause starting at 14:
735 ; EG-NEXT: MOV * T4.X, 0.0,
736 ; EG-NEXT: ALU clause starting at 15:
737 ; EG-NEXT: BFE_INT T5.X, T5.X, 0.0, literal.x,
738 ; EG-NEXT: BFE_INT T0.Y, T4.X, 0.0, literal.x, BS:VEC_120/SCL_212
739 ; EG-NEXT: BFE_INT * T0.Z, T7.X, 0.0, literal.x, BS:VEC_201
740 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
741 ; EG-NEXT: BFE_INT * T0.W, T6.X, 0.0, literal.x,
742 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
743 ; EG-NEXT: MIN_INT T0.W, T0.Z, PV.W,
744 ; EG-NEXT: MIN_INT * T1.W, T5.X, T0.Y,
745 ; EG-NEXT: LSHL T1.W, PS, literal.x,
746 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
747 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
748 ; EG-NEXT: OR_INT T4.X, PV.W, PS,
749 ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
750 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
752 ; CI-LABEL: s_test_imin_sle_v2i16:
754 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
755 ; CI-NEXT: s_waitcnt lgkmcnt(0)
756 ; CI-NEXT: s_ashr_i32 s4, s2, 16
757 ; CI-NEXT: s_sext_i32_i16 s2, s2
758 ; CI-NEXT: s_ashr_i32 s5, s3, 16
759 ; CI-NEXT: s_sext_i32_i16 s3, s3
760 ; CI-NEXT: s_min_i32 s4, s4, s5
761 ; CI-NEXT: s_min_i32 s2, s2, s3
762 ; CI-NEXT: s_lshl_b32 s3, s4, 16
763 ; CI-NEXT: s_and_b32 s2, s2, 0xffff
764 ; CI-NEXT: s_or_b32 s2, s2, s3
765 ; CI-NEXT: v_mov_b32_e32 v0, s0
766 ; CI-NEXT: v_mov_b32_e32 v1, s1
767 ; CI-NEXT: v_mov_b32_e32 v2, s2
768 ; CI-NEXT: flat_store_dword v[0:1], v2
771 ; VI-LABEL: s_test_imin_sle_v2i16:
773 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
774 ; VI-NEXT: s_waitcnt lgkmcnt(0)
775 ; VI-NEXT: s_ashr_i32 s4, s2, 16
776 ; VI-NEXT: s_sext_i32_i16 s2, s2
777 ; VI-NEXT: s_ashr_i32 s5, s3, 16
778 ; VI-NEXT: s_sext_i32_i16 s3, s3
779 ; VI-NEXT: s_min_i32 s4, s4, s5
780 ; VI-NEXT: s_min_i32 s2, s2, s3
781 ; VI-NEXT: s_lshl_b32 s3, s4, 16
782 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
783 ; VI-NEXT: s_or_b32 s2, s2, s3
784 ; VI-NEXT: v_mov_b32_e32 v0, s0
785 ; VI-NEXT: v_mov_b32_e32 v1, s1
786 ; VI-NEXT: v_mov_b32_e32 v2, s2
787 ; VI-NEXT: flat_store_dword v[0:1], v2
790 ; GFX9-LABEL: s_test_imin_sle_v2i16:
792 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
793 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
794 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
795 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
796 ; GFX9-NEXT: v_pk_min_i16 v1, s2, v1
797 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
798 ; GFX9-NEXT: s_endpgm
800 ; GFX10-LABEL: s_test_imin_sle_v2i16:
802 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
803 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
804 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
805 ; GFX10-NEXT: v_pk_min_i16 v1, s2, s3
806 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
807 ; GFX10-NEXT: s_endpgm
809 ; GFX11-LABEL: s_test_imin_sle_v2i16:
811 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
812 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
813 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
814 ; GFX11-NEXT: v_pk_min_i16 v1, s2, s3
815 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
816 ; GFX11-NEXT: s_nop 0
817 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
818 ; GFX11-NEXT: s_endpgm
819 %cmp = icmp sle <2 x i16> %a, %b
820 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
821 store <2 x i16> %val, ptr addrspace(1) %out
825 define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16> %a, <4 x i16> %b) #0 {
826 ; EG-LABEL: s_test_imin_sle_v4i16:
828 ; EG-NEXT: ALU 1, @28, KC0[], KC1[]
830 ; EG-NEXT: ALU 9, @30, KC0[], KC1[]
832 ; EG-NEXT: ALU 10, @40, KC0[], KC1[]
834 ; EG-NEXT: ALU 10, @51, KC0[], KC1[]
836 ; EG-NEXT: ALU 11, @62, KC0[CB0:0-32], KC1[]
837 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XY, T5.X, 1
840 ; EG-NEXT: Fetch clause starting at 12:
841 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 50, #3
842 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 58, #3
843 ; EG-NEXT: Fetch clause starting at 16:
844 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 48, #3
845 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 56, #3
846 ; EG-NEXT: Fetch clause starting at 20:
847 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 46, #3
848 ; EG-NEXT: VTX_READ_16 T7.X, T5.X, 54, #3
849 ; EG-NEXT: Fetch clause starting at 24:
850 ; EG-NEXT: VTX_READ_16 T6.X, T5.X, 44, #3
851 ; EG-NEXT: VTX_READ_16 T5.X, T5.X, 52, #3
852 ; EG-NEXT: ALU clause starting at 28:
853 ; EG-NEXT: MOV * T0.Y, T3.X,
854 ; EG-NEXT: MOV * T5.X, 0.0,
855 ; EG-NEXT: ALU clause starting at 30:
856 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
857 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
858 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
859 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
860 ; EG-NEXT: LSHL T0.W, PV.W, literal.x,
861 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
862 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
863 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
864 ; EG-NEXT: MOV * T3.X, PV.W,
865 ; EG-NEXT: MOV * T0.Y, PV.X,
866 ; EG-NEXT: ALU clause starting at 40:
867 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
868 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
869 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
870 ; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
871 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
872 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
873 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
874 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
875 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
876 ; EG-NEXT: MOV T3.X, PV.W,
877 ; EG-NEXT: MOV * T0.Y, T2.X,
878 ; EG-NEXT: ALU clause starting at 51:
879 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
880 ; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
881 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
882 ; EG-NEXT: MIN_INT T0.W, PV.Z, PV.W,
883 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
884 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
885 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
886 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
887 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
888 ; EG-NEXT: MOV * T2.X, PV.W,
889 ; EG-NEXT: MOV * T0.Y, PV.X,
890 ; EG-NEXT: ALU clause starting at 62:
891 ; EG-NEXT: BFE_INT T0.Z, T6.X, 0.0, literal.x,
892 ; EG-NEXT: BFE_INT * T0.W, T5.X, 0.0, literal.x, BS:VEC_120/SCL_212
893 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
894 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
895 ; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
896 ; EG-NEXT: AND_INT T1.W, T0.Y, literal.y,
897 ; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
898 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
899 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
900 ; EG-NEXT: OR_INT * T6.X, PV.W, PS,
901 ; EG-NEXT: MOV T2.X, PV.X,
902 ; EG-NEXT: MOV * T6.Y, T3.X,
904 ; CI-LABEL: s_test_imin_sle_v4i16:
906 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
907 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
908 ; CI-NEXT: s_waitcnt lgkmcnt(0)
909 ; CI-NEXT: s_ashr_i32 s6, s0, 16
910 ; CI-NEXT: s_ashr_i32 s7, s1, 16
911 ; CI-NEXT: s_sext_i32_i16 s0, s0
912 ; CI-NEXT: s_sext_i32_i16 s1, s1
913 ; CI-NEXT: s_ashr_i32 s8, s2, 16
914 ; CI-NEXT: s_ashr_i32 s9, s3, 16
915 ; CI-NEXT: s_sext_i32_i16 s2, s2
916 ; CI-NEXT: s_sext_i32_i16 s3, s3
917 ; CI-NEXT: s_min_i32 s7, s7, s9
918 ; CI-NEXT: s_min_i32 s1, s1, s3
919 ; CI-NEXT: s_min_i32 s3, s6, s8
920 ; CI-NEXT: s_min_i32 s0, s0, s2
921 ; CI-NEXT: s_lshl_b32 s7, s7, 16
922 ; CI-NEXT: s_and_b32 s1, s1, 0xffff
923 ; CI-NEXT: s_lshl_b32 s3, s3, 16
924 ; CI-NEXT: s_and_b32 s0, s0, 0xffff
925 ; CI-NEXT: s_or_b32 s1, s1, s7
926 ; CI-NEXT: s_or_b32 s0, s0, s3
927 ; CI-NEXT: v_mov_b32_e32 v2, s4
928 ; CI-NEXT: v_mov_b32_e32 v0, s0
929 ; CI-NEXT: v_mov_b32_e32 v1, s1
930 ; CI-NEXT: v_mov_b32_e32 v3, s5
931 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
934 ; VI-LABEL: s_test_imin_sle_v4i16:
936 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
937 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
938 ; VI-NEXT: s_waitcnt lgkmcnt(0)
939 ; VI-NEXT: s_ashr_i32 s6, s1, 16
940 ; VI-NEXT: s_sext_i32_i16 s1, s1
941 ; VI-NEXT: s_ashr_i32 s8, s3, 16
942 ; VI-NEXT: s_sext_i32_i16 s3, s3
943 ; VI-NEXT: s_ashr_i32 s7, s0, 16
944 ; VI-NEXT: s_sext_i32_i16 s0, s0
945 ; VI-NEXT: s_ashr_i32 s9, s2, 16
946 ; VI-NEXT: s_sext_i32_i16 s2, s2
947 ; VI-NEXT: s_min_i32 s6, s6, s8
948 ; VI-NEXT: s_min_i32 s1, s1, s3
949 ; VI-NEXT: s_min_i32 s7, s7, s9
950 ; VI-NEXT: s_min_i32 s0, s0, s2
951 ; VI-NEXT: s_lshl_b32 s2, s6, 16
952 ; VI-NEXT: s_and_b32 s1, s1, 0xffff
953 ; VI-NEXT: s_or_b32 s1, s1, s2
954 ; VI-NEXT: s_lshl_b32 s2, s7, 16
955 ; VI-NEXT: s_and_b32 s0, s0, 0xffff
956 ; VI-NEXT: s_or_b32 s0, s0, s2
957 ; VI-NEXT: v_mov_b32_e32 v2, s4
958 ; VI-NEXT: v_mov_b32_e32 v0, s0
959 ; VI-NEXT: v_mov_b32_e32 v1, s1
960 ; VI-NEXT: v_mov_b32_e32 v3, s5
961 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
964 ; GFX9-LABEL: s_test_imin_sle_v4i16:
966 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
967 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
968 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
969 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
970 ; GFX9-NEXT: v_mov_b32_e32 v0, s3
971 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
972 ; GFX9-NEXT: v_pk_min_i16 v1, s1, v0
973 ; GFX9-NEXT: v_pk_min_i16 v0, s0, v3
974 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
975 ; GFX9-NEXT: s_endpgm
977 ; GFX10-LABEL: s_test_imin_sle_v4i16:
979 ; GFX10-NEXT: s_clause 0x1
980 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
981 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
982 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
983 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
984 ; GFX10-NEXT: v_pk_min_i16 v1, s1, s3
985 ; GFX10-NEXT: v_pk_min_i16 v0, s0, s2
986 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
987 ; GFX10-NEXT: s_endpgm
989 ; GFX11-LABEL: s_test_imin_sle_v4i16:
991 ; GFX11-NEXT: s_clause 0x1
992 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8
993 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
994 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
995 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
996 ; GFX11-NEXT: v_pk_min_i16 v1, s5, s7
997 ; GFX11-NEXT: v_pk_min_i16 v0, s4, s6
998 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
999 ; GFX11-NEXT: s_nop 0
1000 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1001 ; GFX11-NEXT: s_endpgm
1002 %cmp = icmp sle <4 x i16> %a, %b
1003 %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
1004 store <4 x i16> %val, ptr addrspace(1) %out
1008 define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1009 ; EG-LABEL: v_test_imin_slt_i32:
1011 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1013 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
1014 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1017 ; EG-NEXT: Fetch clause starting at 6:
1018 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
1019 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1020 ; EG-NEXT: ALU clause starting at 10:
1021 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1022 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1023 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1024 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1025 ; EG-NEXT: ALU clause starting at 14:
1026 ; EG-NEXT: MIN_INT T0.X, T0.X, T1.X,
1027 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1028 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1029 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1031 ; CI-LABEL: v_test_imin_slt_i32:
1033 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1034 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1035 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1036 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1037 ; CI-NEXT: v_mov_b32_e32 v1, s3
1038 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1039 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1040 ; CI-NEXT: v_mov_b32_e32 v3, s5
1041 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1042 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1043 ; CI-NEXT: flat_load_dword v5, v[0:1]
1044 ; CI-NEXT: flat_load_dword v2, v[2:3]
1045 ; CI-NEXT: v_mov_b32_e32 v1, s1
1046 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1047 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1048 ; CI-NEXT: s_waitcnt vmcnt(0)
1049 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
1050 ; CI-NEXT: flat_store_dword v[0:1], v2
1053 ; VI-LABEL: v_test_imin_slt_i32:
1055 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1056 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1057 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1058 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1059 ; VI-NEXT: v_mov_b32_e32 v1, s3
1060 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1061 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1062 ; VI-NEXT: v_mov_b32_e32 v3, s5
1063 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1064 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1065 ; VI-NEXT: flat_load_dword v5, v[0:1]
1066 ; VI-NEXT: flat_load_dword v2, v[2:3]
1067 ; VI-NEXT: v_mov_b32_e32 v1, s1
1068 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1069 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1070 ; VI-NEXT: s_waitcnt vmcnt(0)
1071 ; VI-NEXT: v_min_i32_e32 v2, v5, v2
1072 ; VI-NEXT: flat_store_dword v[0:1], v2
1075 ; GFX9-LABEL: v_test_imin_slt_i32:
1077 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1078 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1079 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1080 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1081 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1082 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1083 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1084 ; GFX9-NEXT: v_min_i32_e32 v1, v1, v2
1085 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1086 ; GFX9-NEXT: s_endpgm
1088 ; GFX10-LABEL: v_test_imin_slt_i32:
1090 ; GFX10-NEXT: s_clause 0x1
1091 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1092 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1093 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1094 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1095 ; GFX10-NEXT: s_clause 0x1
1096 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1097 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
1098 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1099 ; GFX10-NEXT: v_min_i32_e32 v1, v1, v2
1100 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1101 ; GFX10-NEXT: s_endpgm
1103 ; GFX11-LABEL: v_test_imin_slt_i32:
1105 ; GFX11-NEXT: s_clause 0x1
1106 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1107 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
1108 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1109 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1110 ; GFX11-NEXT: s_clause 0x1
1111 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1112 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
1113 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1114 ; GFX11-NEXT: v_min_i32_e32 v1, v1, v2
1115 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1116 ; GFX11-NEXT: s_nop 0
1117 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1118 ; GFX11-NEXT: s_endpgm
1119 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1120 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %aptr, i32 %tid
1121 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %bptr, i32 %tid
1122 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1123 %a = load i32, ptr addrspace(1) %a.gep, align 4
1124 %b = load i32, ptr addrspace(1) %b.gep, align 4
1125 %cmp = icmp slt i32 %a, %b
1126 %val = select i1 %cmp, i32 %a, i32 %b
1127 store i32 %val, ptr addrspace(1) %out.gep, align 4
1131 define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
1132 ; EG-LABEL: v_test_imin_slt_i16:
1134 ; EG-NEXT: ALU 1, @12, KC0[CB0:0-32], KC1[]
1136 ; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
1137 ; EG-NEXT: TEX 0 @10
1138 ; EG-NEXT: ALU 16, @15, KC0[CB0:0-32], KC1[]
1139 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
1142 ; EG-NEXT: Fetch clause starting at 8:
1143 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
1144 ; EG-NEXT: Fetch clause starting at 10:
1145 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
1146 ; EG-NEXT: ALU clause starting at 12:
1147 ; EG-NEXT: LSHL * T0.W, T0.X, 1,
1148 ; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
1149 ; EG-NEXT: ALU clause starting at 14:
1150 ; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, T0.W,
1151 ; EG-NEXT: ALU clause starting at 15:
1152 ; EG-NEXT: BFE_INT T0.Z, T0.X, 0.0, literal.x,
1153 ; EG-NEXT: BFE_INT T1.W, T1.X, 0.0, literal.x, BS:VEC_120/SCL_212
1154 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1155 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
1156 ; EG-NEXT: AND_INT T2.W, PS, literal.x,
1157 ; EG-NEXT: MIN_INT * T1.W, PV.W, PV.Z,
1158 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1159 ; EG-NEXT: AND_INT T1.W, PS, literal.x,
1160 ; EG-NEXT: LSHL * T2.W, PV.W, literal.y,
1161 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
1162 ; EG-NEXT: LSHL T1.X, PV.W, PS,
1163 ; EG-NEXT: LSHL * T1.W, literal.x, PS,
1164 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1165 ; EG-NEXT: MOV T1.Y, 0.0,
1166 ; EG-NEXT: MOV * T1.Z, 0.0,
1167 ; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
1168 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1170 ; CI-LABEL: v_test_imin_slt_i16:
1172 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1173 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1174 ; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
1175 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1176 ; CI-NEXT: v_mov_b32_e32 v1, s3
1177 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1178 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1179 ; CI-NEXT: v_mov_b32_e32 v3, s5
1180 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1181 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1182 ; CI-NEXT: flat_load_sshort v5, v[0:1]
1183 ; CI-NEXT: flat_load_sshort v2, v[2:3]
1184 ; CI-NEXT: v_mov_b32_e32 v1, s1
1185 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1186 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1187 ; CI-NEXT: s_waitcnt vmcnt(0)
1188 ; CI-NEXT: v_min_i32_e32 v2, v5, v2
1189 ; CI-NEXT: flat_store_short v[0:1], v2
1192 ; VI-LABEL: v_test_imin_slt_i16:
1194 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1195 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1196 ; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
1197 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1198 ; VI-NEXT: v_mov_b32_e32 v1, s3
1199 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1200 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1201 ; VI-NEXT: v_mov_b32_e32 v3, s5
1202 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1203 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1204 ; VI-NEXT: flat_load_ushort v5, v[0:1]
1205 ; VI-NEXT: flat_load_ushort v2, v[2:3]
1206 ; VI-NEXT: v_mov_b32_e32 v1, s1
1207 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1208 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1209 ; VI-NEXT: s_waitcnt vmcnt(0)
1210 ; VI-NEXT: v_min_i16_e32 v2, v5, v2
1211 ; VI-NEXT: flat_store_short v[0:1], v2
1214 ; GFX9-LABEL: v_test_imin_slt_i16:
1216 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1217 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1218 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1219 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1220 ; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
1221 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
1222 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1223 ; GFX9-NEXT: v_min_i16_e32 v1, v1, v2
1224 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
1225 ; GFX9-NEXT: s_endpgm
1227 ; GFX10-LABEL: v_test_imin_slt_i16:
1229 ; GFX10-NEXT: s_clause 0x1
1230 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1231 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1232 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1233 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1234 ; GFX10-NEXT: s_clause 0x1
1235 ; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
1236 ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7]
1237 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1238 ; GFX10-NEXT: v_min_i16 v1, v1, v2
1239 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
1240 ; GFX10-NEXT: s_endpgm
1242 ; GFX11-LABEL: v_test_imin_slt_i16:
1244 ; GFX11-NEXT: s_clause 0x1
1245 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1246 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
1247 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
1248 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1249 ; GFX11-NEXT: s_clause 0x1
1250 ; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
1251 ; GFX11-NEXT: global_load_u16 v2, v0, s[0:1]
1252 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1253 ; GFX11-NEXT: v_min_i16 v1, v1, v2
1254 ; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
1255 ; GFX11-NEXT: s_nop 0
1256 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1257 ; GFX11-NEXT: s_endpgm
1258 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1259 %a.gep = getelementptr inbounds i16, ptr addrspace(1) %aptr, i32 %tid
1260 %b.gep = getelementptr inbounds i16, ptr addrspace(1) %bptr, i32 %tid
1261 %out.gep = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
1263 %a = load i16, ptr addrspace(1) %a.gep
1264 %b = load i16, ptr addrspace(1) %b.gep
1265 %cmp = icmp slt i16 %a, %b
1266 %val = select i1 %cmp, i16 %a, i16 %b
1267 store i16 %val, ptr addrspace(1) %out.gep
1271 define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
1272 ; EG-LABEL: s_test_imin_slt_i32:
1274 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1275 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1278 ; EG-NEXT: ALU clause starting at 4:
1279 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1280 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, KC0[2].W,
1281 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1283 ; CI-LABEL: s_test_imin_slt_i32:
1285 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1286 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1287 ; CI-NEXT: s_min_i32 s2, s2, s3
1288 ; CI-NEXT: v_mov_b32_e32 v0, s0
1289 ; CI-NEXT: v_mov_b32_e32 v1, s1
1290 ; CI-NEXT: v_mov_b32_e32 v2, s2
1291 ; CI-NEXT: flat_store_dword v[0:1], v2
1294 ; VI-LABEL: s_test_imin_slt_i32:
1296 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1297 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1298 ; VI-NEXT: s_min_i32 s2, s2, s3
1299 ; VI-NEXT: v_mov_b32_e32 v0, s0
1300 ; VI-NEXT: v_mov_b32_e32 v1, s1
1301 ; VI-NEXT: v_mov_b32_e32 v2, s2
1302 ; VI-NEXT: flat_store_dword v[0:1], v2
1305 ; GFX9-LABEL: s_test_imin_slt_i32:
1307 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1308 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1309 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1310 ; GFX9-NEXT: s_min_i32 s2, s2, s3
1311 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1312 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1313 ; GFX9-NEXT: s_endpgm
1315 ; GFX10-LABEL: s_test_imin_slt_i32:
1317 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1318 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1319 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1320 ; GFX10-NEXT: s_min_i32 s2, s2, s3
1321 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1322 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1323 ; GFX10-NEXT: s_endpgm
1325 ; GFX11-LABEL: s_test_imin_slt_i32:
1327 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
1328 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1330 ; GFX11-NEXT: s_min_i32 s2, s2, s3
1331 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1332 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1333 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1334 ; GFX11-NEXT: s_nop 0
1335 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1336 ; GFX11-NEXT: s_endpgm
1337 %cmp = icmp slt i32 %a, %b
1338 %val = select i1 %cmp, i32 %a, i32 %b
1339 store i32 %val, ptr addrspace(1) %out, align 4
1343 define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 {
1344 ; EG-LABEL: s_test_imin_slt_v2i32:
1346 ; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
1347 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1350 ; EG-NEXT: ALU clause starting at 4:
1351 ; EG-NEXT: MIN_INT * T0.Y, KC0[3].X, KC0[3].Z,
1352 ; EG-NEXT: MIN_INT * T0.X, KC0[2].W, KC0[3].Y,
1353 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
1354 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1356 ; CI-LABEL: s_test_imin_slt_v2i32:
1358 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2
1359 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1360 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1361 ; CI-NEXT: s_min_i32 s1, s1, s3
1362 ; CI-NEXT: s_min_i32 s0, s0, s2
1363 ; CI-NEXT: v_mov_b32_e32 v2, s4
1364 ; CI-NEXT: v_mov_b32_e32 v0, s0
1365 ; CI-NEXT: v_mov_b32_e32 v1, s1
1366 ; CI-NEXT: v_mov_b32_e32 v3, s5
1367 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1370 ; VI-LABEL: s_test_imin_slt_v2i32:
1372 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
1373 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1374 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1375 ; VI-NEXT: s_min_i32 s1, s1, s3
1376 ; VI-NEXT: s_min_i32 s0, s0, s2
1377 ; VI-NEXT: v_mov_b32_e32 v2, s4
1378 ; VI-NEXT: v_mov_b32_e32 v0, s0
1379 ; VI-NEXT: v_mov_b32_e32 v1, s1
1380 ; VI-NEXT: v_mov_b32_e32 v3, s5
1381 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
1384 ; GFX9-LABEL: s_test_imin_slt_v2i32:
1386 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
1387 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1388 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1389 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1390 ; GFX9-NEXT: s_min_i32 s1, s1, s3
1391 ; GFX9-NEXT: s_min_i32 s0, s0, s2
1392 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1393 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1394 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1395 ; GFX9-NEXT: s_endpgm
1397 ; GFX10-LABEL: s_test_imin_slt_v2i32:
1399 ; GFX10-NEXT: s_clause 0x1
1400 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x8
1401 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
1402 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1403 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1404 ; GFX10-NEXT: s_min_i32 s0, s0, s2
1405 ; GFX10-NEXT: s_min_i32 s1, s1, s3
1406 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1407 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1408 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1409 ; GFX10-NEXT: s_endpgm
1411 ; GFX11-LABEL: s_test_imin_slt_v2i32:
1413 ; GFX11-NEXT: s_clause 0x1
1414 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x8
1415 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
1416 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1417 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1418 ; GFX11-NEXT: s_min_i32 s2, s4, s6
1419 ; GFX11-NEXT: s_min_i32 s3, s5, s7
1420 ; GFX11-NEXT: v_mov_b32_e32 v0, s2
1421 ; GFX11-NEXT: v_mov_b32_e32 v1, s3
1422 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
1423 ; GFX11-NEXT: s_nop 0
1424 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1425 ; GFX11-NEXT: s_endpgm
1426 %cmp = icmp slt <2 x i32> %a, %b
1427 %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
1428 store <2 x i32> %val, ptr addrspace(1) %out
1432 define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1433 ; EG-LABEL: s_test_imin_slt_imm_i32:
1435 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1436 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1439 ; EG-NEXT: ALU clause starting at 4:
1440 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1441 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
1442 ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
1444 ; CI-LABEL: s_test_imin_slt_imm_i32:
1446 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
1447 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1448 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1449 ; CI-NEXT: s_min_i32 s2, s2, 8
1450 ; CI-NEXT: v_mov_b32_e32 v0, s0
1451 ; CI-NEXT: v_mov_b32_e32 v1, s1
1452 ; CI-NEXT: v_mov_b32_e32 v2, s2
1453 ; CI-NEXT: flat_store_dword v[0:1], v2
1456 ; VI-LABEL: s_test_imin_slt_imm_i32:
1458 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
1459 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1460 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1461 ; VI-NEXT: s_min_i32 s2, s2, 8
1462 ; VI-NEXT: v_mov_b32_e32 v0, s0
1463 ; VI-NEXT: v_mov_b32_e32 v1, s1
1464 ; VI-NEXT: v_mov_b32_e32 v2, s2
1465 ; VI-NEXT: flat_store_dword v[0:1], v2
1468 ; GFX9-LABEL: s_test_imin_slt_imm_i32:
1470 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
1471 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1472 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1473 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1474 ; GFX9-NEXT: s_min_i32 s2, s2, 8
1475 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1476 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1477 ; GFX9-NEXT: s_endpgm
1479 ; GFX10-LABEL: s_test_imin_slt_imm_i32:
1481 ; GFX10-NEXT: s_clause 0x1
1482 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
1483 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1484 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1485 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1486 ; GFX10-NEXT: s_min_i32 s2, s2, 8
1487 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1488 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1489 ; GFX10-NEXT: s_endpgm
1491 ; GFX11-LABEL: s_test_imin_slt_imm_i32:
1493 ; GFX11-NEXT: s_clause 0x1
1494 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
1495 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
1496 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1497 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1498 ; GFX11-NEXT: s_min_i32 s2, s2, 8
1499 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1500 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1501 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1502 ; GFX11-NEXT: s_nop 0
1503 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1504 ; GFX11-NEXT: s_endpgm
1505 %cmp = icmp slt i32 %a, 8
1506 %val = select i1 %cmp, i32 %a, i32 8
1507 store i32 %val, ptr addrspace(1) %out, align 4
1511 define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a) #0 {
1512 ; EG-LABEL: s_test_imin_sle_imm_i32:
1514 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
1515 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
1518 ; EG-NEXT: ALU clause starting at 4:
1519 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
1520 ; EG-NEXT: MIN_INT * T1.X, KC0[2].Z, literal.y,
1521 ; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
1523 ; CI-LABEL: s_test_imin_sle_imm_i32:
1525 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
1526 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1527 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1528 ; CI-NEXT: s_min_i32 s2, s2, 8
1529 ; CI-NEXT: v_mov_b32_e32 v0, s0
1530 ; CI-NEXT: v_mov_b32_e32 v1, s1
1531 ; CI-NEXT: v_mov_b32_e32 v2, s2
1532 ; CI-NEXT: flat_store_dword v[0:1], v2
1535 ; VI-LABEL: s_test_imin_sle_imm_i32:
1537 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
1538 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1539 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1540 ; VI-NEXT: s_min_i32 s2, s2, 8
1541 ; VI-NEXT: v_mov_b32_e32 v0, s0
1542 ; VI-NEXT: v_mov_b32_e32 v1, s1
1543 ; VI-NEXT: v_mov_b32_e32 v2, s2
1544 ; VI-NEXT: flat_store_dword v[0:1], v2
1547 ; GFX9-LABEL: s_test_imin_sle_imm_i32:
1549 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
1550 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1551 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1552 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1553 ; GFX9-NEXT: s_min_i32 s2, s2, 8
1554 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1555 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1556 ; GFX9-NEXT: s_endpgm
1558 ; GFX10-LABEL: s_test_imin_sle_imm_i32:
1560 ; GFX10-NEXT: s_clause 0x1
1561 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
1562 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
1563 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
1564 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1565 ; GFX10-NEXT: s_min_i32 s2, s2, 8
1566 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
1567 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1568 ; GFX10-NEXT: s_endpgm
1570 ; GFX11-LABEL: s_test_imin_sle_imm_i32:
1572 ; GFX11-NEXT: s_clause 0x1
1573 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
1574 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
1575 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1576 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1577 ; GFX11-NEXT: s_min_i32 s2, s2, 8
1578 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1579 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
1580 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
1581 ; GFX11-NEXT: s_nop 0
1582 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1583 ; GFX11-NEXT: s_endpgm
1584 %cmp = icmp sle i32 %a, 8
1585 %val = select i1 %cmp, i32 %a, i32 8
1586 store i32 %val, ptr addrspace(1) %out, align 4
1590 define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1591 ; EG-LABEL: v_test_umin_ule_i32:
1593 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1595 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
1596 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1599 ; EG-NEXT: Fetch clause starting at 6:
1600 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
1601 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
1602 ; EG-NEXT: ALU clause starting at 10:
1603 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1604 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1605 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1606 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1607 ; EG-NEXT: ALU clause starting at 14:
1608 ; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
1609 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1610 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
1611 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1613 ; CI-LABEL: v_test_umin_ule_i32:
1615 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1616 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1617 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1618 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1619 ; CI-NEXT: v_mov_b32_e32 v1, s3
1620 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1621 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1622 ; CI-NEXT: v_mov_b32_e32 v3, s5
1623 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1624 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1625 ; CI-NEXT: flat_load_dword v5, v[0:1]
1626 ; CI-NEXT: flat_load_dword v2, v[2:3]
1627 ; CI-NEXT: v_mov_b32_e32 v1, s1
1628 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
1629 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1630 ; CI-NEXT: s_waitcnt vmcnt(0)
1631 ; CI-NEXT: v_min_u32_e32 v2, v5, v2
1632 ; CI-NEXT: flat_store_dword v[0:1], v2
1635 ; VI-LABEL: v_test_umin_ule_i32:
1637 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1638 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1639 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1640 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1641 ; VI-NEXT: v_mov_b32_e32 v1, s3
1642 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1643 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1644 ; VI-NEXT: v_mov_b32_e32 v3, s5
1645 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1646 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1647 ; VI-NEXT: flat_load_dword v5, v[0:1]
1648 ; VI-NEXT: flat_load_dword v2, v[2:3]
1649 ; VI-NEXT: v_mov_b32_e32 v1, s1
1650 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
1651 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1652 ; VI-NEXT: s_waitcnt vmcnt(0)
1653 ; VI-NEXT: v_min_u32_e32 v2, v5, v2
1654 ; VI-NEXT: flat_store_dword v[0:1], v2
1657 ; GFX9-LABEL: v_test_umin_ule_i32:
1659 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1660 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1661 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1662 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1663 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
1664 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
1665 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1666 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
1667 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
1668 ; GFX9-NEXT: s_endpgm
1670 ; GFX10-LABEL: v_test_umin_ule_i32:
1672 ; GFX10-NEXT: s_clause 0x1
1673 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1674 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1675 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1676 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1677 ; GFX10-NEXT: s_clause 0x1
1678 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
1679 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
1680 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1681 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
1682 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
1683 ; GFX10-NEXT: s_endpgm
1685 ; GFX11-LABEL: v_test_umin_ule_i32:
1687 ; GFX11-NEXT: s_clause 0x1
1688 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1689 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
1690 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1691 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1692 ; GFX11-NEXT: s_clause 0x1
1693 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
1694 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
1695 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1696 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
1697 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
1698 ; GFX11-NEXT: s_nop 0
1699 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1700 ; GFX11-NEXT: s_endpgm
1701 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1702 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
1703 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
1704 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
1705 %a = load i32, ptr addrspace(1) %a.gep, align 4
1706 %b = load i32, ptr addrspace(1) %b.gep, align 4
1707 %cmp = icmp ule i32 %a, %b
1708 %val = select i1 %cmp, i32 %a, i32 %b
1709 store i32 %val, ptr addrspace(1) %out.gep, align 4
1713 define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1714 ; EG-LABEL: v_test_umin_ule_v3i32:
1716 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
1718 ; EG-NEXT: ALU 9, @14, KC0[CB0:0-32], KC1[]
1719 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
1720 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1722 ; EG-NEXT: Fetch clause starting at 6:
1723 ; EG-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1
1724 ; EG-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1
1725 ; EG-NEXT: ALU clause starting at 10:
1726 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1727 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1728 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1729 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
1730 ; EG-NEXT: ALU clause starting at 14:
1731 ; EG-NEXT: MIN_UINT * T0.Y, T2.Y, T1.Y,
1732 ; EG-NEXT: MIN_UINT T0.X, T2.X, T1.X,
1733 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1734 ; EG-NEXT: LSHR T1.X, PV.W, literal.x,
1735 ; EG-NEXT: MIN_UINT * T2.X, T2.Z, T1.Z,
1736 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1737 ; EG-NEXT: ADD_INT * T0.W, T0.W, literal.x,
1738 ; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
1739 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
1740 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1742 ; CI-LABEL: v_test_umin_ule_v3i32:
1744 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1745 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1746 ; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1747 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1748 ; CI-NEXT: v_mov_b32_e32 v1, s3
1749 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
1750 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1751 ; CI-NEXT: v_mov_b32_e32 v2, s5
1752 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
1753 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
1754 ; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
1755 ; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
1756 ; CI-NEXT: v_mov_b32_e32 v7, s1
1757 ; CI-NEXT: v_add_i32_e32 v6, vcc, s0, v6
1758 ; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
1759 ; CI-NEXT: s_waitcnt vmcnt(0)
1760 ; CI-NEXT: v_min_u32_e32 v2, v2, v5
1761 ; CI-NEXT: v_min_u32_e32 v1, v1, v4
1762 ; CI-NEXT: v_min_u32_e32 v0, v0, v3
1763 ; CI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
1766 ; VI-LABEL: v_test_umin_ule_v3i32:
1768 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1769 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1770 ; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1771 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1772 ; VI-NEXT: v_mov_b32_e32 v1, s3
1773 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
1774 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1775 ; VI-NEXT: v_mov_b32_e32 v2, s5
1776 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
1777 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
1778 ; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
1779 ; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
1780 ; VI-NEXT: v_mov_b32_e32 v7, s1
1781 ; VI-NEXT: v_add_u32_e32 v6, vcc, s0, v6
1782 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc
1783 ; VI-NEXT: s_waitcnt vmcnt(0)
1784 ; VI-NEXT: v_min_u32_e32 v2, v2, v5
1785 ; VI-NEXT: v_min_u32_e32 v1, v1, v4
1786 ; VI-NEXT: v_min_u32_e32 v0, v0, v3
1787 ; VI-NEXT: flat_store_dwordx3 v[6:7], v[0:2]
1790 ; GFX9-LABEL: v_test_umin_ule_v3i32:
1792 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1793 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1794 ; GFX9-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1795 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1796 ; GFX9-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
1797 ; GFX9-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7]
1798 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1799 ; GFX9-NEXT: v_min_u32_e32 v2, v2, v5
1800 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v4
1801 ; GFX9-NEXT: v_min_u32_e32 v0, v0, v3
1802 ; GFX9-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
1803 ; GFX9-NEXT: s_endpgm
1805 ; GFX10-LABEL: v_test_umin_ule_v3i32:
1807 ; GFX10-NEXT: s_clause 0x1
1808 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1809 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1810 ; GFX10-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1811 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1812 ; GFX10-NEXT: s_clause 0x1
1813 ; GFX10-NEXT: global_load_dwordx3 v[0:2], v6, s[2:3]
1814 ; GFX10-NEXT: global_load_dwordx3 v[3:5], v6, s[6:7]
1815 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1816 ; GFX10-NEXT: v_min_u32_e32 v2, v2, v5
1817 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v4
1818 ; GFX10-NEXT: v_min_u32_e32 v0, v0, v3
1819 ; GFX10-NEXT: global_store_dwordx3 v6, v[0:2], s[0:1]
1820 ; GFX10-NEXT: s_endpgm
1822 ; GFX11-LABEL: v_test_umin_ule_v3i32:
1824 ; GFX11-NEXT: s_clause 0x1
1825 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
1826 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
1827 ; GFX11-NEXT: v_lshlrev_b32_e32 v6, 4, v0
1828 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1829 ; GFX11-NEXT: s_clause 0x1
1830 ; GFX11-NEXT: global_load_b96 v[0:2], v6, s[6:7]
1831 ; GFX11-NEXT: global_load_b96 v[3:5], v6, s[0:1]
1832 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1833 ; GFX11-NEXT: v_min_u32_e32 v2, v2, v5
1834 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v4
1835 ; GFX11-NEXT: v_min_u32_e32 v0, v0, v3
1836 ; GFX11-NEXT: global_store_b96 v6, v[0:2], s[4:5]
1837 ; GFX11-NEXT: s_nop 0
1838 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1839 ; GFX11-NEXT: s_endpgm
1840 %tid = call i32 @llvm.amdgcn.workitem.id.x()
1841 %a.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %a.ptr, i32 %tid
1842 %b.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %b.ptr, i32 %tid
1843 %out.gep = getelementptr inbounds <3 x i32>, ptr addrspace(1) %out, i32 %tid
1845 %a = load <3 x i32>, ptr addrspace(1) %a.gep
1846 %b = load <3 x i32>, ptr addrspace(1) %b.gep
1847 %cmp = icmp ule <3 x i32> %a, %b
1848 %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
1849 store <3 x i32> %val, ptr addrspace(1) %out.gep
1853 ; FIXME: Reduce unused packed component to scalar
1855 define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
1856 ; EG-LABEL: v_test_umin_ule_v3i16:
1858 ; EG-NEXT: ALU 3, @20, KC0[CB0:0-32], KC1[]
1860 ; EG-NEXT: ALU 11, @24, KC0[CB0:0-32], KC1[]
1861 ; EG-NEXT: TEX 3 @12
1862 ; EG-NEXT: ALU 8, @36, KC0[], KC1[]
1863 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T8.X, 0
1864 ; EG-NEXT: MEM_RAT MSKOR T7.XW, T0.X
1866 ; EG-NEXT: Fetch clause starting at 8:
1867 ; EG-NEXT: VTX_READ_16 T7.X, T6.X, 4, #1
1868 ; EG-NEXT: VTX_READ_16 T8.X, T0.X, 4, #1
1869 ; EG-NEXT: Fetch clause starting at 12:
1870 ; EG-NEXT: VTX_READ_16 T8.X, T6.X, 0, #1
1871 ; EG-NEXT: VTX_READ_16 T9.X, T0.X, 0, #1
1872 ; EG-NEXT: VTX_READ_16 T6.X, T6.X, 2, #1
1873 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
1874 ; EG-NEXT: ALU clause starting at 20:
1875 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
1876 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1877 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
1878 ; EG-NEXT: ADD_INT * T6.X, KC0[2].W, PV.W,
1879 ; EG-NEXT: ALU clause starting at 24:
1880 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
1881 ; EG-NEXT: ADD_INT * T1.W, PV.W, literal.x,
1882 ; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
1883 ; EG-NEXT: AND_INT * T2.W, PV.W, literal.x,
1884 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1885 ; EG-NEXT: LSHL T2.W, PV.W, literal.x,
1886 ; EG-NEXT: MIN_UINT * T3.W, T8.X, T7.X,
1887 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
1888 ; EG-NEXT: LSHL T7.X, PS, PV.W,
1889 ; EG-NEXT: LSHL * T7.W, literal.x, PV.W,
1890 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
1891 ; EG-NEXT: MOV * T7.Y, 0.0,
1892 ; EG-NEXT: ALU clause starting at 36:
1893 ; EG-NEXT: MOV T7.Z, 0.0,
1894 ; EG-NEXT: MIN_UINT * T2.W, T0.X, T6.X,
1895 ; EG-NEXT: LSHR T0.X, T1.W, literal.x,
1896 ; EG-NEXT: LSHL T1.W, PV.W, literal.y,
1897 ; EG-NEXT: MIN_UINT * T2.W, T9.X, T8.X,
1898 ; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
1899 ; EG-NEXT: OR_INT T6.X, PV.W, PS,
1900 ; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
1901 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
1903 ; CI-LABEL: v_test_umin_ule_v3i16:
1905 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1906 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
1907 ; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1908 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1909 ; CI-NEXT: v_mov_b32_e32 v1, s3
1910 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
1911 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1912 ; CI-NEXT: v_mov_b32_e32 v3, s5
1913 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
1914 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1915 ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1916 ; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1917 ; CI-NEXT: v_mov_b32_e32 v5, s1
1918 ; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
1919 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1920 ; CI-NEXT: v_add_i32_e32 v6, vcc, 4, v4
1921 ; CI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
1922 ; CI-NEXT: s_waitcnt vmcnt(1)
1923 ; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
1924 ; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1925 ; CI-NEXT: s_waitcnt vmcnt(0)
1926 ; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
1927 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
1928 ; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
1929 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
1930 ; CI-NEXT: v_min_u32_e32 v0, v0, v2
1931 ; CI-NEXT: v_min_u32_e32 v2, v8, v9
1932 ; CI-NEXT: v_min_u32_e32 v1, v1, v3
1933 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
1934 ; CI-NEXT: v_or_b32_e32 v0, v0, v2
1935 ; CI-NEXT: flat_store_short v[6:7], v1
1936 ; CI-NEXT: flat_store_dword v[4:5], v0
1939 ; VI-LABEL: v_test_umin_ule_v3i16:
1941 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1942 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
1943 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1944 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1945 ; VI-NEXT: v_mov_b32_e32 v1, s3
1946 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
1947 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
1948 ; VI-NEXT: v_mov_b32_e32 v3, s5
1949 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
1950 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
1951 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
1952 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
1953 ; VI-NEXT: v_mov_b32_e32 v5, s1
1954 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
1955 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
1956 ; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v4
1957 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
1958 ; VI-NEXT: s_waitcnt vmcnt(0)
1959 ; VI-NEXT: v_min_u16_e32 v8, v0, v2
1960 ; VI-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1961 ; VI-NEXT: v_min_u16_e32 v1, v1, v3
1962 ; VI-NEXT: v_or_b32_e32 v0, v8, v0
1963 ; VI-NEXT: flat_store_short v[6:7], v1
1964 ; VI-NEXT: flat_store_dword v[4:5], v0
1967 ; GFX9-LABEL: v_test_umin_ule_v3i16:
1969 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1970 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1971 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1972 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1973 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1974 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
1975 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1976 ; GFX9-NEXT: v_pk_min_u16 v1, v1, v3
1977 ; GFX9-NEXT: v_pk_min_u16 v0, v0, v2
1978 ; GFX9-NEXT: global_store_short v4, v1, s[0:1] offset:4
1979 ; GFX9-NEXT: global_store_dword v4, v0, s[0:1]
1980 ; GFX9-NEXT: s_endpgm
1982 ; GFX10-LABEL: v_test_umin_ule_v3i16:
1984 ; GFX10-NEXT: s_clause 0x1
1985 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
1986 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
1987 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
1988 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1989 ; GFX10-NEXT: s_clause 0x1
1990 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
1991 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
1992 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1993 ; GFX10-NEXT: v_pk_min_u16 v1, v1, v3
1994 ; GFX10-NEXT: v_pk_min_u16 v0, v0, v2
1995 ; GFX10-NEXT: global_store_short v4, v1, s[0:1] offset:4
1996 ; GFX10-NEXT: global_store_dword v4, v0, s[0:1]
1997 ; GFX10-NEXT: s_endpgm
1999 ; GFX11-LABEL: v_test_umin_ule_v3i16:
2001 ; GFX11-NEXT: s_clause 0x1
2002 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2003 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
2004 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
2005 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2006 ; GFX11-NEXT: s_clause 0x1
2007 ; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
2008 ; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1]
2009 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2010 ; GFX11-NEXT: v_pk_min_u16 v1, v1, v3
2011 ; GFX11-NEXT: v_pk_min_u16 v0, v0, v2
2012 ; GFX11-NEXT: s_clause 0x1
2013 ; GFX11-NEXT: global_store_b16 v4, v1, s[4:5] offset:4
2014 ; GFX11-NEXT: global_store_b32 v4, v0, s[4:5]
2015 ; GFX11-NEXT: s_nop 0
2016 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2017 ; GFX11-NEXT: s_endpgm
2018 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2019 %a.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
2020 %b.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
2021 %out.gep = getelementptr inbounds <3 x i16>, ptr addrspace(1) %out, i32 %tid
2023 %a = load <3 x i16>, ptr addrspace(1) %a.gep
2024 %b = load <3 x i16>, ptr addrspace(1) %b.gep
2025 %cmp = icmp ule <3 x i16> %a, %b
2026 %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
2027 store <3 x i16> %val, ptr addrspace(1) %out.gep
2031 define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2032 ; EG-LABEL: s_test_umin_ule_i32:
2034 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2035 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2038 ; EG-NEXT: ALU clause starting at 4:
2039 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2040 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2041 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2043 ; CI-LABEL: s_test_umin_ule_i32:
2045 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2046 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2047 ; CI-NEXT: s_min_u32 s2, s2, s3
2048 ; CI-NEXT: v_mov_b32_e32 v0, s0
2049 ; CI-NEXT: v_mov_b32_e32 v1, s1
2050 ; CI-NEXT: v_mov_b32_e32 v2, s2
2051 ; CI-NEXT: flat_store_dword v[0:1], v2
2054 ; VI-LABEL: s_test_umin_ule_i32:
2056 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2057 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2058 ; VI-NEXT: s_min_u32 s2, s2, s3
2059 ; VI-NEXT: v_mov_b32_e32 v0, s0
2060 ; VI-NEXT: v_mov_b32_e32 v1, s1
2061 ; VI-NEXT: v_mov_b32_e32 v2, s2
2062 ; VI-NEXT: flat_store_dword v[0:1], v2
2065 ; GFX9-LABEL: s_test_umin_ule_i32:
2067 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2068 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2069 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2070 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2071 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2072 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2073 ; GFX9-NEXT: s_endpgm
2075 ; GFX10-LABEL: s_test_umin_ule_i32:
2077 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2078 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2079 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2080 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2081 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2082 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2083 ; GFX10-NEXT: s_endpgm
2085 ; GFX11-LABEL: s_test_umin_ule_i32:
2087 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
2088 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2089 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2090 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2091 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2092 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2093 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2094 ; GFX11-NEXT: s_nop 0
2095 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2096 ; GFX11-NEXT: s_endpgm
2097 %cmp = icmp ule i32 %a, %b
2098 %val = select i1 %cmp, i32 %a, i32 %b
2099 store i32 %val, ptr addrspace(1) %out, align 4
2103 define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2104 ; EG-LABEL: v_test_umin_ult_i32:
2106 ; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[]
2108 ; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[]
2109 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2112 ; EG-NEXT: Fetch clause starting at 6:
2113 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2114 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2115 ; EG-NEXT: ALU clause starting at 10:
2116 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
2117 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2118 ; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W,
2119 ; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W,
2120 ; EG-NEXT: ALU clause starting at 14:
2121 ; EG-NEXT: MIN_UINT T0.X, T0.X, T1.X,
2122 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
2123 ; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
2124 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2126 ; CI-LABEL: v_test_umin_ult_i32:
2128 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2129 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
2130 ; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
2131 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2132 ; CI-NEXT: v_mov_b32_e32 v1, s3
2133 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
2134 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2135 ; CI-NEXT: v_mov_b32_e32 v3, s5
2136 ; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
2137 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2138 ; CI-NEXT: flat_load_dword v5, v[0:1]
2139 ; CI-NEXT: flat_load_dword v2, v[2:3]
2140 ; CI-NEXT: v_mov_b32_e32 v1, s1
2141 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
2142 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2143 ; CI-NEXT: s_waitcnt vmcnt(0)
2144 ; CI-NEXT: v_min_u32_e32 v2, v5, v2
2145 ; CI-NEXT: flat_store_dword v[0:1], v2
2148 ; VI-LABEL: v_test_umin_ult_i32:
2150 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2151 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
2152 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
2153 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2154 ; VI-NEXT: v_mov_b32_e32 v1, s3
2155 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
2156 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2157 ; VI-NEXT: v_mov_b32_e32 v3, s5
2158 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
2159 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
2160 ; VI-NEXT: flat_load_dword v5, v[0:1]
2161 ; VI-NEXT: flat_load_dword v2, v[2:3]
2162 ; VI-NEXT: v_mov_b32_e32 v1, s1
2163 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
2164 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2165 ; VI-NEXT: s_waitcnt vmcnt(0)
2166 ; VI-NEXT: v_min_u32_e32 v2, v5, v2
2167 ; VI-NEXT: flat_store_dword v[0:1], v2
2170 ; GFX9-LABEL: v_test_umin_ult_i32:
2172 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2173 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
2174 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2175 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2176 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
2177 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
2178 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2179 ; GFX9-NEXT: v_min_u32_e32 v1, v1, v2
2180 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2181 ; GFX9-NEXT: s_endpgm
2183 ; GFX10-LABEL: v_test_umin_ult_i32:
2185 ; GFX10-NEXT: s_clause 0x1
2186 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2187 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
2188 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2189 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2190 ; GFX10-NEXT: s_clause 0x1
2191 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
2192 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
2193 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2194 ; GFX10-NEXT: v_min_u32_e32 v1, v1, v2
2195 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2196 ; GFX10-NEXT: s_endpgm
2198 ; GFX11-LABEL: v_test_umin_ult_i32:
2200 ; GFX11-NEXT: s_clause 0x1
2201 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2202 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
2203 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2204 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2205 ; GFX11-NEXT: s_clause 0x1
2206 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
2207 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
2208 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2209 ; GFX11-NEXT: v_min_u32_e32 v1, v1, v2
2210 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
2211 ; GFX11-NEXT: s_nop 0
2212 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2213 ; GFX11-NEXT: s_endpgm
2214 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2215 %a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr, i32 %tid
2216 %b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr, i32 %tid
2217 %out.gep = getelementptr inbounds i32, ptr addrspace(1) %out, i32 %tid
2218 %a = load i32, ptr addrspace(1) %a.gep, align 4
2219 %b = load i32, ptr addrspace(1) %b.gep, align 4
2220 %cmp = icmp ult i32 %a, %b
2221 %val = select i1 %cmp, i32 %a, i32 %b
2222 store i32 %val, ptr addrspace(1) %out.gep, align 4
2226 define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
2227 ; EG-LABEL: v_test_umin_ult_i8:
2229 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2231 ; EG-NEXT: ALU 12, @12, KC0[CB0:0-32], KC1[]
2232 ; EG-NEXT: MEM_RAT MSKOR T1.XW, T0.X
2235 ; EG-NEXT: Fetch clause starting at 6:
2236 ; EG-NEXT: VTX_READ_8 T2.X, T2.X, 0, #1
2237 ; EG-NEXT: VTX_READ_8 T1.X, T1.X, 0, #1
2238 ; EG-NEXT: ALU clause starting at 10:
2239 ; EG-NEXT: ADD_INT T1.X, KC0[2].Z, T0.X,
2240 ; EG-NEXT: ADD_INT * T2.X, KC0[2].W, T0.X,
2241 ; EG-NEXT: ALU clause starting at 12:
2242 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.X,
2243 ; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
2244 ; EG-NEXT: MIN_UINT * T2.W, T1.X, T2.X,
2245 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2246 ; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
2247 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2248 ; EG-NEXT: LSHL T1.X, T2.W, PV.W,
2249 ; EG-NEXT: LSHL * T1.W, literal.x, PV.W,
2250 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2251 ; EG-NEXT: MOV T1.Y, 0.0,
2252 ; EG-NEXT: MOV * T1.Z, 0.0,
2253 ; EG-NEXT: LSHR * T0.X, T0.W, literal.x,
2254 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2256 ; CI-LABEL: v_test_umin_ult_i8:
2258 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2259 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
2260 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2261 ; CI-NEXT: v_mov_b32_e32 v2, s3
2262 ; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
2263 ; CI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
2264 ; CI-NEXT: v_mov_b32_e32 v4, s5
2265 ; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v0
2266 ; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
2267 ; CI-NEXT: flat_load_ubyte v2, v[1:2]
2268 ; CI-NEXT: flat_load_ubyte v3, v[3:4]
2269 ; CI-NEXT: v_mov_b32_e32 v1, s1
2270 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
2271 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2272 ; CI-NEXT: s_waitcnt vmcnt(0)
2273 ; CI-NEXT: v_min_u32_e32 v2, v2, v3
2274 ; CI-NEXT: flat_store_byte v[0:1], v2
2277 ; VI-LABEL: v_test_umin_ult_i8:
2279 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2280 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
2281 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2282 ; VI-NEXT: v_mov_b32_e32 v2, s3
2283 ; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
2284 ; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
2285 ; VI-NEXT: v_mov_b32_e32 v4, s5
2286 ; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v0
2287 ; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
2288 ; VI-NEXT: flat_load_ubyte v2, v[1:2]
2289 ; VI-NEXT: flat_load_ubyte v3, v[3:4]
2290 ; VI-NEXT: v_mov_b32_e32 v1, s1
2291 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
2292 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
2293 ; VI-NEXT: s_waitcnt vmcnt(0)
2294 ; VI-NEXT: v_min_u16_e32 v2, v2, v3
2295 ; VI-NEXT: flat_store_byte v[0:1], v2
2298 ; GFX9-LABEL: v_test_umin_ult_i8:
2300 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2301 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
2302 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2303 ; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
2304 ; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7]
2305 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2306 ; GFX9-NEXT: v_min_u16_e32 v1, v1, v2
2307 ; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
2308 ; GFX9-NEXT: s_endpgm
2310 ; GFX10-LABEL: v_test_umin_ult_i8:
2312 ; GFX10-NEXT: s_clause 0x1
2313 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2314 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
2315 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2316 ; GFX10-NEXT: s_clause 0x1
2317 ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
2318 ; GFX10-NEXT: global_load_ubyte v2, v0, s[6:7]
2319 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2320 ; GFX10-NEXT: v_min_u16 v1, v1, v2
2321 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
2322 ; GFX10-NEXT: s_endpgm
2324 ; GFX11-LABEL: v_test_umin_ult_i8:
2326 ; GFX11-NEXT: s_clause 0x1
2327 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
2328 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
2329 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2330 ; GFX11-NEXT: s_clause 0x1
2331 ; GFX11-NEXT: global_load_u8 v1, v0, s[6:7]
2332 ; GFX11-NEXT: global_load_u8 v2, v0, s[0:1]
2333 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2334 ; GFX11-NEXT: v_min_u16 v1, v1, v2
2335 ; GFX11-NEXT: global_store_b8 v0, v1, s[4:5]
2336 ; GFX11-NEXT: s_nop 0
2337 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2338 ; GFX11-NEXT: s_endpgm
2339 %tid = call i32 @llvm.amdgcn.workitem.id.x()
2340 %a.gep = getelementptr inbounds i8, ptr addrspace(1) %a.ptr, i32 %tid
2341 %b.gep = getelementptr inbounds i8, ptr addrspace(1) %b.ptr, i32 %tid
2342 %out.gep = getelementptr inbounds i8, ptr addrspace(1) %out, i32 %tid
2344 %a = load i8, ptr addrspace(1) %a.gep, align 1
2345 %b = load i8, ptr addrspace(1) %b.gep, align 1
2346 %cmp = icmp ult i8 %a, %b
2347 %val = select i1 %cmp, i8 %a, i8 %b
2348 store i8 %val, ptr addrspace(1) %out.gep, align 1
2352 define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
2353 ; EG-LABEL: s_test_umin_ult_i32:
2355 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2356 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2359 ; EG-NEXT: ALU clause starting at 4:
2360 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2361 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2362 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2364 ; CI-LABEL: s_test_umin_ult_i32:
2366 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2367 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2368 ; CI-NEXT: s_min_u32 s2, s2, s3
2369 ; CI-NEXT: v_mov_b32_e32 v0, s0
2370 ; CI-NEXT: v_mov_b32_e32 v1, s1
2371 ; CI-NEXT: v_mov_b32_e32 v2, s2
2372 ; CI-NEXT: flat_store_dword v[0:1], v2
2375 ; VI-LABEL: s_test_umin_ult_i32:
2377 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2378 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2379 ; VI-NEXT: s_min_u32 s2, s2, s3
2380 ; VI-NEXT: v_mov_b32_e32 v0, s0
2381 ; VI-NEXT: v_mov_b32_e32 v1, s1
2382 ; VI-NEXT: v_mov_b32_e32 v2, s2
2383 ; VI-NEXT: flat_store_dword v[0:1], v2
2386 ; GFX9-LABEL: s_test_umin_ult_i32:
2388 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2389 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2390 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2391 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2392 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2393 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2394 ; GFX9-NEXT: s_endpgm
2396 ; GFX10-LABEL: s_test_umin_ult_i32:
2398 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2399 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2400 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2401 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2402 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2403 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2404 ; GFX10-NEXT: s_endpgm
2406 ; GFX11-LABEL: s_test_umin_ult_i32:
2408 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
2409 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2410 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2411 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2412 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2413 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2414 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2415 ; GFX11-NEXT: s_nop 0
2416 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2417 ; GFX11-NEXT: s_endpgm
2418 %cmp = icmp ult i32 %a, %b
2419 %val = select i1 %cmp, i32 %a, i32 %b
2420 store i32 %val, ptr addrspace(1) %out, align 4
2424 define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2425 ; EG-LABEL: v_test_umin_ult_i32_multi_use:
2427 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2429 ; EG-NEXT: ALU 16, @12, KC0[CB0:0-32], KC1[]
2430 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 0
2431 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
2433 ; EG-NEXT: Fetch clause starting at 6:
2434 ; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
2435 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
2436 ; EG-NEXT: ALU clause starting at 10:
2437 ; EG-NEXT: MOV T0.X, KC0[2].W,
2438 ; EG-NEXT: MOV * T1.X, KC0[3].X,
2439 ; EG-NEXT: ALU clause starting at 12:
2440 ; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
2441 ; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
2442 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2443 ; EG-NEXT: AND_INT T1.W, PS, 1,
2444 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2445 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2446 ; EG-NEXT: LSHL T2.X, PV.W, PS,
2447 ; EG-NEXT: LSHL * T2.W, literal.x, PS,
2448 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2449 ; EG-NEXT: MOV T2.Y, 0.0,
2450 ; EG-NEXT: MOV * T2.Z, 0.0,
2451 ; EG-NEXT: LSHR T3.X, KC0[2].Z, literal.x,
2452 ; EG-NEXT: SETGE_UINT * T0.W, T0.X, T1.X,
2453 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2454 ; EG-NEXT: CNDE_INT T0.X, PV.W, T0.X, T1.X,
2455 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2456 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2458 ; CI-LABEL: v_test_umin_ult_i32_multi_use:
2460 ; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2461 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2462 ; CI-NEXT: s_load_dword s4, s[4:5], 0x0
2463 ; CI-NEXT: s_load_dword s5, s[6:7], 0x0
2464 ; CI-NEXT: v_mov_b32_e32 v0, s0
2465 ; CI-NEXT: v_mov_b32_e32 v1, s1
2466 ; CI-NEXT: v_mov_b32_e32 v2, s2
2467 ; CI-NEXT: v_mov_b32_e32 v3, s3
2468 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2469 ; CI-NEXT: s_cmp_lt_u32 s4, s5
2470 ; CI-NEXT: s_cselect_b64 s[0:1], -1, 0
2471 ; CI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2472 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec
2473 ; CI-NEXT: s_cselect_b32 s0, s4, s5
2474 ; CI-NEXT: v_mov_b32_e32 v5, s0
2475 ; CI-NEXT: flat_store_dword v[0:1], v5
2476 ; CI-NEXT: flat_store_byte v[2:3], v4
2479 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
2481 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2482 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2483 ; VI-NEXT: s_load_dword s4, s[4:5], 0x0
2484 ; VI-NEXT: s_load_dword s5, s[6:7], 0x0
2485 ; VI-NEXT: v_mov_b32_e32 v0, s0
2486 ; VI-NEXT: v_mov_b32_e32 v1, s1
2487 ; VI-NEXT: v_mov_b32_e32 v2, s2
2488 ; VI-NEXT: v_mov_b32_e32 v3, s3
2489 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2490 ; VI-NEXT: s_cmp_lt_u32 s4, s5
2491 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
2492 ; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
2493 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
2494 ; VI-NEXT: s_cselect_b32 s0, s4, s5
2495 ; VI-NEXT: v_mov_b32_e32 v5, s0
2496 ; VI-NEXT: flat_store_dword v[0:1], v5
2497 ; VI-NEXT: flat_store_byte v[2:3], v4
2500 ; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
2502 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2503 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2504 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2505 ; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0
2506 ; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0
2507 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2508 ; GFX9-NEXT: s_cmp_lt_u32 s8, s9
2509 ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
2510 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
2511 ; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
2512 ; GFX9-NEXT: s_cselect_b32 s4, s8, s9
2513 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
2514 ; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
2515 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
2516 ; GFX9-NEXT: s_endpgm
2518 ; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
2520 ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2521 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
2522 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2523 ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0
2524 ; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0
2525 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2526 ; GFX10-NEXT: s_cmp_lt_u32 s8, s9
2527 ; GFX10-NEXT: s_cselect_b32 s4, -1, 0
2528 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
2529 ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
2530 ; GFX10-NEXT: s_cselect_b32 s4, s8, s9
2531 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
2532 ; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
2533 ; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
2534 ; GFX10-NEXT: s_endpgm
2536 ; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
2538 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
2539 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
2540 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2541 ; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
2542 ; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
2543 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2544 ; GFX11-NEXT: s_cmp_lt_u32 s4, s5
2545 ; GFX11-NEXT: s_cselect_b32 s6, -1, 0
2546 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
2547 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
2548 ; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
2549 ; GFX11-NEXT: s_cselect_b32 s4, s4, s5
2550 ; GFX11-NEXT: v_mov_b32_e32 v2, s4
2551 ; GFX11-NEXT: s_clause 0x1
2552 ; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
2553 ; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
2554 ; GFX11-NEXT: s_nop 0
2555 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2556 ; GFX11-NEXT: s_endpgm
2557 %a = load i32, ptr addrspace(1) %aptr, align 4
2558 %b = load i32, ptr addrspace(1) %bptr, align 4
2559 %cmp = icmp ult i32 %a, %b
2560 %val = select i1 %cmp, i32 %a, i32 %b
2561 store i32 %val, ptr addrspace(1) %out0, align 4
2562 store i1 %cmp, ptr addrspace(1) %out1
2566 define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) #0 {
2567 ; EG-LABEL: v_test_umin_ult_i16_multi_use:
2569 ; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
2571 ; EG-NEXT: ALU 24, @12, KC0[CB0:0-32], KC1[]
2572 ; EG-NEXT: MEM_RAT MSKOR T2.XW, T3.X
2573 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
2575 ; EG-NEXT: Fetch clause starting at 6:
2576 ; EG-NEXT: VTX_READ_16 T1.X, T1.X, 0, #1
2577 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
2578 ; EG-NEXT: ALU clause starting at 10:
2579 ; EG-NEXT: MOV T0.X, KC0[2].W,
2580 ; EG-NEXT: MOV * T1.X, KC0[3].X,
2581 ; EG-NEXT: ALU clause starting at 12:
2582 ; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
2583 ; EG-NEXT: SETGE_UINT * T1.W, T0.X, T1.X,
2584 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2585 ; EG-NEXT: CNDE_INT T1.W, PS, T0.X, T1.X,
2586 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2587 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2588 ; EG-NEXT: LSHL T2.X, PV.W, PS,
2589 ; EG-NEXT: LSHL * T2.W, literal.x, PS,
2590 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
2591 ; EG-NEXT: MOV T2.Y, 0.0,
2592 ; EG-NEXT: AND_INT T0.W, KC0[2].Z, literal.x,
2593 ; EG-NEXT: SETGT_UINT * T1.W, T1.X, T0.X,
2594 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2595 ; EG-NEXT: AND_INT T1.W, PS, 1,
2596 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
2597 ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
2598 ; EG-NEXT: LSHL T0.X, PV.W, PS,
2599 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
2600 ; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
2601 ; EG-NEXT: MOV T0.Y, 0.0,
2602 ; EG-NEXT: MOV T2.Z, 0.0,
2603 ; EG-NEXT: MOV * T0.Z, 0.0,
2604 ; EG-NEXT: LSHR T1.X, KC0[2].Z, literal.x,
2605 ; EG-NEXT: LSHR * T3.X, KC0[2].Y, literal.x,
2606 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2608 ; CI-LABEL: v_test_umin_ult_i16_multi_use:
2610 ; CI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2611 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2612 ; CI-NEXT: v_mov_b32_e32 v0, s4
2613 ; CI-NEXT: v_mov_b32_e32 v1, s5
2614 ; CI-NEXT: v_mov_b32_e32 v2, s6
2615 ; CI-NEXT: v_mov_b32_e32 v3, s7
2616 ; CI-NEXT: flat_load_ushort v4, v[0:1]
2617 ; CI-NEXT: flat_load_ushort v5, v[2:3]
2618 ; CI-NEXT: v_mov_b32_e32 v0, s0
2619 ; CI-NEXT: v_mov_b32_e32 v1, s1
2620 ; CI-NEXT: v_mov_b32_e32 v2, s2
2621 ; CI-NEXT: v_mov_b32_e32 v3, s3
2622 ; CI-NEXT: s_waitcnt vmcnt(0)
2623 ; CI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
2624 ; CI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
2625 ; CI-NEXT: flat_store_short v[0:1], v4
2626 ; CI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2627 ; CI-NEXT: flat_store_byte v[2:3], v0
2630 ; VI-LABEL: v_test_umin_ult_i16_multi_use:
2632 ; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2633 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2634 ; VI-NEXT: v_mov_b32_e32 v0, s4
2635 ; VI-NEXT: v_mov_b32_e32 v1, s5
2636 ; VI-NEXT: v_mov_b32_e32 v2, s6
2637 ; VI-NEXT: v_mov_b32_e32 v3, s7
2638 ; VI-NEXT: flat_load_ushort v4, v[0:1]
2639 ; VI-NEXT: flat_load_ushort v5, v[2:3]
2640 ; VI-NEXT: v_mov_b32_e32 v0, s0
2641 ; VI-NEXT: v_mov_b32_e32 v1, s1
2642 ; VI-NEXT: v_mov_b32_e32 v2, s2
2643 ; VI-NEXT: v_mov_b32_e32 v3, s3
2644 ; VI-NEXT: s_waitcnt vmcnt(0)
2645 ; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
2646 ; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
2647 ; VI-NEXT: flat_store_short v[0:1], v4
2648 ; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
2649 ; VI-NEXT: flat_store_byte v[2:3], v0
2652 ; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
2654 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2655 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2656 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2657 ; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
2658 ; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
2659 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2660 ; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
2661 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
2662 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
2663 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
2664 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
2665 ; GFX9-NEXT: s_endpgm
2667 ; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
2669 ; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
2670 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2671 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2672 ; GFX10-NEXT: s_clause 0x1
2673 ; GFX10-NEXT: global_load_ushort v1, v0, s[4:5]
2674 ; GFX10-NEXT: global_load_ushort v2, v0, s[6:7]
2675 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2676 ; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
2677 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2678 ; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2679 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
2680 ; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
2681 ; GFX10-NEXT: s_endpgm
2683 ; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
2685 ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
2686 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2687 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2688 ; GFX11-NEXT: s_clause 0x1
2689 ; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
2690 ; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
2691 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2692 ; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
2693 ; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
2694 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
2695 ; GFX11-NEXT: s_clause 0x1
2696 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
2697 ; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
2698 ; GFX11-NEXT: s_nop 0
2699 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2700 ; GFX11-NEXT: s_endpgm
2701 %a = load i16, ptr addrspace(1) %aptr, align 2
2702 %b = load i16, ptr addrspace(1) %bptr, align 2
2703 %cmp = icmp ult i16 %a, %b
2704 %val = select i1 %cmp, i16 %a, i16 %b
2705 store i16 %val, ptr addrspace(1) %out0, align 2
2706 store i1 %cmp, ptr addrspace(1) %out1
2710 define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32> %a, <1 x i32> %b) #0 {
2711 ; EG-LABEL: s_test_umin_ult_v1i32:
2713 ; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
2714 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2717 ; EG-NEXT: ALU clause starting at 4:
2718 ; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
2719 ; EG-NEXT: MIN_UINT * T1.X, KC0[2].Z, KC0[2].W,
2720 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2722 ; CI-LABEL: s_test_umin_ult_v1i32:
2724 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2725 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2726 ; CI-NEXT: s_min_u32 s2, s2, s3
2727 ; CI-NEXT: v_mov_b32_e32 v0, s0
2728 ; CI-NEXT: v_mov_b32_e32 v1, s1
2729 ; CI-NEXT: v_mov_b32_e32 v2, s2
2730 ; CI-NEXT: flat_store_dword v[0:1], v2
2733 ; VI-LABEL: s_test_umin_ult_v1i32:
2735 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2736 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2737 ; VI-NEXT: s_min_u32 s2, s2, s3
2738 ; VI-NEXT: v_mov_b32_e32 v0, s0
2739 ; VI-NEXT: v_mov_b32_e32 v1, s1
2740 ; VI-NEXT: v_mov_b32_e32 v2, s2
2741 ; VI-NEXT: flat_store_dword v[0:1], v2
2744 ; GFX9-LABEL: s_test_umin_ult_v1i32:
2746 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2747 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2748 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2749 ; GFX9-NEXT: s_min_u32 s2, s2, s3
2750 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2751 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
2752 ; GFX9-NEXT: s_endpgm
2754 ; GFX10-LABEL: s_test_umin_ult_v1i32:
2756 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
2757 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
2758 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2759 ; GFX10-NEXT: s_min_u32 s2, s2, s3
2760 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
2761 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
2762 ; GFX10-NEXT: s_endpgm
2764 ; GFX11-LABEL: s_test_umin_ult_v1i32:
2766 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
2767 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
2768 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2769 ; GFX11-NEXT: s_min_u32 s2, s2, s3
2770 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2771 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
2772 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
2773 ; GFX11-NEXT: s_nop 0
2774 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2775 ; GFX11-NEXT: s_endpgm
2776 %cmp = icmp ult <1 x i32> %a, %b
2777 %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
2778 store <1 x i32> %val, ptr addrspace(1) %out
2782 define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x i32> %b) #0 {
2783 ; EG-LABEL: s_test_umin_ult_v8i32:
2785 ; EG-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[]
2786 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
2787 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2789 ; EG-NEXT: ALU clause starting at 4:
2790 ; EG-NEXT: MIN_UINT * T0.W, KC0[5].X, KC0[7].X,
2791 ; EG-NEXT: MIN_UINT * T0.Z, KC0[4].W, KC0[6].W,
2792 ; EG-NEXT: MIN_UINT * T0.Y, KC0[4].Z, KC0[6].Z,
2793 ; EG-NEXT: MIN_UINT * T0.X, KC0[4].Y, KC0[6].Y,
2794 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
2795 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2796 ; EG-NEXT: MIN_UINT * T2.W, KC0[6].X, KC0[8].X,
2797 ; EG-NEXT: MIN_UINT * T2.Z, KC0[5].W, KC0[7].W,
2798 ; EG-NEXT: MIN_UINT * T2.Y, KC0[5].Z, KC0[7].Z,
2799 ; EG-NEXT: MIN_UINT * T2.X, KC0[5].Y, KC0[7].Y,
2800 ; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
2801 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
2802 ; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
2803 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
2805 ; CI-LABEL: s_test_umin_ult_v8i32:
2807 ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x8
2808 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2809 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2810 ; CI-NEXT: s_min_u32 s4, s11, s19
2811 ; CI-NEXT: s_min_u32 s5, s10, s18
2812 ; CI-NEXT: s_min_u32 s6, s9, s17
2813 ; CI-NEXT: s_min_u32 s7, s8, s16
2814 ; CI-NEXT: s_min_u32 s2, s15, s23
2815 ; CI-NEXT: s_min_u32 s3, s14, s22
2816 ; CI-NEXT: s_min_u32 s8, s13, s21
2817 ; CI-NEXT: s_min_u32 s9, s12, s20
2818 ; CI-NEXT: v_mov_b32_e32 v3, s2
2819 ; CI-NEXT: s_add_u32 s2, s0, 16
2820 ; CI-NEXT: v_mov_b32_e32 v2, s3
2821 ; CI-NEXT: s_addc_u32 s3, s1, 0
2822 ; CI-NEXT: v_mov_b32_e32 v5, s3
2823 ; CI-NEXT: v_mov_b32_e32 v0, s9
2824 ; CI-NEXT: v_mov_b32_e32 v1, s8
2825 ; CI-NEXT: v_mov_b32_e32 v4, s2
2826 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2827 ; CI-NEXT: v_mov_b32_e32 v5, s1
2828 ; CI-NEXT: v_mov_b32_e32 v0, s7
2829 ; CI-NEXT: v_mov_b32_e32 v1, s6
2830 ; CI-NEXT: v_mov_b32_e32 v2, s5
2831 ; CI-NEXT: v_mov_b32_e32 v3, s4
2832 ; CI-NEXT: v_mov_b32_e32 v4, s0
2833 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2836 ; VI-LABEL: s_test_umin_ult_v8i32:
2838 ; VI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20
2839 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2840 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2841 ; VI-NEXT: s_min_u32 s4, s11, s19
2842 ; VI-NEXT: s_min_u32 s5, s10, s18
2843 ; VI-NEXT: s_min_u32 s6, s9, s17
2844 ; VI-NEXT: s_min_u32 s7, s8, s16
2845 ; VI-NEXT: s_min_u32 s2, s15, s23
2846 ; VI-NEXT: s_min_u32 s3, s14, s22
2847 ; VI-NEXT: s_min_u32 s8, s13, s21
2848 ; VI-NEXT: s_min_u32 s9, s12, s20
2849 ; VI-NEXT: v_mov_b32_e32 v3, s2
2850 ; VI-NEXT: s_add_u32 s2, s0, 16
2851 ; VI-NEXT: v_mov_b32_e32 v2, s3
2852 ; VI-NEXT: s_addc_u32 s3, s1, 0
2853 ; VI-NEXT: v_mov_b32_e32 v5, s3
2854 ; VI-NEXT: v_mov_b32_e32 v0, s9
2855 ; VI-NEXT: v_mov_b32_e32 v1, s8
2856 ; VI-NEXT: v_mov_b32_e32 v4, s2
2857 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2858 ; VI-NEXT: v_mov_b32_e32 v5, s1
2859 ; VI-NEXT: v_mov_b32_e32 v0, s7
2860 ; VI-NEXT: v_mov_b32_e32 v1, s6
2861 ; VI-NEXT: v_mov_b32_e32 v2, s5
2862 ; VI-NEXT: v_mov_b32_e32 v3, s4
2863 ; VI-NEXT: v_mov_b32_e32 v4, s0
2864 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
2867 ; GFX9-LABEL: s_test_umin_ult_v8i32:
2869 ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20
2870 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2871 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
2872 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2873 ; GFX9-NEXT: s_min_u32 s4, s9, s17
2874 ; GFX9-NEXT: s_min_u32 s5, s8, s16
2875 ; GFX9-NEXT: s_min_u32 s6, s15, s23
2876 ; GFX9-NEXT: s_min_u32 s7, s14, s22
2877 ; GFX9-NEXT: s_min_u32 s8, s13, s21
2878 ; GFX9-NEXT: s_min_u32 s9, s12, s20
2879 ; GFX9-NEXT: s_min_u32 s2, s11, s19
2880 ; GFX9-NEXT: s_min_u32 s3, s10, s18
2881 ; GFX9-NEXT: v_mov_b32_e32 v0, s9
2882 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2883 ; GFX9-NEXT: v_mov_b32_e32 v2, s7
2884 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
2885 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
2886 ; GFX9-NEXT: s_nop 0
2887 ; GFX9-NEXT: v_mov_b32_e32 v0, s5
2888 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2889 ; GFX9-NEXT: v_mov_b32_e32 v2, s3
2890 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
2891 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
2892 ; GFX9-NEXT: s_endpgm
2894 ; GFX10-LABEL: s_test_umin_ult_v8i32:
2896 ; GFX10-NEXT: s_clause 0x1
2897 ; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x20
2898 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2899 ; GFX10-NEXT: v_mov_b32_e32 v8, 0
2900 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2901 ; GFX10-NEXT: s_min_u32 s4, s9, s17
2902 ; GFX10-NEXT: s_min_u32 s5, s8, s16
2903 ; GFX10-NEXT: s_min_u32 s6, s15, s23
2904 ; GFX10-NEXT: s_min_u32 s7, s14, s22
2905 ; GFX10-NEXT: s_min_u32 s8, s12, s20
2906 ; GFX10-NEXT: s_min_u32 s9, s13, s21
2907 ; GFX10-NEXT: s_min_u32 s2, s11, s19
2908 ; GFX10-NEXT: s_min_u32 s3, s10, s18
2909 ; GFX10-NEXT: v_mov_b32_e32 v0, s8
2910 ; GFX10-NEXT: v_mov_b32_e32 v1, s9
2911 ; GFX10-NEXT: v_mov_b32_e32 v2, s7
2912 ; GFX10-NEXT: v_mov_b32_e32 v3, s6
2913 ; GFX10-NEXT: v_mov_b32_e32 v4, s5
2914 ; GFX10-NEXT: v_mov_b32_e32 v5, s4
2915 ; GFX10-NEXT: v_mov_b32_e32 v6, s3
2916 ; GFX10-NEXT: v_mov_b32_e32 v7, s2
2917 ; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
2918 ; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
2919 ; GFX10-NEXT: s_endpgm
2921 ; GFX11-LABEL: s_test_umin_ult_v8i32:
2923 ; GFX11-NEXT: s_clause 0x1
2924 ; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x20
2925 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
2926 ; GFX11-NEXT: v_mov_b32_e32 v8, 0
2927 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2928 ; GFX11-NEXT: s_min_u32 s2, s7, s15
2929 ; GFX11-NEXT: s_min_u32 s3, s6, s14
2930 ; GFX11-NEXT: s_min_u32 s6, s11, s19
2931 ; GFX11-NEXT: s_min_u32 s7, s10, s18
2932 ; GFX11-NEXT: s_min_u32 s8, s8, s16
2933 ; GFX11-NEXT: s_min_u32 s9, s9, s17
2934 ; GFX11-NEXT: s_min_u32 s5, s5, s13
2935 ; GFX11-NEXT: s_min_u32 s4, s4, s12
2936 ; GFX11-NEXT: v_mov_b32_e32 v0, s8
2937 ; GFX11-NEXT: v_mov_b32_e32 v1, s9
2938 ; GFX11-NEXT: v_mov_b32_e32 v2, s7
2939 ; GFX11-NEXT: v_mov_b32_e32 v3, s6
2940 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
2941 ; GFX11-NEXT: v_mov_b32_e32 v5, s5
2942 ; GFX11-NEXT: v_mov_b32_e32 v6, s3
2943 ; GFX11-NEXT: v_mov_b32_e32 v7, s2
2944 ; GFX11-NEXT: s_clause 0x1
2945 ; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
2946 ; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
2947 ; GFX11-NEXT: s_nop 0
2948 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2949 ; GFX11-NEXT: s_endpgm
2950 %cmp = icmp ult <8 x i32> %a, %b
2951 %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
2952 store <8 x i32> %val, ptr addrspace(1) %out
2956 define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16> %a, <8 x i16> %b) #0 {
2957 ; EG-LABEL: s_test_umin_ult_v8i16:
2959 ; EG-NEXT: ALU 1, @52, KC0[], KC1[]
2960 ; EG-NEXT: TEX 1 @20
2961 ; EG-NEXT: ALU 9, @54, KC0[], KC1[]
2962 ; EG-NEXT: TEX 1 @24
2963 ; EG-NEXT: ALU 8, @64, KC0[], KC1[]
2964 ; EG-NEXT: TEX 1 @28
2965 ; EG-NEXT: ALU 10, @73, KC0[], KC1[]
2966 ; EG-NEXT: TEX 1 @32
2967 ; EG-NEXT: ALU 8, @84, KC0[], KC1[]
2968 ; EG-NEXT: TEX 1 @36
2969 ; EG-NEXT: ALU 10, @93, KC0[], KC1[]
2970 ; EG-NEXT: TEX 1 @40
2971 ; EG-NEXT: ALU 8, @104, KC0[], KC1[]
2972 ; EG-NEXT: TEX 1 @44
2973 ; EG-NEXT: ALU 10, @113, KC0[], KC1[]
2974 ; EG-NEXT: TEX 1 @48
2975 ; EG-NEXT: ALU 10, @124, KC0[CB0:0-32], KC1[]
2976 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2979 ; EG-NEXT: Fetch clause starting at 20:
2980 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 66, #3
2981 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 82, #3
2982 ; EG-NEXT: Fetch clause starting at 24:
2983 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 64, #3
2984 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 80, #3
2985 ; EG-NEXT: Fetch clause starting at 28:
2986 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 62, #3
2987 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 78, #3
2988 ; EG-NEXT: Fetch clause starting at 32:
2989 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 60, #3
2990 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 76, #3
2991 ; EG-NEXT: Fetch clause starting at 36:
2992 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 58, #3
2993 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 74, #3
2994 ; EG-NEXT: Fetch clause starting at 40:
2995 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 56, #3
2996 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 72, #3
2997 ; EG-NEXT: Fetch clause starting at 44:
2998 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 54, #3
2999 ; EG-NEXT: VTX_READ_16 T9.X, T7.X, 70, #3
3000 ; EG-NEXT: Fetch clause starting at 48:
3001 ; EG-NEXT: VTX_READ_16 T8.X, T7.X, 52, #3
3002 ; EG-NEXT: VTX_READ_16 T7.X, T7.X, 68, #3
3003 ; EG-NEXT: ALU clause starting at 52:
3004 ; EG-NEXT: MOV * T0.Y, T3.X,
3005 ; EG-NEXT: MOV * T7.X, 0.0,
3006 ; EG-NEXT: ALU clause starting at 54:
3007 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3008 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3009 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3010 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3011 ; EG-NEXT: LSHL T0.W, PV.W, literal.x,
3012 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
3013 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
3014 ; EG-NEXT: OR_INT * T0.W, PS, PV.W,
3015 ; EG-NEXT: MOV * T3.X, PV.W,
3016 ; EG-NEXT: MOV * T0.Y, PV.X,
3017 ; EG-NEXT: ALU clause starting at 64:
3018 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3019 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3020 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3021 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3022 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3023 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3024 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3025 ; EG-NEXT: MOV T3.X, PV.W,
3026 ; EG-NEXT: MOV * T0.Y, T2.X,
3027 ; EG-NEXT: ALU clause starting at 73:
3028 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3029 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3030 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3031 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3032 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3033 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3034 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3035 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3036 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3037 ; EG-NEXT: MOV * T2.X, PV.W,
3038 ; EG-NEXT: MOV * T0.Y, PV.X,
3039 ; EG-NEXT: ALU clause starting at 84:
3040 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3041 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3042 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3043 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3044 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3045 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3046 ; EG-NEXT: OR_INT * T7.Z, PV.W, PS,
3047 ; EG-NEXT: MOV T2.X, PV.Z,
3048 ; EG-NEXT: MOV * T0.Y, T5.X,
3049 ; EG-NEXT: ALU clause starting at 93:
3050 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3051 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3052 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3053 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3054 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3055 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3056 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3057 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3058 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3059 ; EG-NEXT: MOV * T5.X, PV.W,
3060 ; EG-NEXT: MOV * T0.Y, PV.X,
3061 ; EG-NEXT: ALU clause starting at 104:
3062 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3063 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3064 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3065 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.x,
3066 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3067 ; EG-NEXT: -65536(nan), 0(0.000000e+00)
3068 ; EG-NEXT: OR_INT * T0.W, PV.W, PS,
3069 ; EG-NEXT: MOV T5.X, PV.W,
3070 ; EG-NEXT: MOV * T0.Y, T4.X,
3071 ; EG-NEXT: ALU clause starting at 113:
3072 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3073 ; EG-NEXT: AND_INT * T1.W, T9.X, literal.x,
3074 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3075 ; EG-NEXT: MIN_UINT T0.W, PV.W, PS,
3076 ; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
3077 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3078 ; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
3079 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3080 ; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
3081 ; EG-NEXT: MOV * T4.X, PV.W,
3082 ; EG-NEXT: MOV * T0.Y, PV.X,
3083 ; EG-NEXT: ALU clause starting at 124:
3084 ; EG-NEXT: AND_INT T0.W, T8.X, literal.x,
3085 ; EG-NEXT: AND_INT * T1.W, T7.X, literal.x,
3086 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3087 ; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
3088 ; EG-NEXT: AND_INT T2.W, T0.Y, literal.y,
3089 ; EG-NEXT: MIN_UINT * T0.W, PV.W, PS,
3090 ; EG-NEXT: 2(2.802597e-45), -65536(nan)
3091 ; EG-NEXT: OR_INT * T7.X, PV.W, PS,
3092 ; EG-NEXT: MOV T4.X, PV.X,
3093 ; EG-NEXT: MOV * T7.W, T3.X,
3094 ; EG-NEXT: MOV * T7.Y, T5.X,
3096 ; CI-LABEL: s_test_umin_ult_v8i16:
3098 ; CI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x4
3099 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3100 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3101 ; CI-NEXT: s_lshr_b32 s2, s8, 16
3102 ; CI-NEXT: s_and_b32 s3, s8, 0xffff
3103 ; CI-NEXT: s_lshr_b32 s4, s9, 16
3104 ; CI-NEXT: s_and_b32 s5, s9, 0xffff
3105 ; CI-NEXT: s_lshr_b32 s6, s10, 16
3106 ; CI-NEXT: s_and_b32 s7, s10, 0xffff
3107 ; CI-NEXT: s_lshr_b32 s8, s11, 16
3108 ; CI-NEXT: s_and_b32 s9, s11, 0xffff
3109 ; CI-NEXT: s_lshr_b32 s10, s12, 16
3110 ; CI-NEXT: s_and_b32 s11, s12, 0xffff
3111 ; CI-NEXT: s_lshr_b32 s12, s13, 16
3112 ; CI-NEXT: s_lshr_b32 s16, s14, 16
3113 ; CI-NEXT: s_lshr_b32 s17, s15, 16
3114 ; CI-NEXT: s_and_b32 s13, s13, 0xffff
3115 ; CI-NEXT: s_and_b32 s14, s14, 0xffff
3116 ; CI-NEXT: s_and_b32 s15, s15, 0xffff
3117 ; CI-NEXT: s_min_u32 s8, s8, s17
3118 ; CI-NEXT: s_min_u32 s6, s6, s16
3119 ; CI-NEXT: s_min_u32 s4, s4, s12
3120 ; CI-NEXT: s_min_u32 s2, s2, s10
3121 ; CI-NEXT: s_min_u32 s9, s9, s15
3122 ; CI-NEXT: s_lshl_b32 s8, s8, 16
3123 ; CI-NEXT: s_min_u32 s7, s7, s14
3124 ; CI-NEXT: s_lshl_b32 s6, s6, 16
3125 ; CI-NEXT: s_min_u32 s5, s5, s13
3126 ; CI-NEXT: s_lshl_b32 s4, s4, 16
3127 ; CI-NEXT: s_min_u32 s3, s3, s11
3128 ; CI-NEXT: s_lshl_b32 s2, s2, 16
3129 ; CI-NEXT: s_or_b32 s8, s9, s8
3130 ; CI-NEXT: s_or_b32 s6, s7, s6
3131 ; CI-NEXT: s_or_b32 s4, s5, s4
3132 ; CI-NEXT: s_or_b32 s2, s3, s2
3133 ; CI-NEXT: v_mov_b32_e32 v5, s1
3134 ; CI-NEXT: v_mov_b32_e32 v0, s2
3135 ; CI-NEXT: v_mov_b32_e32 v1, s4
3136 ; CI-NEXT: v_mov_b32_e32 v2, s6
3137 ; CI-NEXT: v_mov_b32_e32 v3, s8
3138 ; CI-NEXT: v_mov_b32_e32 v4, s0
3139 ; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3142 ; VI-LABEL: s_test_umin_ult_v8i16:
3144 ; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
3145 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3146 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3147 ; VI-NEXT: s_lshr_b32 s2, s11, 16
3148 ; VI-NEXT: s_lshr_b32 s4, s10, 16
3149 ; VI-NEXT: s_and_b32 s5, s10, 0xffff
3150 ; VI-NEXT: s_lshr_b32 s10, s15, 16
3151 ; VI-NEXT: s_and_b32 s3, s11, 0xffff
3152 ; VI-NEXT: s_and_b32 s11, s15, 0xffff
3153 ; VI-NEXT: s_lshr_b32 s15, s14, 16
3154 ; VI-NEXT: s_min_u32 s2, s2, s10
3155 ; VI-NEXT: s_lshr_b32 s6, s9, 16
3156 ; VI-NEXT: s_and_b32 s7, s9, 0xffff
3157 ; VI-NEXT: s_lshr_b32 s9, s8, 16
3158 ; VI-NEXT: s_and_b32 s14, s14, 0xffff
3159 ; VI-NEXT: s_lshr_b32 s16, s13, 16
3160 ; VI-NEXT: s_lshr_b32 s17, s12, 16
3161 ; VI-NEXT: s_min_u32 s4, s4, s15
3162 ; VI-NEXT: s_min_u32 s3, s3, s11
3163 ; VI-NEXT: s_lshl_b32 s2, s2, 16
3164 ; VI-NEXT: s_and_b32 s8, s8, 0xffff
3165 ; VI-NEXT: s_and_b32 s13, s13, 0xffff
3166 ; VI-NEXT: s_and_b32 s12, s12, 0xffff
3167 ; VI-NEXT: s_min_u32 s9, s9, s17
3168 ; VI-NEXT: s_min_u32 s6, s6, s16
3169 ; VI-NEXT: s_min_u32 s5, s5, s14
3170 ; VI-NEXT: s_or_b32 s2, s3, s2
3171 ; VI-NEXT: s_lshl_b32 s3, s4, 16
3172 ; VI-NEXT: s_min_u32 s8, s8, s12
3173 ; VI-NEXT: s_min_u32 s7, s7, s13
3174 ; VI-NEXT: s_or_b32 s3, s5, s3
3175 ; VI-NEXT: s_lshl_b32 s4, s6, 16
3176 ; VI-NEXT: s_lshl_b32 s5, s9, 16
3177 ; VI-NEXT: s_or_b32 s4, s7, s4
3178 ; VI-NEXT: s_or_b32 s5, s8, s5
3179 ; VI-NEXT: v_mov_b32_e32 v5, s1
3180 ; VI-NEXT: v_mov_b32_e32 v0, s5
3181 ; VI-NEXT: v_mov_b32_e32 v1, s4
3182 ; VI-NEXT: v_mov_b32_e32 v2, s3
3183 ; VI-NEXT: v_mov_b32_e32 v3, s2
3184 ; VI-NEXT: v_mov_b32_e32 v4, s0
3185 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
3188 ; GFX9-LABEL: s_test_umin_ult_v8i16:
3190 ; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
3191 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3192 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
3193 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3194 ; GFX9-NEXT: v_mov_b32_e32 v0, s15
3195 ; GFX9-NEXT: v_mov_b32_e32 v1, s14
3196 ; GFX9-NEXT: v_pk_min_u16 v3, s11, v0
3197 ; GFX9-NEXT: v_mov_b32_e32 v0, s13
3198 ; GFX9-NEXT: v_pk_min_u16 v2, s10, v1
3199 ; GFX9-NEXT: v_pk_min_u16 v1, s9, v0
3200 ; GFX9-NEXT: v_mov_b32_e32 v0, s12
3201 ; GFX9-NEXT: v_pk_min_u16 v0, s8, v0
3202 ; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3203 ; GFX9-NEXT: s_endpgm
3205 ; GFX10-LABEL: s_test_umin_ult_v8i16:
3207 ; GFX10-NEXT: s_clause 0x1
3208 ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x10
3209 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3210 ; GFX10-NEXT: v_mov_b32_e32 v4, 0
3211 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3212 ; GFX10-NEXT: v_pk_min_u16 v3, s11, s15
3213 ; GFX10-NEXT: v_pk_min_u16 v2, s10, s14
3214 ; GFX10-NEXT: v_pk_min_u16 v1, s9, s13
3215 ; GFX10-NEXT: v_pk_min_u16 v0, s8, s12
3216 ; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
3217 ; GFX10-NEXT: s_endpgm
3219 ; GFX11-LABEL: s_test_umin_ult_v8i16:
3221 ; GFX11-NEXT: s_clause 0x1
3222 ; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x10
3223 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
3224 ; GFX11-NEXT: v_mov_b32_e32 v4, 0
3225 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3226 ; GFX11-NEXT: v_pk_min_u16 v3, s7, s11
3227 ; GFX11-NEXT: v_pk_min_u16 v2, s6, s10
3228 ; GFX11-NEXT: v_pk_min_u16 v1, s5, s9
3229 ; GFX11-NEXT: v_pk_min_u16 v0, s4, s8
3230 ; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
3231 ; GFX11-NEXT: s_nop 0
3232 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3233 ; GFX11-NEXT: s_endpgm
3234 %cmp = icmp ult <8 x i16> %a, %b
3235 %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
3236 store <8 x i16> %val, ptr addrspace(1) %out
3240 ; Make sure redundant and removed
3242 define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspace(1) %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
3243 ; EG-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3245 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3247 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
3248 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3251 ; EG-NEXT: Fetch clause starting at 6:
3252 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
3253 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
3254 ; EG-NEXT: ALU clause starting at 10:
3255 ; EG-NEXT: MOV * T0.X, 0.0,
3256 ; EG-NEXT: ALU clause starting at 11:
3257 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3258 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3259 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3260 ; EG-NEXT: MIN_UINT T0.X, PV.Z, PV.W,
3261 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3262 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3264 ; CI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3266 ; CI-NEXT: s_load_dword s2, s[4:5], 0xa
3267 ; CI-NEXT: s_load_dword s3, s[4:5], 0x13
3268 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3269 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3270 ; CI-NEXT: s_and_b32 s2, s2, 0xffff
3271 ; CI-NEXT: s_and_b32 s3, s3, 0xffff
3272 ; CI-NEXT: s_min_u32 s2, s2, s3
3273 ; CI-NEXT: v_mov_b32_e32 v0, s0
3274 ; CI-NEXT: v_mov_b32_e32 v1, s1
3275 ; CI-NEXT: v_mov_b32_e32 v2, s2
3276 ; CI-NEXT: flat_store_dword v[0:1], v2
3279 ; VI-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3281 ; VI-NEXT: s_load_dword s2, s[4:5], 0x28
3282 ; VI-NEXT: s_load_dword s3, s[4:5], 0x4c
3283 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3284 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3285 ; VI-NEXT: s_and_b32 s2, s2, 0xffff
3286 ; VI-NEXT: s_and_b32 s3, s3, 0xffff
3287 ; VI-NEXT: s_min_u32 s2, s2, s3
3288 ; VI-NEXT: v_mov_b32_e32 v0, s0
3289 ; VI-NEXT: v_mov_b32_e32 v1, s1
3290 ; VI-NEXT: v_mov_b32_e32 v2, s2
3291 ; VI-NEXT: flat_store_dword v[0:1], v2
3294 ; GFX9-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3296 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28
3297 ; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c
3298 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3299 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3301 ; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
3302 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff
3303 ; GFX9-NEXT: s_min_u32 s2, s2, s3
3304 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3305 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3306 ; GFX9-NEXT: s_endpgm
3308 ; GFX10-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3310 ; GFX10-NEXT: s_clause 0x2
3311 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28
3312 ; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c
3313 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3314 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3315 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3316 ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
3317 ; GFX10-NEXT: s_and_b32 s3, s3, 0xffff
3318 ; GFX10-NEXT: s_min_u32 s2, s2, s3
3319 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3320 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
3321 ; GFX10-NEXT: s_endpgm
3323 ; GFX11-LABEL: simplify_demanded_bits_test_umin_ult_i16:
3325 ; GFX11-NEXT: s_clause 0x2
3326 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28
3327 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c
3328 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
3329 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3330 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3331 ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
3332 ; GFX11-NEXT: s_and_b32 s3, s3, 0xffff
3333 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3334 ; GFX11-NEXT: s_min_u32 s2, s2, s3
3335 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3336 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3337 ; GFX11-NEXT: s_nop 0
3338 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3339 ; GFX11-NEXT: s_endpgm
3340 %a.ext = zext i16 %a to i32
3341 %b.ext = zext i16 %b to i32
3342 %cmp = icmp ult i32 %a.ext, %b.ext
3343 %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3344 %mask = and i32 %val, 65535
3345 store i32 %mask, ptr addrspace(1) %out
3349 ; Make sure redundant sign_extend_inreg removed.
3351 define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace(1) %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
3352 ; EG-LABEL: simplify_demanded_bits_test_min_slt_i16:
3354 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3356 ; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
3357 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
3360 ; EG-NEXT: Fetch clause starting at 6:
3361 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 72, #3
3362 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 108, #3
3363 ; EG-NEXT: ALU clause starting at 10:
3364 ; EG-NEXT: MOV * T0.X, 0.0,
3365 ; EG-NEXT: ALU clause starting at 11:
3366 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3367 ; EG-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3368 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
3369 ; EG-NEXT: MIN_INT T0.X, PV.Z, PV.W,
3370 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3371 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3373 ; CI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3375 ; CI-NEXT: s_load_dword s2, s[4:5], 0xa
3376 ; CI-NEXT: s_load_dword s3, s[4:5], 0x13
3377 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3378 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3379 ; CI-NEXT: s_sext_i32_i16 s2, s2
3380 ; CI-NEXT: s_sext_i32_i16 s3, s3
3381 ; CI-NEXT: s_min_i32 s2, s2, s3
3382 ; CI-NEXT: v_mov_b32_e32 v0, s0
3383 ; CI-NEXT: v_mov_b32_e32 v1, s1
3384 ; CI-NEXT: v_mov_b32_e32 v2, s2
3385 ; CI-NEXT: flat_store_dword v[0:1], v2
3388 ; VI-LABEL: simplify_demanded_bits_test_min_slt_i16:
3390 ; VI-NEXT: s_load_dword s2, s[4:5], 0x28
3391 ; VI-NEXT: s_load_dword s3, s[4:5], 0x4c
3392 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3393 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3394 ; VI-NEXT: s_sext_i32_i16 s2, s2
3395 ; VI-NEXT: s_sext_i32_i16 s3, s3
3396 ; VI-NEXT: s_min_i32 s2, s2, s3
3397 ; VI-NEXT: v_mov_b32_e32 v0, s0
3398 ; VI-NEXT: v_mov_b32_e32 v1, s1
3399 ; VI-NEXT: v_mov_b32_e32 v2, s2
3400 ; VI-NEXT: flat_store_dword v[0:1], v2
3403 ; GFX9-LABEL: simplify_demanded_bits_test_min_slt_i16:
3405 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x28
3406 ; GFX9-NEXT: s_load_dword s3, s[4:5], 0x4c
3407 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3408 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3409 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3410 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
3411 ; GFX9-NEXT: s_sext_i32_i16 s3, s3
3412 ; GFX9-NEXT: s_min_i32 s2, s2, s3
3413 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3414 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
3415 ; GFX9-NEXT: s_endpgm
3417 ; GFX10-LABEL: simplify_demanded_bits_test_min_slt_i16:
3419 ; GFX10-NEXT: s_clause 0x2
3420 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x28
3421 ; GFX10-NEXT: s_load_dword s3, s[4:5], 0x4c
3422 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3423 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3424 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3425 ; GFX10-NEXT: s_sext_i32_i16 s2, s2
3426 ; GFX10-NEXT: s_sext_i32_i16 s3, s3
3427 ; GFX10-NEXT: s_min_i32 s2, s2, s3
3428 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3429 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
3430 ; GFX10-NEXT: s_endpgm
3432 ; GFX11-LABEL: simplify_demanded_bits_test_min_slt_i16:
3434 ; GFX11-NEXT: s_clause 0x2
3435 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x28
3436 ; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x4c
3437 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
3438 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3439 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3440 ; GFX11-NEXT: s_sext_i32_i16 s2, s2
3441 ; GFX11-NEXT: s_sext_i32_i16 s3, s3
3442 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3443 ; GFX11-NEXT: s_min_i32 s2, s2, s3
3444 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3445 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
3446 ; GFX11-NEXT: s_nop 0
3447 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3448 ; GFX11-NEXT: s_endpgm
3449 %a.ext = sext i16 %a to i32
3450 %b.ext = sext i16 %b to i32
3451 %cmp = icmp slt i32 %a.ext, %b.ext
3452 %val = select i1 %cmp, i32 %a.ext, i32 %b.ext
3453 %shl = shl i32 %val, 16
3454 %sextinreg = ashr i32 %shl, 16
3455 store i32 %sextinreg, ptr addrspace(1) %out
3459 define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i16 %b) #0 {
3460 ; EG-LABEL: s_test_imin_sle_i16:
3462 ; EG-NEXT: ALU 0, @10, KC0[], KC1[]
3464 ; EG-NEXT: ALU 14, @11, KC0[CB0:0-32], KC1[]
3465 ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
3468 ; EG-NEXT: Fetch clause starting at 6:
3469 ; EG-NEXT: VTX_READ_16 T1.X, T0.X, 40, #3
3470 ; EG-NEXT: VTX_READ_16 T0.X, T0.X, 42, #3
3471 ; EG-NEXT: ALU clause starting at 10:
3472 ; EG-NEXT: MOV * T0.X, 0.0,
3473 ; EG-NEXT: ALU clause starting at 11:
3474 ; EG-NEXT: BFE_INT T0.Z, T1.X, 0.0, literal.x,
3475 ; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
3476 ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
3477 ; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
3478 ; EG-NEXT: MIN_INT * T0.W, PV.Z, PV.W,
3479 ; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
3480 ; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
3481 ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
3482 ; EG-NEXT: LSHL T0.X, PV.W, PS,
3483 ; EG-NEXT: LSHL * T0.W, literal.x, PS,
3484 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
3485 ; EG-NEXT: MOV T0.Y, 0.0,
3486 ; EG-NEXT: MOV * T0.Z, 0.0,
3487 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3488 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3490 ; CI-LABEL: s_test_imin_sle_i16:
3492 ; CI-NEXT: s_load_dword s2, s[4:5], 0x2
3493 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3494 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3495 ; CI-NEXT: s_sext_i32_i16 s3, s2
3496 ; CI-NEXT: s_ashr_i32 s2, s2, 16
3497 ; CI-NEXT: s_min_i32 s2, s3, s2
3498 ; CI-NEXT: v_mov_b32_e32 v0, s0
3499 ; CI-NEXT: v_mov_b32_e32 v1, s1
3500 ; CI-NEXT: v_mov_b32_e32 v2, s2
3501 ; CI-NEXT: flat_store_short v[0:1], v2
3504 ; VI-LABEL: s_test_imin_sle_i16:
3506 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8
3507 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3508 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3509 ; VI-NEXT: s_sext_i32_i16 s3, s2
3510 ; VI-NEXT: s_ashr_i32 s2, s2, 16
3511 ; VI-NEXT: s_min_i32 s2, s3, s2
3512 ; VI-NEXT: v_mov_b32_e32 v0, s0
3513 ; VI-NEXT: v_mov_b32_e32 v1, s1
3514 ; VI-NEXT: v_mov_b32_e32 v2, s2
3515 ; VI-NEXT: flat_store_short v[0:1], v2
3518 ; GFX9-LABEL: s_test_imin_sle_i16:
3520 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
3521 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3522 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3523 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3524 ; GFX9-NEXT: s_sext_i32_i16 s3, s2
3525 ; GFX9-NEXT: s_ashr_i32 s2, s2, 16
3526 ; GFX9-NEXT: s_min_i32 s2, s3, s2
3527 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3528 ; GFX9-NEXT: global_store_short v0, v1, s[0:1]
3529 ; GFX9-NEXT: s_endpgm
3531 ; GFX10-LABEL: s_test_imin_sle_i16:
3533 ; GFX10-NEXT: s_clause 0x1
3534 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8
3535 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3536 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3537 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3538 ; GFX10-NEXT: s_sext_i32_i16 s3, s2
3539 ; GFX10-NEXT: s_ashr_i32 s2, s2, 16
3540 ; GFX10-NEXT: s_min_i32 s2, s3, s2
3541 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
3542 ; GFX10-NEXT: global_store_short v0, v1, s[0:1]
3543 ; GFX10-NEXT: s_endpgm
3545 ; GFX11-LABEL: s_test_imin_sle_i16:
3547 ; GFX11-NEXT: s_clause 0x1
3548 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x8
3549 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
3550 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3551 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3552 ; GFX11-NEXT: s_sext_i32_i16 s3, s2
3553 ; GFX11-NEXT: s_ashr_i32 s2, s2, 16
3554 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3555 ; GFX11-NEXT: s_min_i32 s2, s3, s2
3556 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
3557 ; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
3558 ; GFX11-NEXT: s_nop 0
3559 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3560 ; GFX11-NEXT: s_endpgm
3561 %cmp = icmp sle i16 %a, %b
3562 %val = select i1 %cmp, i16 %a, i16 %b
3563 store i16 %val, ptr addrspace(1) %out
3569 define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3570 ; EG-LABEL: test_umin_ult_i64:
3572 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3573 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3576 ; EG-NEXT: ALU clause starting at 4:
3577 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3578 ; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3579 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3580 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3581 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3582 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3583 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3584 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3586 ; CI-LABEL: test_umin_ult_i64:
3588 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3589 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
3590 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3591 ; CI-NEXT: v_mov_b32_e32 v0, s0
3592 ; CI-NEXT: v_mov_b32_e32 v1, s4
3593 ; CI-NEXT: v_mov_b32_e32 v2, s5
3594 ; CI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3595 ; CI-NEXT: v_mov_b32_e32 v1, s1
3596 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3597 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3598 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3599 ; CI-NEXT: v_mov_b32_e32 v2, s1
3600 ; CI-NEXT: v_mov_b32_e32 v3, s0
3601 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3604 ; VI-LABEL: test_umin_ult_i64:
3606 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3607 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
3608 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3609 ; VI-NEXT: v_mov_b32_e32 v0, s0
3610 ; VI-NEXT: v_mov_b32_e32 v1, s4
3611 ; VI-NEXT: v_mov_b32_e32 v2, s5
3612 ; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[1:2]
3613 ; VI-NEXT: v_mov_b32_e32 v1, s1
3614 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3615 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3616 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3617 ; VI-NEXT: v_mov_b32_e32 v2, s1
3618 ; VI-NEXT: v_mov_b32_e32 v3, s0
3619 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3622 ; GFX9-LABEL: test_umin_ult_i64:
3624 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3625 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3626 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3627 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3628 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3629 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3630 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
3631 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
3632 ; GFX9-NEXT: s_cselect_b32 s3, s3, s7
3633 ; GFX9-NEXT: s_cselect_b32 s2, s2, s6
3634 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3635 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3636 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3637 ; GFX9-NEXT: s_endpgm
3639 ; GFX10-LABEL: test_umin_ult_i64:
3641 ; GFX10-NEXT: s_clause 0x1
3642 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3643 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3644 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3645 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3646 ; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[2:3], s[6:7]
3647 ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
3648 ; GFX10-NEXT: s_cselect_b32 s2, s2, s6
3649 ; GFX10-NEXT: s_cselect_b32 s3, s3, s7
3650 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3651 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3652 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3653 ; GFX10-NEXT: s_endpgm
3655 ; GFX11-LABEL: test_umin_ult_i64:
3657 ; GFX11-NEXT: s_clause 0x1
3658 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
3659 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
3660 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3661 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3662 ; GFX11-NEXT: v_cmp_lt_u64_e64 s2, s[6:7], s[0:1]
3663 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3664 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
3665 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
3666 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
3667 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
3668 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3669 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
3670 ; GFX11-NEXT: s_nop 0
3671 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3672 ; GFX11-NEXT: s_endpgm
3673 %tmp = icmp ult i64 %a, %b
3674 %val = select i1 %tmp, i64 %a, i64 %b
3675 store i64 %val, ptr addrspace(1) %out, align 8
3679 define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3680 ; EG-LABEL: test_umin_ule_i64:
3682 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3683 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3686 ; EG-NEXT: ALU clause starting at 4:
3687 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3688 ; EG-NEXT: SETGT_UINT * T0.W, KC0[3].Z, KC0[3].X,
3689 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3690 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3691 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3692 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3693 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3694 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3696 ; CI-LABEL: test_umin_ule_i64:
3698 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3699 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
3700 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3701 ; CI-NEXT: v_mov_b32_e32 v0, s0
3702 ; CI-NEXT: v_mov_b32_e32 v1, s4
3703 ; CI-NEXT: v_mov_b32_e32 v2, s5
3704 ; CI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3705 ; CI-NEXT: v_mov_b32_e32 v1, s1
3706 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3707 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3708 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3709 ; CI-NEXT: v_mov_b32_e32 v2, s1
3710 ; CI-NEXT: v_mov_b32_e32 v3, s0
3711 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3714 ; VI-LABEL: test_umin_ule_i64:
3716 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3717 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
3718 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3719 ; VI-NEXT: v_mov_b32_e32 v0, s0
3720 ; VI-NEXT: v_mov_b32_e32 v1, s4
3721 ; VI-NEXT: v_mov_b32_e32 v2, s5
3722 ; VI-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[1:2]
3723 ; VI-NEXT: v_mov_b32_e32 v1, s1
3724 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3725 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3726 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3727 ; VI-NEXT: v_mov_b32_e32 v2, s1
3728 ; VI-NEXT: v_mov_b32_e32 v3, s0
3729 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3732 ; GFX9-LABEL: test_umin_ule_i64:
3734 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3735 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3736 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3737 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3738 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3739 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3740 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, s[2:3], v[0:1]
3741 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
3742 ; GFX9-NEXT: s_cselect_b32 s3, s3, s7
3743 ; GFX9-NEXT: s_cselect_b32 s2, s2, s6
3744 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3745 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3746 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3747 ; GFX9-NEXT: s_endpgm
3749 ; GFX10-LABEL: test_umin_ule_i64:
3751 ; GFX10-NEXT: s_clause 0x1
3752 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3753 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3754 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3755 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3756 ; GFX10-NEXT: v_cmp_le_u64_e64 s4, s[2:3], s[6:7]
3757 ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
3758 ; GFX10-NEXT: s_cselect_b32 s2, s2, s6
3759 ; GFX10-NEXT: s_cselect_b32 s3, s3, s7
3760 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3761 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3762 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3763 ; GFX10-NEXT: s_endpgm
3765 ; GFX11-LABEL: test_umin_ule_i64:
3767 ; GFX11-NEXT: s_clause 0x1
3768 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
3769 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
3770 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3771 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3772 ; GFX11-NEXT: v_cmp_le_u64_e64 s2, s[6:7], s[0:1]
3773 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3774 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
3775 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
3776 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
3777 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
3778 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3779 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
3780 ; GFX11-NEXT: s_nop 0
3781 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3782 ; GFX11-NEXT: s_endpgm
3783 %tmp = icmp ule i64 %a, %b
3784 %val = select i1 %tmp, i64 %a, i64 %b
3785 store i64 %val, ptr addrspace(1) %out, align 8
3789 define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3790 ; EG-LABEL: test_imin_slt_i64:
3792 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3793 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3796 ; EG-NEXT: ALU clause starting at 4:
3797 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3798 ; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3799 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3800 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3801 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3802 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3803 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3804 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3806 ; CI-LABEL: test_imin_slt_i64:
3808 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3809 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
3810 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3811 ; CI-NEXT: v_mov_b32_e32 v0, s0
3812 ; CI-NEXT: v_mov_b32_e32 v1, s4
3813 ; CI-NEXT: v_mov_b32_e32 v2, s5
3814 ; CI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3815 ; CI-NEXT: v_mov_b32_e32 v1, s1
3816 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3817 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3818 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3819 ; CI-NEXT: v_mov_b32_e32 v2, s1
3820 ; CI-NEXT: v_mov_b32_e32 v3, s0
3821 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3824 ; VI-LABEL: test_imin_slt_i64:
3826 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3827 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
3828 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3829 ; VI-NEXT: v_mov_b32_e32 v0, s0
3830 ; VI-NEXT: v_mov_b32_e32 v1, s4
3831 ; VI-NEXT: v_mov_b32_e32 v2, s5
3832 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
3833 ; VI-NEXT: v_mov_b32_e32 v1, s1
3834 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3835 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3836 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3837 ; VI-NEXT: v_mov_b32_e32 v2, s1
3838 ; VI-NEXT: v_mov_b32_e32 v3, s0
3839 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3842 ; GFX9-LABEL: test_imin_slt_i64:
3844 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3845 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3846 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3847 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3848 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3849 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3850 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
3851 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
3852 ; GFX9-NEXT: s_cselect_b32 s3, s3, s7
3853 ; GFX9-NEXT: s_cselect_b32 s2, s2, s6
3854 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3855 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3856 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3857 ; GFX9-NEXT: s_endpgm
3859 ; GFX10-LABEL: test_imin_slt_i64:
3861 ; GFX10-NEXT: s_clause 0x1
3862 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3863 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3864 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3865 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3866 ; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[2:3], s[6:7]
3867 ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
3868 ; GFX10-NEXT: s_cselect_b32 s2, s2, s6
3869 ; GFX10-NEXT: s_cselect_b32 s3, s3, s7
3870 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3871 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3872 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3873 ; GFX10-NEXT: s_endpgm
3875 ; GFX11-LABEL: test_imin_slt_i64:
3877 ; GFX11-NEXT: s_clause 0x1
3878 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
3879 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
3880 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3881 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3882 ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[0:1]
3883 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3884 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
3885 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
3886 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
3887 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
3888 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3889 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
3890 ; GFX11-NEXT: s_nop 0
3891 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3892 ; GFX11-NEXT: s_endpgm
3893 %tmp = icmp slt i64 %a, %b
3894 %val = select i1 %tmp, i64 %a, i64 %b
3895 store i64 %val, ptr addrspace(1) %out, align 8
3899 define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
3900 ; EG-LABEL: test_imin_sle_i64:
3902 ; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[]
3903 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
3906 ; EG-NEXT: ALU clause starting at 4:
3907 ; EG-NEXT: SETE_INT T0.Z, KC0[3].X, KC0[3].Z,
3908 ; EG-NEXT: SETGT_INT * T0.W, KC0[3].Z, KC0[3].X,
3909 ; EG-NEXT: SETGT_UINT * T1.W, KC0[3].Y, KC0[2].W,
3910 ; EG-NEXT: CNDE_INT * T0.W, T0.Z, T0.W, PV.W,
3911 ; EG-NEXT: CNDE_INT * T0.Y, PV.W, KC0[3].Z, KC0[3].X,
3912 ; EG-NEXT: CNDE_INT * T0.X, T0.W, KC0[3].Y, KC0[2].W,
3913 ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
3914 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
3916 ; CI-LABEL: test_imin_sle_i64:
3918 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3919 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
3920 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3921 ; CI-NEXT: v_mov_b32_e32 v0, s0
3922 ; CI-NEXT: v_mov_b32_e32 v1, s4
3923 ; CI-NEXT: v_mov_b32_e32 v2, s5
3924 ; CI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3925 ; CI-NEXT: v_mov_b32_e32 v1, s1
3926 ; CI-NEXT: s_and_b64 s[0:1], vcc, exec
3927 ; CI-NEXT: s_cselect_b32 s0, s3, s5
3928 ; CI-NEXT: s_cselect_b32 s1, s2, s4
3929 ; CI-NEXT: v_mov_b32_e32 v2, s1
3930 ; CI-NEXT: v_mov_b32_e32 v3, s0
3931 ; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3934 ; VI-LABEL: test_imin_sle_i64:
3936 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3937 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
3938 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3939 ; VI-NEXT: v_mov_b32_e32 v0, s0
3940 ; VI-NEXT: v_mov_b32_e32 v1, s4
3941 ; VI-NEXT: v_mov_b32_e32 v2, s5
3942 ; VI-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[1:2]
3943 ; VI-NEXT: v_mov_b32_e32 v1, s1
3944 ; VI-NEXT: s_and_b64 s[0:1], vcc, exec
3945 ; VI-NEXT: s_cselect_b32 s0, s3, s5
3946 ; VI-NEXT: s_cselect_b32 s1, s2, s4
3947 ; VI-NEXT: v_mov_b32_e32 v2, s1
3948 ; VI-NEXT: v_mov_b32_e32 v3, s0
3949 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
3952 ; GFX9-LABEL: test_imin_sle_i64:
3954 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3955 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3956 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3957 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3958 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3959 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3960 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, s[2:3], v[0:1]
3961 ; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
3962 ; GFX9-NEXT: s_cselect_b32 s3, s3, s7
3963 ; GFX9-NEXT: s_cselect_b32 s2, s2, s6
3964 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3965 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3966 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3967 ; GFX9-NEXT: s_endpgm
3969 ; GFX10-LABEL: test_imin_sle_i64:
3971 ; GFX10-NEXT: s_clause 0x1
3972 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
3973 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
3974 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3975 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3976 ; GFX10-NEXT: v_cmp_le_i64_e64 s4, s[2:3], s[6:7]
3977 ; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
3978 ; GFX10-NEXT: s_cselect_b32 s2, s2, s6
3979 ; GFX10-NEXT: s_cselect_b32 s3, s3, s7
3980 ; GFX10-NEXT: v_mov_b32_e32 v0, s2
3981 ; GFX10-NEXT: v_mov_b32_e32 v1, s3
3982 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
3983 ; GFX10-NEXT: s_endpgm
3985 ; GFX11-LABEL: test_imin_sle_i64:
3987 ; GFX11-NEXT: s_clause 0x1
3988 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
3989 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
3990 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
3991 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3992 ; GFX11-NEXT: v_cmp_le_i64_e64 s2, s[6:7], s[0:1]
3993 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3994 ; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
3995 ; GFX11-NEXT: s_cselect_b32 s0, s6, s0
3996 ; GFX11-NEXT: s_cselect_b32 s1, s7, s1
3997 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
3998 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
3999 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
4000 ; GFX11-NEXT: s_nop 0
4001 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4002 ; GFX11-NEXT: s_endpgm
4003 %tmp = icmp sle i64 %a, %b
4004 %val = select i1 %tmp, i64 %a, i64 %b
4005 store i64 %val, ptr addrspace(1) %out, align 8
4009 define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
4010 ; EG-LABEL: v_test_imin_sle_v2i16:
4012 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
4014 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
4015 ; EG-NEXT: TEX 0 @10
4016 ; EG-NEXT: ALU 16, @16, KC0[CB0:0-32], KC1[]
4017 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
4020 ; EG-NEXT: Fetch clause starting at 8:
4021 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
4022 ; EG-NEXT: Fetch clause starting at 10:
4023 ; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
4024 ; EG-NEXT: ALU clause starting at 12:
4025 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
4026 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4027 ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W,
4028 ; EG-NEXT: ALU clause starting at 15:
4029 ; EG-NEXT: ADD_INT * T7.X, KC0[2].W, T0.W,
4030 ; EG-NEXT: ALU clause starting at 16:
4031 ; EG-NEXT: LSHR T1.W, T0.X, literal.x,
4032 ; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
4033 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4034 ; EG-NEXT: BFE_INT T8.X, PS, 0.0, literal.x,
4035 ; EG-NEXT: BFE_INT T0.Y, PV.W, 0.0, literal.x,
4036 ; EG-NEXT: BFE_INT T0.Z, T7.X, 0.0, literal.x,
4037 ; EG-NEXT: BFE_INT * T1.W, T0.X, 0.0, literal.x, BS:VEC_120/SCL_212
4038 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4039 ; EG-NEXT: MIN_INT T1.W, PV.W, PV.Z,
4040 ; EG-NEXT: MIN_INT * T2.W, PV.Y, PV.X,
4041 ; EG-NEXT: LSHL T2.W, PS, literal.x,
4042 ; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
4043 ; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
4044 ; EG-NEXT: OR_INT T0.X, PS, PV.W,
4045 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
4046 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4047 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4049 ; CI-LABEL: v_test_imin_sle_v2i16:
4051 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4052 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
4053 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
4054 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4055 ; CI-NEXT: v_mov_b32_e32 v1, s3
4056 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
4057 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4058 ; CI-NEXT: v_mov_b32_e32 v3, s5
4059 ; CI-NEXT: flat_load_dword v4, v[0:1]
4060 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
4061 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
4062 ; CI-NEXT: flat_load_dword v3, v[0:1]
4063 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
4064 ; CI-NEXT: v_mov_b32_e32 v1, s1
4065 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4066 ; CI-NEXT: s_waitcnt vmcnt(1)
4067 ; CI-NEXT: v_bfe_i32 v2, v4, 0, 16
4068 ; CI-NEXT: v_ashrrev_i32_e32 v4, 16, v4
4069 ; CI-NEXT: s_waitcnt vmcnt(0)
4070 ; CI-NEXT: v_bfe_i32 v5, v3, 0, 16
4071 ; CI-NEXT: v_ashrrev_i32_e32 v3, 16, v3
4072 ; CI-NEXT: v_min_i32_e32 v3, v4, v3
4073 ; CI-NEXT: v_min_i32_e32 v2, v2, v5
4074 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
4075 ; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
4076 ; CI-NEXT: v_or_b32_e32 v2, v2, v3
4077 ; CI-NEXT: flat_store_dword v[0:1], v2
4080 ; VI-LABEL: v_test_imin_sle_v2i16:
4082 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4083 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
4084 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
4085 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4086 ; VI-NEXT: v_mov_b32_e32 v1, s3
4087 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
4088 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4089 ; VI-NEXT: v_mov_b32_e32 v3, s5
4090 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
4091 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
4092 ; VI-NEXT: flat_load_dword v5, v[0:1]
4093 ; VI-NEXT: flat_load_dword v2, v[2:3]
4094 ; VI-NEXT: v_mov_b32_e32 v1, s1
4095 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
4096 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4097 ; VI-NEXT: s_waitcnt vmcnt(0)
4098 ; VI-NEXT: v_min_i16_e32 v3, v5, v2
4099 ; VI-NEXT: v_min_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4100 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
4101 ; VI-NEXT: flat_store_dword v[0:1], v2
4104 ; GFX9-LABEL: v_test_imin_sle_v2i16:
4106 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4107 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
4108 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4109 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4110 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
4111 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
4112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4113 ; GFX9-NEXT: v_pk_min_i16 v1, v1, v2
4114 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4115 ; GFX9-NEXT: s_endpgm
4117 ; GFX10-LABEL: v_test_imin_sle_v2i16:
4119 ; GFX10-NEXT: s_clause 0x1
4120 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4121 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
4122 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4123 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4124 ; GFX10-NEXT: s_clause 0x1
4125 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
4126 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
4127 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4128 ; GFX10-NEXT: v_pk_min_i16 v1, v1, v2
4129 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
4130 ; GFX10-NEXT: s_endpgm
4132 ; GFX11-LABEL: v_test_imin_sle_v2i16:
4134 ; GFX11-NEXT: s_clause 0x1
4135 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
4136 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
4137 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4138 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4139 ; GFX11-NEXT: s_clause 0x1
4140 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
4141 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
4142 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4143 ; GFX11-NEXT: v_pk_min_i16 v1, v1, v2
4144 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
4145 ; GFX11-NEXT: s_nop 0
4146 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4147 ; GFX11-NEXT: s_endpgm
4148 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4149 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4150 %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4151 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4152 %a = load <2 x i16>, ptr addrspace(1) %a.gep
4153 %b = load <2 x i16>, ptr addrspace(1) %b.gep
4154 %cmp = icmp sle <2 x i16> %a, %b
4155 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4156 store <2 x i16> %val, ptr addrspace(1) %out.gep
4162 define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
4163 ; EG-LABEL: v_test_imin_ule_v2i16:
4165 ; EG-NEXT: ALU 2, @12, KC0[CB0:0-32], KC1[]
4167 ; EG-NEXT: ALU 0, @15, KC0[CB0:0-32], KC1[]
4168 ; EG-NEXT: TEX 0 @10
4169 ; EG-NEXT: ALU 13, @16, KC0[CB0:0-32], KC1[]
4170 ; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
4173 ; EG-NEXT: Fetch clause starting at 8:
4174 ; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
4175 ; EG-NEXT: Fetch clause starting at 10:
4176 ; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
4177 ; EG-NEXT: ALU clause starting at 12:
4178 ; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
4179 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4180 ; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W,
4181 ; EG-NEXT: ALU clause starting at 15:
4182 ; EG-NEXT: ADD_INT * T7.X, KC0[2].Z, T0.W,
4183 ; EG-NEXT: ALU clause starting at 16:
4184 ; EG-NEXT: LSHR T1.W, T0.X, literal.x,
4185 ; EG-NEXT: LSHR * T2.W, T7.X, literal.x,
4186 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4187 ; EG-NEXT: AND_INT T0.Z, T0.X, literal.x,
4188 ; EG-NEXT: AND_INT T3.W, T7.X, literal.x, BS:VEC_120/SCL_212
4189 ; EG-NEXT: MIN_UINT * T1.W, PS, PV.W,
4190 ; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
4191 ; EG-NEXT: LSHL T1.W, PS, literal.x,
4192 ; EG-NEXT: MIN_UINT * T2.W, PV.W, PV.Z,
4193 ; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
4194 ; EG-NEXT: OR_INT T0.X, PS, PV.W,
4195 ; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W,
4196 ; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
4197 ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
4199 ; CI-LABEL: v_test_imin_ule_v2i16:
4201 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4202 ; CI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
4203 ; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
4204 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4205 ; CI-NEXT: v_mov_b32_e32 v1, s3
4206 ; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
4207 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4208 ; CI-NEXT: v_mov_b32_e32 v3, s5
4209 ; CI-NEXT: flat_load_dword v4, v[0:1]
4210 ; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
4211 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
4212 ; CI-NEXT: flat_load_dword v3, v[0:1]
4213 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
4214 ; CI-NEXT: v_mov_b32_e32 v1, s1
4215 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4216 ; CI-NEXT: s_waitcnt vmcnt(1)
4217 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
4218 ; CI-NEXT: v_and_b32_e32 v4, 0xffff, v4
4219 ; CI-NEXT: s_waitcnt vmcnt(0)
4220 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
4221 ; CI-NEXT: v_and_b32_e32 v3, 0xffff, v3
4222 ; CI-NEXT: v_min_u32_e32 v2, v2, v5
4223 ; CI-NEXT: v_min_u32_e32 v3, v4, v3
4224 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
4225 ; CI-NEXT: v_or_b32_e32 v2, v3, v2
4226 ; CI-NEXT: flat_store_dword v[0:1], v2
4229 ; VI-LABEL: v_test_imin_ule_v2i16:
4231 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4232 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
4233 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
4234 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4235 ; VI-NEXT: v_mov_b32_e32 v1, s3
4236 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
4237 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4238 ; VI-NEXT: v_mov_b32_e32 v3, s5
4239 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
4240 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
4241 ; VI-NEXT: flat_load_dword v5, v[0:1]
4242 ; VI-NEXT: flat_load_dword v2, v[2:3]
4243 ; VI-NEXT: v_mov_b32_e32 v1, s1
4244 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
4245 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
4246 ; VI-NEXT: s_waitcnt vmcnt(0)
4247 ; VI-NEXT: v_min_u16_e32 v3, v5, v2
4248 ; VI-NEXT: v_min_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4249 ; VI-NEXT: v_or_b32_e32 v2, v3, v2
4250 ; VI-NEXT: flat_store_dword v[0:1], v2
4253 ; GFX9-LABEL: v_test_imin_ule_v2i16:
4255 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4256 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
4257 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4258 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4259 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
4260 ; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
4261 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4262 ; GFX9-NEXT: v_pk_min_u16 v1, v1, v2
4263 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
4264 ; GFX9-NEXT: s_endpgm
4266 ; GFX10-LABEL: v_test_imin_ule_v2i16:
4268 ; GFX10-NEXT: s_clause 0x1
4269 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
4270 ; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
4271 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4272 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
4273 ; GFX10-NEXT: s_clause 0x1
4274 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
4275 ; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
4276 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4277 ; GFX10-NEXT: v_pk_min_u16 v1, v1, v2
4278 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
4279 ; GFX10-NEXT: s_endpgm
4281 ; GFX11-LABEL: v_test_imin_ule_v2i16:
4283 ; GFX11-NEXT: s_clause 0x1
4284 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
4285 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x10
4286 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
4287 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
4288 ; GFX11-NEXT: s_clause 0x1
4289 ; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
4290 ; GFX11-NEXT: global_load_b32 v2, v0, s[0:1]
4291 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4292 ; GFX11-NEXT: v_pk_min_u16 v1, v1, v2
4293 ; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
4294 ; GFX11-NEXT: s_nop 0
4295 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
4296 ; GFX11-NEXT: s_endpgm
4297 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4298 %a.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %a.ptr, i32 %tid
4299 %b.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %b.ptr, i32 %tid
4300 %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
4301 %a = load <2 x i16>, ptr addrspace(1) %a.gep
4302 %b = load <2 x i16>, ptr addrspace(1) %b.gep
4303 %cmp = icmp ule <2 x i16> %a, %b
4304 %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
4305 store <2 x i16> %val, ptr addrspace(1) %out.gep
4309 declare i32 @llvm.amdgcn.workitem.id.x() #1
4311 attributes #0 = { nounwind }
4312 attributes #1 = { nounwind readnone }